compiler.odin 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538
  1. package regex_compiler
  2. import "core:text/regex/common"
  3. import "core:text/regex/parser"
  4. import "core:text/regex/tokenizer"
  5. import "core:text/regex/virtual_machine"
  6. import "core:unicode"
  7. Token :: tokenizer.Token
  8. Token_Kind :: tokenizer.Token_Kind
  9. Tokenizer :: tokenizer.Tokenizer
  10. Rune_Class_Range :: parser.Rune_Class_Range
  11. Rune_Class_Data :: parser.Rune_Class_Data
  12. Node :: parser.Node
  13. Node_Rune :: parser.Node_Rune
  14. Node_Rune_Class :: parser.Node_Rune_Class
  15. Node_Wildcard :: parser.Node_Wildcard
  16. Node_Concatenation :: parser.Node_Concatenation
  17. Node_Alternation :: parser.Node_Alternation
  18. Node_Repeat_Zero :: parser.Node_Repeat_Zero
  19. Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
  20. Node_Repeat_One :: parser.Node_Repeat_One
  21. Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
  22. Node_Repeat_N :: parser.Node_Repeat_N
  23. Node_Optional :: parser.Node_Optional
  24. Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
  25. Node_Group :: parser.Node_Group
  26. Node_Anchor :: parser.Node_Anchor
  27. Node_Word_Boundary :: parser.Node_Word_Boundary
  28. Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape
  29. Opcode :: virtual_machine.Opcode
  30. Program :: [dynamic]Opcode
  31. JUMP_SIZE :: size_of(Opcode) + 1 * size_of(u16)
  32. SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)
  33. Compiler :: struct {
  34. flags: common.Flags,
  35. anchor_start_seen: bool,
  36. class_data: [dynamic]Rune_Class_Data,
  37. }
  38. Error :: enum {
  39. None,
  40. Program_Too_Big,
  41. Too_Many_Classes,
  42. }
  43. classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
  44. assert(q != nil)
  45. assert(w != nil)
  46. if q == w {
  47. return true
  48. }
  49. if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
  50. return false
  51. }
  52. for r, i in q.runes {
  53. if r != w.runes[i] {
  54. return false
  55. }
  56. }
  57. for r, i in q.ranges {
  58. if r.lower != w.ranges[i].lower || r.upper != w.ranges[i].upper {
  59. return false
  60. }
  61. }
  62. return true
  63. }
  64. map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
  65. if tree == nil {
  66. return
  67. }
  68. switch specific in tree {
  69. case ^Node_Rune: break
  70. case ^Node_Wildcard: break
  71. case ^Node_Anchor: break
  72. case ^Node_Word_Boundary: break
  73. case ^Node_Match_All_And_Escape: break
  74. case ^Node_Concatenation:
  75. for subnode in specific.nodes {
  76. map_all_classes(subnode, collection)
  77. }
  78. case ^Node_Repeat_Zero:
  79. map_all_classes(specific.inner, collection)
  80. case ^Node_Repeat_Zero_Non_Greedy:
  81. map_all_classes(specific.inner, collection)
  82. case ^Node_Repeat_One:
  83. map_all_classes(specific.inner, collection)
  84. case ^Node_Repeat_One_Non_Greedy:
  85. map_all_classes(specific.inner, collection)
  86. case ^Node_Repeat_N:
  87. map_all_classes(specific.inner, collection)
  88. case ^Node_Optional:
  89. map_all_classes(specific.inner, collection)
  90. case ^Node_Optional_Non_Greedy:
  91. map_all_classes(specific.inner, collection)
  92. case ^Node_Group:
  93. map_all_classes(specific.inner, collection)
  94. case ^Node_Alternation:
  95. map_all_classes(specific.left, collection)
  96. map_all_classes(specific.right, collection)
  97. case ^Node_Rune_Class:
  98. unseen := true
  99. for &value in collection {
  100. if classes_are_exact(&specific.data, &value) {
  101. unseen = false
  102. break
  103. }
  104. }
  105. if unseen {
  106. append(collection, specific.data)
  107. }
  108. }
  109. }
  110. append_raw :: #force_inline proc(code: ^Program, data: $T) {
  111. // NOTE: This is system-dependent endian.
  112. for b in transmute([size_of(T)]byte)data {
  113. append(code, cast(Opcode)b)
  114. }
  115. }
  116. inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
  117. // NOTE: This is system-dependent endian.
  118. for b, i in transmute([size_of(T)]byte)data {
  119. inject_at(code, start + i, cast(Opcode)b)
  120. }
  121. }
  122. @require_results
  123. generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
  124. if node == nil {
  125. return
  126. }
  127. // NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
  128. // this later when relative jumps are turned into absolute jumps.
  129. switch specific in node {
  130. // Atomic Nodes:
  131. case ^Node_Rune:
  132. if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
  133. append(&code, Opcode.Byte)
  134. append(&code, cast(Opcode)specific.data)
  135. } else {
  136. append(&code, Opcode.Rune)
  137. append_raw(&code, specific.data)
  138. }
  139. case ^Node_Rune_Class:
  140. if specific.negating {
  141. append(&code, Opcode.Rune_Class_Negated)
  142. } else {
  143. append(&code, Opcode.Rune_Class)
  144. }
  145. index := -1
  146. for &data, i in c.class_data {
  147. if classes_are_exact(&data, &specific.data) {
  148. index = i
  149. break
  150. }
  151. }
  152. assert(index != -1, "Unable to find collected Rune_Class_Data index.")
  153. append(&code, Opcode(index))
  154. case ^Node_Wildcard:
  155. append(&code, Opcode.Wildcard)
  156. case ^Node_Anchor:
  157. if .Multiline in c.flags {
  158. append(&code, Opcode.Multiline_Open)
  159. append(&code, Opcode.Multiline_Close)
  160. } else {
  161. if specific.start {
  162. c.anchor_start_seen = true
  163. append(&code, Opcode.Assert_Start)
  164. } else {
  165. append(&code, Opcode.Assert_End)
  166. }
  167. }
  168. case ^Node_Word_Boundary:
  169. if specific.non_word {
  170. append(&code, Opcode.Assert_Non_Word_Boundary)
  171. } else {
  172. append(&code, Opcode.Assert_Word_Boundary)
  173. }
  174. // Compound Nodes:
  175. case ^Node_Group:
  176. code = generate_code(c, specific.inner)
  177. if specific.capture && .No_Capture not_in c.flags {
  178. inject_at(&code, 0, Opcode.Save)
  179. inject_at(&code, 1, Opcode(2 * specific.capture_id))
  180. append(&code, Opcode.Save)
  181. append(&code, Opcode(2 * specific.capture_id + 1))
  182. }
  183. case ^Node_Alternation:
  184. left := generate_code(c, specific.left)
  185. right := generate_code(c, specific.right)
  186. left_len := len(left)
  187. // Avoiding duplicate allocation by reusing `left`.
  188. code = left
  189. inject_at(&code, 0, Opcode.Split)
  190. inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
  191. inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))
  192. append(&code, Opcode.Jump)
  193. append_raw(&code, i16(len(right) + JUMP_SIZE))
  194. for opcode in right {
  195. append(&code, opcode)
  196. }
  197. case ^Node_Concatenation:
  198. for subnode in specific.nodes {
  199. subnode_code := generate_code(c, subnode)
  200. for opcode in subnode_code {
  201. append(&code, opcode)
  202. }
  203. }
  204. case ^Node_Repeat_Zero:
  205. code = generate_code(c, specific.inner)
  206. original_len := len(code)
  207. inject_at(&code, 0, Opcode.Split)
  208. inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
  209. inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
  210. append(&code, Opcode.Jump)
  211. append_raw(&code, i16(-original_len - SPLIT_SIZE))
  212. case ^Node_Repeat_Zero_Non_Greedy:
  213. code = generate_code(c, specific.inner)
  214. original_len := len(code)
  215. inject_at(&code, 0, Opcode.Split)
  216. inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len + JUMP_SIZE))
  217. inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
  218. append(&code, Opcode.Jump)
  219. append_raw(&code, i16(-original_len - SPLIT_SIZE))
  220. case ^Node_Repeat_One:
  221. code = generate_code(c, specific.inner)
  222. original_len := len(code)
  223. append(&code, Opcode.Split)
  224. append_raw(&code, i16(-original_len))
  225. append_raw(&code, i16(SPLIT_SIZE))
  226. case ^Node_Repeat_One_Non_Greedy:
  227. code = generate_code(c, specific.inner)
  228. original_len := len(code)
  229. append(&code, Opcode.Split)
  230. append_raw(&code, i16(SPLIT_SIZE))
  231. append_raw(&code, i16(-original_len))
  232. case ^Node_Repeat_N:
  233. inside := generate_code(c, specific.inner)
  234. original_len := len(inside)
  235. if specific.lower == specific.upper { // {N}
  236. // e{N} ... evaluates to ... e^N
  237. for i := 0; i < specific.upper; i += 1 {
  238. for opcode in inside {
  239. append(&code, opcode)
  240. }
  241. }
  242. } else if specific.lower == -1 && specific.upper > 0 { // {,M}
  243. // e{,M} ... evaluates to ... e?^M
  244. for i := 0; i < specific.upper; i += 1 {
  245. append(&code, Opcode.Split)
  246. append_raw(&code, i16(SPLIT_SIZE))
  247. append_raw(&code, i16(SPLIT_SIZE + original_len))
  248. for opcode in inside {
  249. append(&code, opcode)
  250. }
  251. }
  252. } else if specific.lower >= 0 && specific.upper == -1 { // {N,}
  253. // e{N,} ... evaluates to ... e^N e*
  254. for i := 0; i < specific.lower; i += 1 {
  255. for opcode in inside {
  256. append(&code, opcode)
  257. }
  258. }
  259. append(&code, Opcode.Split)
  260. append_raw(&code, i16(SPLIT_SIZE))
  261. append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))
  262. for opcode in inside {
  263. append(&code, opcode)
  264. }
  265. append(&code, Opcode.Jump)
  266. append_raw(&code, i16(-original_len - SPLIT_SIZE))
  267. } else if specific.lower >= 0 && specific.upper > 0 {
  268. // e{N,M} evaluates to ... e^N e?^(M-N)
  269. for i := 0; i < specific.lower; i += 1 {
  270. for opcode in inside {
  271. append(&code, opcode)
  272. }
  273. }
  274. for i := 0; i < specific.upper - specific.lower; i += 1 {
  275. append(&code, Opcode.Split)
  276. append_raw(&code, i16(SPLIT_SIZE + original_len))
  277. append_raw(&code, i16(SPLIT_SIZE))
  278. for opcode in inside {
  279. append(&code, opcode)
  280. }
  281. }
  282. } else {
  283. panic("RegEx compiler received invalid repetition group.")
  284. }
  285. case ^Node_Optional:
  286. code = generate_code(c, specific.inner)
  287. original_len := len(code)
  288. inject_at(&code, 0, Opcode.Split)
  289. inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
  290. inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))
  291. case ^Node_Optional_Non_Greedy:
  292. code = generate_code(c, specific.inner)
  293. original_len := len(code)
  294. inject_at(&code, 0, Opcode.Split)
  295. inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len))
  296. inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
  297. case ^Node_Match_All_And_Escape:
  298. append(&code, Opcode.Match_All_And_Escape)
  299. }
  300. return
  301. }
  302. @require_results
  303. compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
  304. if tree == nil {
  305. if .No_Capture not_in flags {
  306. append(&code, Opcode.Save); append(&code, Opcode(0x00))
  307. append(&code, Opcode.Save); append(&code, Opcode(0x01))
  308. append(&code, Opcode.Match)
  309. } else {
  310. append(&code, Opcode.Match_And_Exit)
  311. }
  312. return
  313. }
  314. c: Compiler
  315. c.flags = flags
  316. map_all_classes(tree, &class_data)
  317. if len(class_data) >= common.MAX_CLASSES {
  318. err = .Too_Many_Classes
  319. return
  320. }
  321. c.class_data = class_data
  322. code = generate_code(&c, tree)
  323. pc_open := 0
  324. add_global: if .Global in flags {
  325. // Check if the opening to the pattern is predictable.
  326. // If so, use one of the optimized Wait opcodes.
  327. iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
  328. seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
  329. #partial switch opcode {
  330. case .Byte:
  331. inject_at(&code, pc_open, Opcode.Wait_For_Byte)
  332. pc_open += size_of(Opcode)
  333. inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
  334. pc_open += size_of(u8)
  335. break add_global
  336. case .Rune:
  337. operand := (cast(^rune)&code[pc+1])^
  338. inject_at(&code, pc_open, Opcode.Wait_For_Rune)
  339. pc_open += size_of(Opcode)
  340. inject_raw(&code, pc_open, operand)
  341. pc_open += size_of(rune)
  342. break add_global
  343. case .Rune_Class:
  344. inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
  345. pc_open += size_of(Opcode)
  346. inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
  347. pc_open += size_of(u8)
  348. break add_global
  349. case .Rune_Class_Negated:
  350. inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
  351. pc_open += size_of(Opcode)
  352. inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
  353. pc_open += size_of(u8)
  354. break add_global
  355. case .Save:
  356. continue
  357. case:
  358. break seek_loop
  359. }
  360. }
  361. // `.*?`
  362. inject_at(&code, pc_open, Opcode.Split)
  363. pc_open += size_of(byte)
  364. inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
  365. pc_open += size_of(i16)
  366. inject_raw(&code, pc_open, i16(SPLIT_SIZE))
  367. pc_open += size_of(i16)
  368. inject_at(&code, pc_open, Opcode.Wildcard)
  369. pc_open += size_of(byte)
  370. inject_at(&code, pc_open, Opcode.Jump)
  371. pc_open += size_of(byte)
  372. inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
  373. pc_open += size_of(i16)
  374. }
  375. if .No_Capture not_in flags {
  376. // `(` <generated code>
  377. inject_at(&code, pc_open, Opcode.Save)
  378. inject_at(&code, pc_open + size_of(byte), Opcode(0x00))
  379. // `)`
  380. append(&code, Opcode.Save); append(&code, Opcode(0x01))
  381. append(&code, Opcode.Match)
  382. } else {
  383. append(&code, Opcode.Match_And_Exit)
  384. }
  385. if len(code) >= common.MAX_PROGRAM_SIZE {
  386. err = .Program_Too_Big
  387. return
  388. }
  389. // NOTE: No further opcode addition beyond this point, as we've already
  390. // checked the program size. Removal or transformation is fine.
  391. // Post-Compile Optimizations:
  392. // * Jump Extension
  393. //
  394. // A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2)
  395. if .No_Optimization not_in flags {
  396. for passes_left := 1; passes_left > 0; passes_left -= 1 {
  397. do_another_pass := false
  398. iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
  399. for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
  400. #partial switch opcode {
  401. case .Jump:
  402. jmp := cast(^i16)&code[pc+size_of(Opcode)]
  403. if code[cast(i16)pc+jmp^] == .Jump {
  404. next_jmp := (cast(^i16)&code[cast(i16)pc+jmp^+size_of(Opcode)])^
  405. jmp^ = jmp^ + next_jmp
  406. do_another_pass = true
  407. }
  408. case .Split:
  409. jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
  410. if code[cast(i16)pc+jmp_x^] == .Jump {
  411. next_jmp := (cast(^i16)&code[cast(i16)pc+jmp_x^+size_of(Opcode)])^
  412. jmp_x^ = jmp_x^ + next_jmp
  413. do_another_pass = true
  414. }
  415. jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
  416. if code[cast(i16)pc+jmp_y^] == .Jump {
  417. next_jmp := (cast(^i16)&code[cast(i16)pc+jmp_y^+size_of(Opcode)])^
  418. jmp_y^ = jmp_y^ + next_jmp
  419. do_another_pass = true
  420. }
  421. }
  422. }
  423. if do_another_pass {
  424. passes_left += 1
  425. }
  426. }
  427. }
  428. // * Relative Jump to Absolute Jump
  429. //
  430. // RelJmp{PC +/- N} => AbsJmp{M}
  431. iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
  432. for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
  433. // NOTE: The virtual machine implementation depends on this.
  434. #partial switch opcode {
  435. case .Jump:
  436. jmp := cast(^u16)&code[pc+size_of(Opcode)]
  437. jmp^ = jmp^ + cast(u16)pc
  438. case .Split:
  439. jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
  440. jmp_x^ = jmp_x^ + cast(u16)pc
  441. jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
  442. jmp_y^ = jmp_y^ + cast(u16)pc
  443. }
  444. }
  445. return
  446. }