compiler.odin

package regex_compiler

/*
	(c) Copyright 2024 Feoramund <[email protected]>.
	Made available under Odin's BSD-3 license.

	List of contributors:
		Feoramund: Initial implementation.
*/

import "base:intrinsics"
import "core:text/regex/common"
import "core:text/regex/parser"
import "core:text/regex/tokenizer"
import "core:text/regex/virtual_machine"
import "core:unicode"

Token      :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer  :: tokenizer.Tokenizer

Rune_Class_Range :: parser.Rune_Class_Range
Rune_Class_Data  :: parser.Rune_Class_Data

Node                        :: parser.Node
Node_Rune                   :: parser.Node_Rune
Node_Rune_Class             :: parser.Node_Rune_Class
Node_Wildcard               :: parser.Node_Wildcard
Node_Concatenation          :: parser.Node_Concatenation
Node_Alternation            :: parser.Node_Alternation
Node_Repeat_Zero            :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One             :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy  :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N               :: parser.Node_Repeat_N
Node_Optional               :: parser.Node_Optional
Node_Optional_Non_Greedy    :: parser.Node_Optional_Non_Greedy
Node_Group                  :: parser.Node_Group
Node_Anchor                 :: parser.Node_Anchor
Node_Word_Boundary          :: parser.Node_Word_Boundary
Node_Match_All_And_Escape   :: parser.Node_Match_All_And_Escape

Opcode  :: virtual_machine.Opcode
Program :: [dynamic]Opcode

JUMP_SIZE  :: size_of(Opcode) + 1 * size_of(u16)
SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)
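// With a byte-sized `Opcode`, a `Jump` occupies 3 bytes of the program
// (opcode plus one u16 operand) and a `Split` occupies 5 (opcode plus two
// u16 operands). Jump operands are first written as relative i16 offsets
// and are rewritten into absolute u16 addresses at the end of `compile`.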

Compiler :: struct {
	flags:      common.Flags,
	class_data: [dynamic]Rune_Class_Data,
}
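
// Errors reported by `compile`: `Program_Too_Big` means the generated
// program reached `common.MAX_PROGRAM_SIZE`, and `Too_Many_Classes` means
// the pattern required `common.MAX_CLASSES` or more distinct character
// classes.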
Error :: enum {
	None,
	Program_Too_Big,
	Too_Many_Classes,
}

classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
	assert(q != nil)
	assert(w != nil)

	if q == w {
		return true
	}
	if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
		return false
	}

	for r, i in q.runes {
		if r != w.runes[i] {
			return false
		}
	}
	for r, i in q.ranges {
		if r.lower != w.ranges[i].lower || r.upper != w.ranges[i].upper {
			return false
		}
	}

	return true
}
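
// Walk the parse tree and gather every distinct character class in the
// pattern into `collection`, using `classes_are_exact` so that identical
// classes are stored only once. `generate_code` later refers to each class
// by its index in this collection.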
map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
	if tree == nil {
		return
	}

	switch specific in tree {
	case ^Node_Rune:                 break
	case ^Node_Wildcard:             break
	case ^Node_Anchor:               break
	case ^Node_Word_Boundary:        break
	case ^Node_Match_All_And_Escape: break

	case ^Node_Concatenation:
		for subnode in specific.nodes {
			map_all_classes(subnode, collection)
		}
	case ^Node_Repeat_Zero:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_Zero_Non_Greedy:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_One:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_One_Non_Greedy:
		map_all_classes(specific.inner, collection)
	case ^Node_Repeat_N:
		map_all_classes(specific.inner, collection)
	case ^Node_Optional:
		map_all_classes(specific.inner, collection)
	case ^Node_Optional_Non_Greedy:
		map_all_classes(specific.inner, collection)
	case ^Node_Group:
		map_all_classes(specific.inner, collection)
	case ^Node_Alternation:
		map_all_classes(specific.left, collection)
		map_all_classes(specific.right, collection)

	case ^Node_Rune_Class:
		unseen := true
		for &value in collection {
			if classes_are_exact(&specific.data, &value) {
				unseen = false
				break
			}
		}
		if unseen {
			append(collection, specific.data)
		}
	}
}

append_raw :: #force_inline proc(code: ^Program, data: $T) {
	// NOTE: The bytes are written in the system's native endian order.
	for b in transmute([size_of(T)]byte)data {
		append(code, cast(Opcode)b)
	}
}

inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
	// NOTE: The bytes are written in the system's native endian order.
	for b, i in transmute([size_of(T)]byte)data {
		inject_at(code, start + i, cast(Opcode)b)
	}
}
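
// Recursively generate the opcodes for `node`. Subtree fragments compose
// by concatenation and by injection of `Split`/`Jump` prefixes, so every
// jump operand is a relative i16 offset at this stage; `compile` resolves
// them into absolute addresses afterwards.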
@require_results
generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
	if node == nil {
		return
	}

	// NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
	// this later when relative jumps are turned into absolute jumps.

	switch specific in node {
	// Atomic Nodes:
	case ^Node_Rune:
		if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
			append(&code, Opcode.Byte)
			append(&code, cast(Opcode)specific.data)
		} else {
			append(&code, Opcode.Rune)
			append_raw(&code, specific.data)
		}

	case ^Node_Rune_Class:
		if specific.negating {
			append(&code, Opcode.Rune_Class_Negated)
		} else {
			append(&code, Opcode.Rune_Class)
		}
		index := -1
		for &data, i in c.class_data {
			if classes_are_exact(&data, &specific.data) {
				index = i
				break
			}
		}
		assert(index != -1, "Unable to find collected Rune_Class_Data index.")
		append(&code, Opcode(index))

	case ^Node_Wildcard:
		append(&code, Opcode.Wildcard)

	case ^Node_Anchor:
		if .Multiline in c.flags {
			if specific.start {
				append(&code, Opcode.Multiline_Open)
			} else {
				append(&code, Opcode.Multiline_Close)
			}
		} else {
			if specific.start {
				append(&code, Opcode.Assert_Start)
			} else {
				append(&code, Opcode.Assert_End)
			}
		}

	case ^Node_Word_Boundary:
		if specific.non_word {
			append(&code, Opcode.Assert_Non_Word_Boundary)
		} else {
			append(&code, Opcode.Assert_Word_Boundary)
		}

	// Compound Nodes:
	case ^Node_Group:
		code = generate_code(c, specific.inner)
		if specific.capture && .No_Capture not_in c.flags {
			inject_at(&code, 0, Opcode.Save)
			inject_at(&code, 1, Opcode(2 * specific.capture_id))
			append(&code, Opcode.Save)
			append(&code, Opcode(2 * specific.capture_id + 1))
		}

	case ^Node_Alternation:
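		// Emitted layout, with each offset relative to its own opcode:
		//
		//   Split +SPLIT_SIZE, +(SPLIT_SIZE + len(left) + JUMP_SIZE)
		//   <left>
		//   Jump  +(JUMP_SIZE + len(right))
		//   <right>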
		left := generate_code(c, specific.left)
		right := generate_code(c, specific.right)
		left_len := len(left)

		// Avoiding duplicate allocation by reusing `left`.
		code = left
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte),                i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(len(right) + JUMP_SIZE))
		for opcode in right {
			append(&code, opcode)
		}

	case ^Node_Concatenation:
		for subnode in specific.nodes {
			subnode_code := generate_code(c, subnode)
			for opcode in subnode_code {
				append(&code, opcode)
			}
		}

	case ^Node_Repeat_Zero:
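		// `e*`: the Split prefers entering the body, and the Jump loops
		// back to the Split.
		//
		//   Split +SPLIT_SIZE, +(SPLIT_SIZE + len(e) + JUMP_SIZE)
		//   <e>
		//   Jump  -(len(e) + SPLIT_SIZE)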
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte),                i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))

	case ^Node_Repeat_Zero_Non_Greedy:
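		// `e*?`: identical to `e*`, except the Split operands are swapped
		// so the thread that skips the body is preferred.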
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte),                i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))

	case ^Node_Repeat_One:
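		// `e+`: run the body once, then the Split prefers looping back.
		//
		//   <e>
		//   Split -len(e), +SPLIT_SIZE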
		code = generate_code(c, specific.inner)
		original_len := len(code)

		append(&code, Opcode.Split)
		append_raw(&code, i16(-original_len))
		append_raw(&code, i16(SPLIT_SIZE))

	case ^Node_Repeat_One_Non_Greedy:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		append(&code, Opcode.Split)
		append_raw(&code, i16(SPLIT_SIZE))
		append_raw(&code, i16(-original_len))

	case ^Node_Repeat_N:
		inside := generate_code(c, specific.inner)
		original_len := len(inside)

		if specific.lower == specific.upper { // {N}
			// e{N} ... evaluates to ... e^N
			for i := 0; i < specific.upper; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else if specific.lower == -1 && specific.upper > 0 { // {,M}
			// e{,M} ... evaluates to ... e?^M
			for i := 0; i < specific.upper; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE))
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else if specific.lower >= 0 && specific.upper == -1 { // {N,}
			// e{N,} ... evaluates to ... e^N e*
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			append(&code, Opcode.Split)
			append_raw(&code, i16(SPLIT_SIZE))
			append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))
			for opcode in inside {
				append(&code, opcode)
			}
			append(&code, Opcode.Jump)
			append_raw(&code, i16(-original_len - SPLIT_SIZE))
		} else if specific.lower >= 0 && specific.upper > 0 { // {N,M}
			// e{N,M} ... evaluates to ... e^N e?^(M-N)
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			for i := 0; i < specific.upper - specific.lower; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				append_raw(&code, i16(SPLIT_SIZE))
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else {
			panic("RegEx compiler received invalid repetition group.")
		}

	case ^Node_Optional:
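		// `e?`: the Split prefers entering the body over skipping it.
		//
		//   Split +SPLIT_SIZE, +(SPLIT_SIZE + len(e))
		//   <e>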
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte),                i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))

	case ^Node_Optional_Non_Greedy:
		code = generate_code(c, specific.inner)
		original_len := len(code)

		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte),                i16(SPLIT_SIZE + original_len))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))

	case ^Node_Match_All_And_Escape:
		append(&code, Opcode.Match_All_And_Escape)
	}

	return
}
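
// Compile a parse tree into a program for the regex virtual machine.
//
// `class_data` holds the character classes that the `Rune_Class` opcodes in
// `code` reference by index; the two results are meant to be consumed
// together by the virtual machine.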
@require_results
compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
	if tree == nil {
		if .No_Capture not_in flags {
			append(&code, Opcode.Save); append(&code, Opcode(0x00))
			append(&code, Opcode.Save); append(&code, Opcode(0x01))
			append(&code, Opcode.Match)
		} else {
			append(&code, Opcode.Match_And_Exit)
		}
		return
	}

	c: Compiler
	c.flags = flags

	map_all_classes(tree, &class_data)
	if len(class_data) >= common.MAX_CLASSES {
		err = .Too_Many_Classes
		return
	}
	c.class_data = class_data

	code = generate_code(&c, tree)

	pc_open := 0

	add_global: if .Global in flags {
		// Check if the opening to the pattern is predictable.
		// If so, use one of the optimized Wait opcodes.
		iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
		seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
			#partial switch opcode {
			case .Byte:
				inject_at(&code, pc_open, Opcode.Wait_For_Byte)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Rune:
				operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
				inject_at(&code, pc_open, Opcode.Wait_For_Rune)
				pc_open += size_of(Opcode)
				inject_raw(&code, pc_open, operand)
				pc_open += size_of(rune)
				break add_global
			case .Rune_Class:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Rune_Class_Negated:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Save:
				continue
			case:
				break seek_loop
			}
		}
		// Otherwise, prefix the program with the equivalent of `.*?` so
		// that a match may begin anywhere in the input.
		inject_at(&code, pc_open, Opcode.Split)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
		pc_open += size_of(i16)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE))
		pc_open += size_of(i16)
		inject_at(&code, pc_open, Opcode.Wildcard)
		pc_open += size_of(byte)
		inject_at(&code, pc_open, Opcode.Jump)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
		pc_open += size_of(i16)
	}

	if .No_Capture not_in flags {
		// `(` <generated code>
		inject_at(&code, pc_open, Opcode.Save)
		inject_at(&code, pc_open + size_of(byte), Opcode(0x00))
		// `)`
		append(&code, Opcode.Save); append(&code, Opcode(0x01))
		append(&code, Opcode.Match)
	} else {
		append(&code, Opcode.Match_And_Exit)
	}

	if len(code) >= common.MAX_PROGRAM_SIZE {
		err = .Program_Too_Big
		return
	}

	// NOTE: No further opcode addition beyond this point, as we've already
	// checked the program size. Removal or transformation is fine.

	// Post-Compile Optimizations:
	//
	// * Jump Extension
	//
	//   A:RelJmp(1) -> B:RelJmp(2)  =>  A:RelJmp(2)
	if .No_Optimization not_in flags {
		for passes_left := 1; passes_left > 0; passes_left -= 1 {
			do_another_pass := false

			iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
			for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
				#partial switch opcode {
				case .Jump:
					jmp := cast(^i16)&code[pc+size_of(Opcode)]
					jmp_value := intrinsics.unaligned_load(jmp)
					if code[cast(i16)pc+jmp_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp, jmp_value + next_jmp)
						do_another_pass = true
					}
				case .Split:
					jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
					jmp_x_value := intrinsics.unaligned_load(jmp_x)
					if code[cast(i16)pc+jmp_x_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp_x, jmp_x_value + next_jmp)
						do_another_pass = true
					}

					jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
					jmp_y_value := intrinsics.unaligned_load(jmp_y)
					if code[cast(i16)pc+jmp_y_value] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y_value+size_of(Opcode)])
						intrinsics.unaligned_store(jmp_y, jmp_y_value + next_jmp)
						do_another_pass = true
					}
				}
			}

			if do_another_pass {
				passes_left += 1
			}
		}
	}

	// * Relative Jump to Absolute Jump
	//
	//   RelJmp{PC +/- N} => AbsJmp{M}
	iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
	for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
		// NOTE: The virtual machine implementation depends on this.
		#partial switch opcode {
		case .Jump:
			jmp := cast(^u16)&code[pc+size_of(Opcode)]
			intrinsics.unaligned_store(jmp, intrinsics.unaligned_load(jmp) + cast(u16)pc)
		case .Split:
			jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
			intrinsics.unaligned_store(jmp_x, intrinsics.unaligned_load(jmp_x) + cast(u16)pc)
			jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
			intrinsics.unaligned_store(jmp_y, intrinsics.unaligned_load(jmp_y) + cast(u16)pc)
		}
	}

	return
}