// compiler.odin
// Bytecode compiler for the core:text/regex virtual machine:
// lowers a parsed regular-expression AST (core:text/regex/parser)
// into a flat stream of VM opcodes.
package regex_compiler

import "base:intrinsics"
import "core:text/regex/common"
import "core:text/regex/parser"
import "core:text/regex/tokenizer"
import "core:text/regex/virtual_machine"
import "core:unicode"

// Local aliases for the tokenizer, parser, and VM types used throughout.
Token :: tokenizer.Token
Token_Kind :: tokenizer.Token_Kind
Tokenizer :: tokenizer.Tokenizer

Rune_Class_Range :: parser.Rune_Class_Range
Rune_Class_Data :: parser.Rune_Class_Data

Node :: parser.Node
Node_Rune :: parser.Node_Rune
Node_Rune_Class :: parser.Node_Rune_Class
Node_Wildcard :: parser.Node_Wildcard
Node_Concatenation :: parser.Node_Concatenation
Node_Alternation :: parser.Node_Alternation
Node_Repeat_Zero :: parser.Node_Repeat_Zero
Node_Repeat_Zero_Non_Greedy :: parser.Node_Repeat_Zero_Non_Greedy
Node_Repeat_One :: parser.Node_Repeat_One
Node_Repeat_One_Non_Greedy :: parser.Node_Repeat_One_Non_Greedy
Node_Repeat_N :: parser.Node_Repeat_N
Node_Optional :: parser.Node_Optional
Node_Optional_Non_Greedy :: parser.Node_Optional_Non_Greedy
Node_Group :: parser.Node_Group
Node_Anchor :: parser.Node_Anchor
Node_Word_Boundary :: parser.Node_Word_Boundary
Node_Match_All_And_Escape :: parser.Node_Match_All_And_Escape

Opcode :: virtual_machine.Opcode

// A compiled program is a growable byte stream: opcodes with their
// operands stored inline as `Opcode`-typed bytes.
Program :: [dynamic]Opcode

// Encoded instruction sizes, used for relative-jump arithmetic:
// a Jump is one opcode byte plus one u16 target; a Split is one
// opcode byte plus two u16 targets.
JUMP_SIZE :: size_of(Opcode) + 1 * size_of(u16)
SPLIT_SIZE :: size_of(Opcode) + 2 * size_of(u16)
// Per-compilation state threaded through `generate_code`.
Compiler :: struct {
	// Flags passed to `compile`; consulted when emitting rune/anchor opcodes.
	flags: common.Flags,
	// Set when a non-multiline `^` anchor emits Assert_Start.
	// NOTE(review): not read anywhere in this file — presumably consumed by
	// the caller/VM; confirm before removing.
	anchor_start_seen: bool,
	// All distinct rune classes collected from the tree; Rune_Class opcodes
	// reference entries here by index.
	class_data: [dynamic]Rune_Class_Data,
}
// Errors reported by `compile`.
Error :: enum {
	None,
	// Final program reached common.MAX_PROGRAM_SIZE opcodes.
	Program_Too_Big,
	// Distinct rune classes reached common.MAX_CLASSES.
	Too_Many_Classes,
}
  44. classes_are_exact :: proc(q, w: ^Rune_Class_Data) -> bool #no_bounds_check {
  45. assert(q != nil)
  46. assert(w != nil)
  47. if q == w {
  48. return true
  49. }
  50. if len(q.runes) != len(w.runes) || len(q.ranges) != len(w.ranges) {
  51. return false
  52. }
  53. for r, i in q.runes {
  54. if r != w.runes[i] {
  55. return false
  56. }
  57. }
  58. for r, i in q.ranges {
  59. if r.lower != w.ranges[i].lower || r.upper != w.ranges[i].upper {
  60. return false
  61. }
  62. }
  63. return true
  64. }
  65. map_all_classes :: proc(tree: Node, collection: ^[dynamic]Rune_Class_Data) {
  66. if tree == nil {
  67. return
  68. }
  69. switch specific in tree {
  70. case ^Node_Rune: break
  71. case ^Node_Wildcard: break
  72. case ^Node_Anchor: break
  73. case ^Node_Word_Boundary: break
  74. case ^Node_Match_All_And_Escape: break
  75. case ^Node_Concatenation:
  76. for subnode in specific.nodes {
  77. map_all_classes(subnode, collection)
  78. }
  79. case ^Node_Repeat_Zero:
  80. map_all_classes(specific.inner, collection)
  81. case ^Node_Repeat_Zero_Non_Greedy:
  82. map_all_classes(specific.inner, collection)
  83. case ^Node_Repeat_One:
  84. map_all_classes(specific.inner, collection)
  85. case ^Node_Repeat_One_Non_Greedy:
  86. map_all_classes(specific.inner, collection)
  87. case ^Node_Repeat_N:
  88. map_all_classes(specific.inner, collection)
  89. case ^Node_Optional:
  90. map_all_classes(specific.inner, collection)
  91. case ^Node_Optional_Non_Greedy:
  92. map_all_classes(specific.inner, collection)
  93. case ^Node_Group:
  94. map_all_classes(specific.inner, collection)
  95. case ^Node_Alternation:
  96. map_all_classes(specific.left, collection)
  97. map_all_classes(specific.right, collection)
  98. case ^Node_Rune_Class:
  99. unseen := true
  100. for &value in collection {
  101. if classes_are_exact(&specific.data, &value) {
  102. unseen = false
  103. break
  104. }
  105. }
  106. if unseen {
  107. append(collection, specific.data)
  108. }
  109. }
  110. }
  111. append_raw :: #force_inline proc(code: ^Program, data: $T) {
  112. // NOTE: This is system-dependent endian.
  113. for b in transmute([size_of(T)]byte)data {
  114. append(code, cast(Opcode)b)
  115. }
  116. }
  117. inject_raw :: #force_inline proc(code: ^Program, start: int, data: $T) {
  118. // NOTE: This is system-dependent endian.
  119. for b, i in transmute([size_of(T)]byte)data {
  120. inject_at(code, start + i, cast(Opcode)b)
  121. }
  122. }
/*
Recursively lower one AST node into VM bytecode.

Control-flow operands (Jump/Split targets) are emitted as *relative* i16
offsets measured from the start of the owning instruction; `compile`
later rewrites them into absolute program counters. Jump layout math
relies on JUMP_SIZE/SPLIT_SIZE above.
*/
@require_results
generate_code :: proc(c: ^Compiler, node: Node) -> (code: Program) {
	if node == nil {
		return
	}
	// NOTE: For Jump/Split arguments, we write as i16 and will reinterpret
	// this later when relative jumps are turned into absolute jumps.
	switch specific in node {
	// Atomic Nodes:
	case ^Node_Rune:
		// Compact Byte opcode when not in Unicode mode or the rune fits
		// below MAX_LATIN1; otherwise a full Rune opcode with a raw operand.
		if .Unicode not_in c.flags || specific.data < unicode.MAX_LATIN1 {
			append(&code, Opcode.Byte)
			append(&code, cast(Opcode)specific.data)
		} else {
			append(&code, Opcode.Rune)
			append_raw(&code, specific.data)
		}
	case ^Node_Rune_Class:
		if specific.negating {
			append(&code, Opcode.Rune_Class_Negated)
		} else {
			append(&code, Opcode.Rune_Class)
		}
		// The operand is the index of this class in c.class_data, which
		// `map_all_classes` must already have populated.
		index := -1
		for &data, i in c.class_data {
			if classes_are_exact(&data, &specific.data) {
				index = i
				break
			}
		}
		assert(index != -1, "Unable to find collected Rune_Class_Data index.")
		append(&code, Opcode(index))
	case ^Node_Wildcard:
		append(&code, Opcode.Wildcard)
	case ^Node_Anchor:
		if .Multiline in c.flags {
			// Multiline mode emits both line-boundary opcodes.
			// NOTE(review): emitted regardless of `specific.start` — the VM
			// presumably disambiguates; confirm against the VM implementation.
			append(&code, Opcode.Multiline_Open)
			append(&code, Opcode.Multiline_Close)
		} else {
			if specific.start {
				// Record that `^` was compiled; see Compiler.anchor_start_seen.
				c.anchor_start_seen = true
				append(&code, Opcode.Assert_Start)
			} else {
				append(&code, Opcode.Assert_End)
			}
		}
	case ^Node_Word_Boundary:
		if specific.non_word {
			append(&code, Opcode.Assert_Non_Word_Boundary)
		} else {
			append(&code, Opcode.Assert_Word_Boundary)
		}
	// Compound Nodes:
	case ^Node_Group:
		code = generate_code(c, specific.inner)
		// Capturing groups are bracketed with Save opcodes: slot 2*id
		// before the body, slot 2*id+1 after.
		if specific.capture && .No_Capture not_in c.flags {
			inject_at(&code, 0, Opcode.Save)
			inject_at(&code, 1, Opcode(2 * specific.capture_id))
			append(&code, Opcode.Save)
			append(&code, Opcode(2 * specific.capture_id + 1))
		}
	case ^Node_Alternation:
		// a|b => Split(x=left, y=right) left Jump(end) right
		left := generate_code(c, specific.left)
		right := generate_code(c, specific.right)
		left_len := len(left)
		// Avoiding duplicate allocation by reusing `left`.
		code = left
		inject_at(&code, 0, Opcode.Split)
		// x: fall into `left` just past the Split instruction.
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
		// y: skip over `left` and the trailing Jump to reach `right`.
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + left_len + JUMP_SIZE))
		append(&code, Opcode.Jump)
		// After `left`, jump over `right` to the end.
		append_raw(&code, i16(len(right) + JUMP_SIZE))
		for opcode in right {
			append(&code, opcode)
		}
	case ^Node_Concatenation:
		// ab => a b (children emitted back to back).
		for subnode in specific.nodes {
			subnode_code := generate_code(c, subnode)
			for opcode in subnode_code {
				append(&code, opcode)
			}
		}
	case ^Node_Repeat_Zero:
		// e* => Split(x=body, y=exit) body Jump(back to Split)
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))
	case ^Node_Repeat_Zero_Non_Greedy:
		// e*? => same as e*, but the Split prefers the exit (x/y swapped).
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len + JUMP_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
		append(&code, Opcode.Jump)
		append_raw(&code, i16(-original_len - SPLIT_SIZE))
	case ^Node_Repeat_One:
		// e+ => body Split(x=back to body, y=exit)
		code = generate_code(c, specific.inner)
		original_len := len(code)
		append(&code, Opcode.Split)
		append_raw(&code, i16(-original_len))
		append_raw(&code, i16(SPLIT_SIZE))
	case ^Node_Repeat_One_Non_Greedy:
		// e+? => body Split(x=exit, y=back to body)
		code = generate_code(c, specific.inner)
		original_len := len(code)
		append(&code, Opcode.Split)
		append_raw(&code, i16(SPLIT_SIZE))
		append_raw(&code, i16(-original_len))
	case ^Node_Repeat_N:
		inside := generate_code(c, specific.inner)
		original_len := len(inside)
		if specific.lower == specific.upper { // {N}
			// e{N} ... evaluates to ... e^N
			for i := 0; i < specific.upper; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else if specific.lower == -1 && specific.upper > 0 { // {,M}
			// e{,M} ... evaluates to ... e?^M
			for i := 0; i < specific.upper; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE))
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else if specific.lower >= 0 && specific.upper == -1 { // {N,}
			// e{N,} ... evaluates to ... e^N e*
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			append(&code, Opcode.Split)
			append_raw(&code, i16(SPLIT_SIZE))
			append_raw(&code, i16(SPLIT_SIZE + original_len + JUMP_SIZE))
			for opcode in inside {
				append(&code, opcode)
			}
			append(&code, Opcode.Jump)
			append_raw(&code, i16(-original_len - SPLIT_SIZE))
		} else if specific.lower >= 0 && specific.upper > 0 {
			// e{N,M} evaluates to ... e^N e?^(M-N)
			for i := 0; i < specific.lower; i += 1 {
				for opcode in inside {
					append(&code, opcode)
				}
			}
			for i := 0; i < specific.upper - specific.lower; i += 1 {
				append(&code, Opcode.Split)
				append_raw(&code, i16(SPLIT_SIZE + original_len))
				append_raw(&code, i16(SPLIT_SIZE))
				for opcode in inside {
					append(&code, opcode)
				}
			}
		} else {
			// The parser should never hand us lower/upper combinations
			// outside the four shapes above.
			panic("RegEx compiler received invalid repetition group.")
		}
	case ^Node_Optional:
		// e? => Split(x=body, y=skip) body
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE + original_len))
	case ^Node_Optional_Non_Greedy:
		// e?? => Split(x=skip, y=body) body
		code = generate_code(c, specific.inner)
		original_len := len(code)
		inject_at(&code, 0, Opcode.Split)
		inject_raw(&code, size_of(byte) , i16(SPLIT_SIZE + original_len))
		inject_raw(&code, size_of(byte) + size_of(i16), i16(SPLIT_SIZE))
	case ^Node_Match_All_And_Escape:
		append(&code, Opcode.Match_All_And_Escape)
	}
	return
}
/*
Compile a parsed regular-expression tree into a VM program.

Pipeline:
 1. collect distinct rune classes (`map_all_classes`),
 2. generate relative-jump bytecode (`generate_code`),
 3. for .Global, prefix an optimized Wait opcode or an unanchored `.*?`,
 4. wrap in Save 0/Save 1 + Match (or Match_And_Exit with .No_Capture),
 5. optionally chain-collapse Jump-to-Jump sequences,
 6. rewrite relative i16 jump operands into absolute u16 program counters.

Returns .Too_Many_Classes or .Program_Too_Big when the respective
common.MAX_* limit is reached.
*/
@require_results
compile :: proc(tree: Node, flags: common.Flags) -> (code: Program, class_data: [dynamic]Rune_Class_Data, err: Error) {
	if tree == nil {
		// Empty pattern: a program that immediately matches the empty string.
		if .No_Capture not_in flags {
			append(&code, Opcode.Save); append(&code, Opcode(0x00))
			append(&code, Opcode.Save); append(&code, Opcode(0x01))
			append(&code, Opcode.Match)
		} else {
			append(&code, Opcode.Match_And_Exit)
		}
		return
	}
	c: Compiler
	c.flags = flags
	map_all_classes(tree, &class_data)
	if len(class_data) >= common.MAX_CLASSES {
		err = .Too_Many_Classes
		return
	}
	c.class_data = class_data
	code = generate_code(&c, tree)
	// `pc_open` tracks how many bytes have been injected at the front of
	// the program so far; later injections and reads are offset by it.
	pc_open := 0
	add_global: if .Global in flags {
		// Check if the opening to the pattern is predictable.
		// If so, use one of the optimized Wait opcodes.
		iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
		// Scan past any leading Save opcodes to the first consuming opcode.
		seek_loop: for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
			#partial switch opcode {
			case .Byte:
				// Prefix: Wait_For_Byte <operand copied from the Byte opcode>.
				inject_at(&code, pc_open, Opcode.Wait_For_Byte)
				pc_open += size_of(Opcode)
				// The original operand has shifted right by the bytes
				// injected so far (pc_open), hence the extra offset.
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Rune:
				// Rune operands are wider than a byte and may be unaligned.
				operand := intrinsics.unaligned_load(cast(^rune)&code[pc+1])
				inject_at(&code, pc_open, Opcode.Wait_For_Rune)
				pc_open += size_of(Opcode)
				inject_raw(&code, pc_open, operand)
				pc_open += size_of(rune)
				break add_global
			case .Rune_Class:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Rune_Class_Negated:
				inject_at(&code, pc_open, Opcode.Wait_For_Rune_Class_Negated)
				pc_open += size_of(Opcode)
				inject_at(&code, pc_open, Opcode(code[pc + size_of(Opcode) + pc_open]))
				pc_open += size_of(u8)
				break add_global
			case .Save:
				continue
			case:
				// Unpredictable opening — fall through to the generic prefix.
				break seek_loop
			}
		}
		// `.*?`
		// Generic unanchored prefix: a non-greedy wildcard loop,
		// Split(x=past the loop, y=wildcard) Wildcard Jump(back to Split).
		inject_at(&code, pc_open, Opcode.Split)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE + size_of(byte) + JUMP_SIZE))
		pc_open += size_of(i16)
		inject_raw(&code, pc_open, i16(SPLIT_SIZE))
		pc_open += size_of(i16)
		inject_at(&code, pc_open, Opcode.Wildcard)
		pc_open += size_of(byte)
		inject_at(&code, pc_open, Opcode.Jump)
		pc_open += size_of(byte)
		inject_raw(&code, pc_open, i16(-size_of(byte) - SPLIT_SIZE))
		pc_open += size_of(i16)
	}
	if .No_Capture not_in flags {
		// `(` <generated code>
		inject_at(&code, pc_open, Opcode.Save)
		inject_at(&code, pc_open + size_of(byte), Opcode(0x00))
		// `)`
		append(&code, Opcode.Save); append(&code, Opcode(0x01))
		append(&code, Opcode.Match)
	} else {
		append(&code, Opcode.Match_And_Exit)
	}
	if len(code) >= common.MAX_PROGRAM_SIZE {
		err = .Program_Too_Big
		return
	}
	// NOTE: No further opcode addition beyond this point, as we've already
	// checked the program size. Removal or transformation is fine.
	// Post-Compile Optimizations:
	// * Jump Extension
	//
	// A:RelJmp(1) -> B:RelJmp(2) => A:RelJmp(2)
	if .No_Optimization not_in flags {
		// Each pass may create new chains; repeat until a pass changes nothing.
		for passes_left := 1; passes_left > 0; passes_left -= 1 {
			do_another_pass := false
			iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
			for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
				#partial switch opcode {
				case .Jump:
					// If this Jump lands on another Jump, fold the two
					// relative offsets into one.
					jmp := cast(^i16)&code[pc+size_of(Opcode)]
					if code[cast(i16)pc+jmp^] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp^+size_of(Opcode)])
						jmp^ = jmp^ + next_jmp
						do_another_pass = true
					}
				case .Split:
					// Same folding, applied independently to both branches.
					jmp_x := cast(^i16)&code[pc+size_of(Opcode)]
					if code[cast(i16)pc+jmp_x^] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_x^+size_of(Opcode)])
						jmp_x^ = jmp_x^ + next_jmp
						do_another_pass = true
					}
					jmp_y := cast(^i16)&code[pc+size_of(Opcode)+size_of(i16)]
					if code[cast(i16)pc+jmp_y^] == .Jump {
						next_jmp := intrinsics.unaligned_load(cast(^i16)&code[cast(i16)pc+jmp_y^+size_of(Opcode)])
						jmp_y^ = jmp_y^ + next_jmp
						do_another_pass = true
					}
				}
			}
			if do_another_pass {
				passes_left += 1
			}
		}
	}
	// * Relative Jump to Absolute Jump
	//
	// RelJmp{PC +/- N} => AbsJmp{M}
	// Reinterpret each i16 relative operand as u16 and add the instruction's
	// own pc, yielding an absolute target.
	iter := virtual_machine.Opcode_Iterator{ code[:], 0 }
	for opcode, pc in virtual_machine.iterate_opcodes(&iter) {
		// NOTE: The virtual machine implementation depends on this.
		#partial switch opcode {
		case .Jump:
			jmp := cast(^u16)&code[pc+size_of(Opcode)]
			jmp^ = jmp^ + cast(u16)pc
		case .Split:
			jmp_x := cast(^u16)&code[pc+size_of(Opcode)]
			jmp_x^ = jmp_x^ + cast(u16)pc
			jmp_y := cast(^u16)&code[pc+size_of(Opcode)+size_of(i16)]
			jmp_y^ = jmp_y^ + cast(u16)pc
		}
	}
	return
}
  447. }