regexp.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650
  1. package goja
  2. import (
  3. "fmt"
  4. "github.com/dlclark/regexp2"
  5. "github.com/dop251/goja/unistring"
  6. "io"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode/utf16"
  11. )
// regexp2MatchCache remembers the decoded form of the last string matched by a
// regexp2Wrapper, so that a later match against the same string can reuse it
// instead of decoding the string again.
type regexp2MatchCache struct {
	// target is the string the cached data was produced from; lookups compare
	// it with the incoming string via SameAs.
	target String
	// runes is the decoded input that is handed to the regexp2 matcher.
	runes []rune
	// posMap maps a rune index to an offset in target. It is only populated by
	// the full-Unicode path; the UTF-16 path leaves it nil, and the two paths
	// use that to tell their cache entries apart.
	posMap []int
}
// regexp2Wrapper couples a compiled regexp2 expression with a one-entry cache
// of the most recently matched string.
//
// Not goroutine-safe. Use regexp2Wrapper.clone()
type regexp2Wrapper struct {
	// rx is the compiled regexp2 expression.
	rx *regexp2.Regexp
	// cache holds the decoded form of the last matched string, or nil.
	cache *regexp2MatchCache
}
// regexpWrapper is the standard library regexp.Regexp under a local type name,
// which lets this package attach its own matching helpers to it. Methods
// convert the receiver back with (*regexp.Regexp)(r).
type regexpWrapper regexp.Regexp
  23. type positionMapItem struct {
  24. src, dst int
  25. }
  26. type positionMap []positionMapItem
  27. func (m positionMap) get(src int) int {
  28. if src <= 0 {
  29. return src
  30. }
  31. res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
  32. if res >= len(m) || m[res].src != src {
  33. panic("index not found")
  34. }
  35. return m[res].dst
  36. }
  37. type arrayRuneReader struct {
  38. runes []rune
  39. pos int
  40. }
  41. func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
  42. if rd.pos < len(rd.runes) {
  43. r = rd.runes[rd.pos]
  44. size = 1
  45. rd.pos++
  46. } else {
  47. err = io.EOF
  48. }
  49. return
  50. }
// regexpPattern is a compiled regular expression together with its flags. It
// can be backed by the standard library engine, by regexp2, or by both.
//
// Not goroutine-safe. Use regexpPattern.clone()
type regexpPattern struct {
	// src is the pattern source that is handed to compileRegexp2.
	src string

	// Flags of the pattern.
	global, ignoreCase, multiline, dotAll, sticky, unicode bool

	// regexpWrapper is the standard library implementation. When it is nil
	// all matching goes through regexp2Wrapper.
	regexpWrapper *regexpWrapper
	// regexp2Wrapper is the regexp2 implementation. When regexpWrapper is set
	// this may be nil until createRegexp2 compiles it on demand.
	regexp2Wrapper *regexp2Wrapper
}
  58. func compileRegexp2(src string, multiline, dotAll, ignoreCase, unicode bool) (*regexp2Wrapper, error) {
  59. var opts regexp2.RegexOptions = regexp2.ECMAScript
  60. if multiline {
  61. opts |= regexp2.Multiline
  62. }
  63. if dotAll {
  64. opts |= regexp2.Singleline
  65. }
  66. if ignoreCase {
  67. opts |= regexp2.IgnoreCase
  68. }
  69. if unicode {
  70. opts |= regexp2.Unicode
  71. }
  72. regexp2Pattern, err1 := regexp2.Compile(src, opts)
  73. if err1 != nil {
  74. return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
  75. }
  76. return &regexp2Wrapper{rx: regexp2Pattern}, nil
  77. }
  78. func (p *regexpPattern) createRegexp2() {
  79. if p.regexp2Wrapper != nil {
  80. return
  81. }
  82. rx, err := compileRegexp2(p.src, p.multiline, p.dotAll, p.ignoreCase, p.unicode)
  83. if err != nil {
  84. // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
  85. panic(err)
  86. }
  87. p.regexp2Wrapper = rx
  88. }
  89. func buildUTF8PosMap(s unicodeString) (positionMap, string) {
  90. pm := make(positionMap, 0, s.Length())
  91. rd := s.Reader()
  92. sPos, utf8Pos := 0, 0
  93. var sb strings.Builder
  94. for {
  95. r, size, err := rd.ReadRune()
  96. if err == io.EOF {
  97. break
  98. }
  99. if err != nil {
  100. // the string contains invalid UTF-16, bailing out
  101. return nil, ""
  102. }
  103. utf8Size, _ := sb.WriteRune(r)
  104. sPos += size
  105. utf8Pos += utf8Size
  106. pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
  107. }
  108. return pm, sb.String()
  109. }
  110. func (p *regexpPattern) findSubmatchIndex(s String, start int) []int {
  111. if p.regexpWrapper == nil {
  112. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  113. }
  114. if start != 0 {
  115. // Unfortunately Go's regexp library does not allow starting from an arbitrary position.
  116. // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
  117. // work correctly.
  118. p.createRegexp2()
  119. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  120. }
  121. return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  122. }
  123. func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int {
  124. if p.regexpWrapper == nil {
  125. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  126. }
  127. if start == 0 {
  128. a, u := devirtualizeString(s)
  129. if u == nil {
  130. return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky)
  131. }
  132. if limit == 1 {
  133. result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode)
  134. if result == nil {
  135. return nil
  136. }
  137. return [][]int{result}
  138. }
  139. // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
  140. // input.
  141. if p.unicode {
  142. // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
  143. pm, str := buildUTF8PosMap(u)
  144. if pm != nil {
  145. res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
  146. for _, result := range res {
  147. for i, idx := range result {
  148. result[i] = pm.get(idx)
  149. }
  150. }
  151. return res
  152. }
  153. }
  154. }
  155. p.createRegexp2()
  156. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  157. }
  158. // clone creates a copy of the regexpPattern which can be used concurrently.
  159. func (p *regexpPattern) clone() *regexpPattern {
  160. ret := &regexpPattern{
  161. src: p.src,
  162. global: p.global,
  163. ignoreCase: p.ignoreCase,
  164. multiline: p.multiline,
  165. dotAll: p.dotAll,
  166. sticky: p.sticky,
  167. unicode: p.unicode,
  168. }
  169. if p.regexpWrapper != nil {
  170. ret.regexpWrapper = p.regexpWrapper.clone()
  171. }
  172. if p.regexp2Wrapper != nil {
  173. ret.regexp2Wrapper = p.regexp2Wrapper.clone()
  174. }
  175. return ret
  176. }
// regexpObject is the object implementation backing a regular expression
// value.
type regexpObject struct {
	baseObject
	// pattern is the compiled expression used for matching.
	pattern *regexpPattern
	// source is copied along with pattern by clone.
	// NOTE(review): presumably the original pattern text - confirm against the constructor.
	source String

	// standard is set by init and cleared whenever the prototype is replaced,
	// an own property is defined or deleted, "exec" is assigned, or one of the
	// match/matchAll/search/split/replace symbols is set or defined.
	standard bool
}
  183. func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) {
  184. if fullUnicode {
  185. return r.findSubmatchIndexUnicode(s, start, doCache)
  186. }
  187. return r.findSubmatchIndexUTF16(s, start, doCache)
  188. }
  189. func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
  190. wrapped := r.rx
  191. cache := r.cache
  192. if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
  193. runes = cache.runes
  194. } else {
  195. runes = s.utf16Runes()
  196. cache = nil
  197. }
  198. match, err = wrapped.FindRunesMatchStartingAt(runes, start)
  199. if doCache && match != nil && err == nil {
  200. if cache == nil {
  201. if r.cache == nil {
  202. r.cache = new(regexp2MatchCache)
  203. }
  204. *r.cache = regexp2MatchCache{
  205. target: s,
  206. runes: runes,
  207. }
  208. }
  209. } else {
  210. r.cache = nil
  211. }
  212. return
  213. }
  214. func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) {
  215. match, _, err := r.findUTF16Cached(s, start, doCache)
  216. if err != nil {
  217. return
  218. }
  219. if match == nil {
  220. return
  221. }
  222. groups := match.Groups()
  223. result = make([]int, 0, len(groups)<<1)
  224. for _, group := range groups {
  225. if len(group.Captures) > 0 {
  226. result = append(result, group.Index, group.Index+group.Length)
  227. } else {
  228. result = append(result, -1, 0)
  229. }
  230. }
  231. return
  232. }
// findUnicodeCached runs the matcher over s decoded into code points and
// returns the match together with posMap, which maps a rune index back to an
// offset in s.
//
// start is an offset in s. When it falls inside a surrogate pair, the rune at
// the mapped position is temporarily replaced by the second half of the pair
// so that matching begins there.
//
// The decoded runes and posMap are reused from the cache when it holds a
// full-Unicode entry (posMap != nil) for the same string. The cache is kept
// or refreshed only if doCache is set and a match was found without error;
// otherwise it is dropped.
func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
	var (
		runes       []rune
		mappedStart int
		splitPair   bool
		savedRune   rune
	)
	wrapped := r.rx
	cache := r.cache
	if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
		// cache hit: only the start offset has to be translated
		runes, posMap = cache.runes, cache.posMap
		mappedStart, splitPair = posMapReverseLookup(posMap, start)
	} else {
		posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start)
		// a nil local cache marks that the cache entry has to be (re)written below
		cache = nil
	}
	if splitPair {
		// temporarily set the rune at mappedStart to the second code point of the pair
		_, second := utf16.EncodeRune(runes[mappedStart])
		savedRune, runes[mappedStart] = runes[mappedStart], second
	}
	match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
	if doCache && match != nil && err == nil {
		if splitPair {
			// undo the patch so the runes stored in the cache are the real decoding
			runes[mappedStart] = savedRune
		}
		if cache == nil {
			if r.cache == nil {
				r.cache = new(regexp2MatchCache)
			}
			*r.cache = regexp2MatchCache{
				target: s,
				runes:  runes,
				posMap: posMap,
			}
		}
	} else {
		// The patched rune is not restored on this path; the cache that may
		// reference the slice is dropped instead.
		r.cache = nil
	}
	return
}
  274. func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) {
  275. match, posMap, err := r.findUnicodeCached(s, start, doCache)
  276. if match == nil || err != nil {
  277. return
  278. }
  279. groups := match.Groups()
  280. result = make([]int, 0, len(groups)<<1)
  281. for _, group := range groups {
  282. if len(group.Captures) > 0 {
  283. result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
  284. } else {
  285. result = append(result, -1, 0)
  286. }
  287. }
  288. return
  289. }
  290. func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int {
  291. wrapped := r.rx
  292. match, runes, err := r.findUTF16Cached(s, start, false)
  293. if match == nil || err != nil {
  294. return nil
  295. }
  296. if limit < 0 {
  297. limit = len(runes) + 1
  298. }
  299. results := make([][]int, 0, limit)
  300. for match != nil {
  301. groups := match.Groups()
  302. result := make([]int, 0, len(groups)<<1)
  303. for _, group := range groups {
  304. if len(group.Captures) > 0 {
  305. startPos := group.Index
  306. endPos := group.Index + group.Length
  307. result = append(result, startPos, endPos)
  308. } else {
  309. result = append(result, -1, 0)
  310. }
  311. }
  312. if sticky && len(result) > 1 {
  313. if result[0] != start {
  314. break
  315. }
  316. start = result[1]
  317. }
  318. results = append(results, result)
  319. limit--
  320. if limit <= 0 {
  321. break
  322. }
  323. match, err = wrapped.FindNextMatch(match)
  324. if err != nil {
  325. return nil
  326. }
  327. }
  328. return results
  329. }
  330. func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
  331. posMap = make([]int, 0, l+1)
  332. curPos := 0
  333. runes = make([]rune, 0, l)
  334. startFound := false
  335. for {
  336. if !startFound {
  337. if curPos == start {
  338. mappedStart = len(runes)
  339. startFound = true
  340. }
  341. if curPos > start {
  342. // start position splits a surrogate pair
  343. mappedStart = len(runes) - 1
  344. splitPair = true
  345. startFound = true
  346. }
  347. }
  348. rn, size, err := rd.ReadRune()
  349. if err != nil {
  350. break
  351. }
  352. runes = append(runes, rn)
  353. posMap = append(posMap, curPos)
  354. curPos += size
  355. }
  356. posMap = append(posMap, curPos)
  357. return
  358. }
  359. func posMapReverseLookup(posMap []int, pos int) (int, bool) {
  360. mapped := sort.SearchInts(posMap, pos)
  361. if mapped < len(posMap) && posMap[mapped] != pos {
  362. return mapped - 1, true
  363. }
  364. return mapped, false
  365. }
  366. func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
  367. wrapped := r.rx
  368. if limit < 0 {
  369. limit = len(s) + 1
  370. }
  371. results := make([][]int, 0, limit)
  372. match, posMap, err := r.findUnicodeCached(s, start, false)
  373. if err != nil {
  374. return nil
  375. }
  376. for match != nil {
  377. groups := match.Groups()
  378. result := make([]int, 0, len(groups)<<1)
  379. for _, group := range groups {
  380. if len(group.Captures) > 0 {
  381. start := posMap[group.Index]
  382. end := posMap[group.Index+group.Length]
  383. result = append(result, start, end)
  384. } else {
  385. result = append(result, -1, 0)
  386. }
  387. }
  388. if sticky && len(result) > 1 {
  389. if result[0] != start {
  390. break
  391. }
  392. start = result[1]
  393. }
  394. results = append(results, result)
  395. match, err = wrapped.FindNextMatch(match)
  396. if err != nil {
  397. return nil
  398. }
  399. }
  400. return results
  401. }
  402. func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int {
  403. a, u := devirtualizeString(s)
  404. if u != nil {
  405. if fullUnicode {
  406. return r.findAllSubmatchIndexUnicode(u, start, limit, sticky)
  407. }
  408. return r.findAllSubmatchIndexUTF16(u, start, limit, sticky)
  409. }
  410. return r.findAllSubmatchIndexUTF16(a, start, limit, sticky)
  411. }
  412. func (r *regexp2Wrapper) clone() *regexp2Wrapper {
  413. return &regexp2Wrapper{
  414. rx: r.rx,
  415. }
  416. }
  417. func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
  418. wrapped := (*regexp.Regexp)(r)
  419. results = wrapped.FindAllStringSubmatchIndex(s, limit)
  420. pos := 0
  421. if sticky {
  422. for i, result := range results {
  423. if len(result) > 1 {
  424. if result[0] != pos {
  425. return results[:i]
  426. }
  427. pos = result[1]
  428. }
  429. }
  430. }
  431. return
  432. }
  433. func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int {
  434. a, u := devirtualizeString(s)
  435. if u != nil {
  436. return r.findSubmatchIndexUnicode(u, fullUnicode)
  437. }
  438. return r.findSubmatchIndexASCII(string(a))
  439. }
  440. func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
  441. wrapped := (*regexp.Regexp)(r)
  442. return wrapped.FindStringSubmatchIndex(s)
  443. }
  444. func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
  445. wrapped := (*regexp.Regexp)(r)
  446. if fullUnicode {
  447. posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0)
  448. res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
  449. for i, item := range res {
  450. if item >= 0 {
  451. res[i] = posMap[item]
  452. }
  453. }
  454. return res
  455. }
  456. return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader())
  457. }
// clone returns the receiver itself rather than a copy; the wrapper holds no
// state of its own beyond the compiled expression.
// NOTE(review): this relies on regexp.Regexp being usable from several
// goroutines at once - the standard library documents it as such for matching.
func (r *regexpWrapper) clone() *regexpWrapper {
	return r
}
  461. func (r *regexpObject) execResultToArray(target String, result []int) Value {
  462. captureCount := len(result) >> 1
  463. valueArray := make([]Value, captureCount)
  464. matchIndex := result[0]
  465. valueArray[0] = target.Substring(result[0], result[1])
  466. lowerBound := 0
  467. for index := 1; index < captureCount; index++ {
  468. offset := index << 1
  469. if result[offset] >= 0 && result[offset+1] >= lowerBound {
  470. valueArray[index] = target.Substring(result[offset], result[offset+1])
  471. lowerBound = result[offset]
  472. } else {
  473. valueArray[index] = _undefined
  474. }
  475. }
  476. match := r.val.runtime.newArrayValues(valueArray)
  477. match.self.setOwnStr("input", target, false)
  478. match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
  479. return match
  480. }
  481. func (r *regexpObject) getLastIndex() int64 {
  482. lastIndex := toLength(r.getStr("lastIndex", nil))
  483. if !r.pattern.global && !r.pattern.sticky {
  484. return 0
  485. }
  486. return lastIndex
  487. }
  488. func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
  489. if r.pattern.sticky {
  490. if firstResult == nil || int64(firstResult[0]) != index {
  491. r.setOwnStr("lastIndex", intToValue(0), true)
  492. return false
  493. }
  494. } else {
  495. if firstResult == nil {
  496. if r.pattern.global {
  497. r.setOwnStr("lastIndex", intToValue(0), true)
  498. }
  499. return false
  500. }
  501. }
  502. if r.pattern.global || r.pattern.sticky {
  503. r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
  504. }
  505. return true
  506. }
  507. func (r *regexpObject) execRegexp(target String) (match bool, result []int) {
  508. index := r.getLastIndex()
  509. if index >= 0 && index <= int64(target.Length()) {
  510. result = r.pattern.findSubmatchIndex(target, int(index))
  511. }
  512. match = r.updateLastIndex(index, result, result)
  513. return
  514. }
  515. func (r *regexpObject) exec(target String) Value {
  516. match, result := r.execRegexp(target)
  517. if match {
  518. return r.execResultToArray(target, result)
  519. }
  520. return _null
  521. }
  522. func (r *regexpObject) test(target String) bool {
  523. match, _ := r.execRegexp(target)
  524. return match
  525. }
  526. func (r *regexpObject) clone() *regexpObject {
  527. r1 := r.val.runtime.newRegexpObject(r.prototype)
  528. r1.source = r.source
  529. r1.pattern = r.pattern
  530. return r1
  531. }
// init initialises the embedded base object, marks the regexp as standard and
// creates the "lastIndex" property with an initial value of 0.
func (r *regexpObject) init() {
	r.baseObject.init()
	r.standard = true
	// NOTE(review): the three flags are presumably writable, enumerable,
	// configurable (i.e. writable only) - confirm against _putProp.
	r._putProp("lastIndex", intToValue(0), true, false, false)
}
  537. func (r *regexpObject) setProto(proto *Object, throw bool) bool {
  538. res := r.baseObject.setProto(proto, throw)
  539. if res {
  540. r.standard = false
  541. }
  542. return res
  543. }
  544. func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
  545. res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
  546. if res {
  547. r.standard = false
  548. }
  549. return res
  550. }
  551. func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
  552. res := r.baseObject.defineOwnPropertySym(name, desc, throw)
  553. if res && r.standard {
  554. switch name {
  555. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  556. r.standard = false
  557. }
  558. }
  559. return res
  560. }
  561. func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
  562. res := r.baseObject.deleteStr(name, throw)
  563. if res {
  564. r.standard = false
  565. }
  566. return res
  567. }
  568. func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
  569. res := r.baseObject.setOwnStr(name, value, throw)
  570. if res && r.standard && name == "exec" {
  571. r.standard = false
  572. }
  573. return res
  574. }
  575. func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
  576. res := r.baseObject.setOwnSym(name, value, throw)
  577. if res && r.standard {
  578. switch name {
  579. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  580. r.standard = false
  581. }
  582. }
  583. return res
  584. }