regexp.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. package goja
  2. import (
  3. "fmt"
  4. "github.com/dlclark/regexp2"
  5. "github.com/dop251/goja/unistring"
  6. "io"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode/utf16"
  11. )
  12. type regexp2MatchCache struct {
  13. target valueString
  14. runes []rune
  15. posMap []int
  16. }
  17. // Not goroutine-safe. Use regexp2Wrapper.clone()
  18. type regexp2Wrapper struct {
  19. rx *regexp2.Regexp
  20. cache *regexp2MatchCache
  21. }
  22. type regexpWrapper regexp.Regexp
  23. type positionMapItem struct {
  24. src, dst int
  25. }
  26. type positionMap []positionMapItem
  27. func (m positionMap) get(src int) int {
  28. if src <= 0 {
  29. return src
  30. }
  31. res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
  32. if res >= len(m) || m[res].src != src {
  33. panic("index not found")
  34. }
  35. return m[res].dst
  36. }
  37. type arrayRuneReader struct {
  38. runes []rune
  39. pos int
  40. }
  41. func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
  42. if rd.pos < len(rd.runes) {
  43. r = rd.runes[rd.pos]
  44. size = 1
  45. rd.pos++
  46. } else {
  47. err = io.EOF
  48. }
  49. return
  50. }
  51. // Not goroutine-safe. Use regexpPattern.clone()
  52. type regexpPattern struct {
  53. src string
  54. global, ignoreCase, multiline, sticky, unicode bool
  55. regexpWrapper *regexpWrapper
  56. regexp2Wrapper *regexp2Wrapper
  57. }
  58. func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
  59. var opts regexp2.RegexOptions = regexp2.ECMAScript
  60. if multiline {
  61. opts |= regexp2.Multiline
  62. }
  63. if ignoreCase {
  64. opts |= regexp2.IgnoreCase
  65. }
  66. regexp2Pattern, err1 := regexp2.Compile(src, opts)
  67. if err1 != nil {
  68. return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
  69. }
  70. return &regexp2Wrapper{rx: regexp2Pattern}, nil
  71. }
  72. func (p *regexpPattern) createRegexp2() {
  73. if p.regexp2Wrapper != nil {
  74. return
  75. }
  76. rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
  77. if err != nil {
  78. // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
  79. panic(err)
  80. }
  81. p.regexp2Wrapper = rx
  82. }
  83. func buildUTF8PosMap(s unicodeString) (positionMap, string) {
  84. pm := make(positionMap, 0, s.length())
  85. rd := s.reader()
  86. sPos, utf8Pos := 0, 0
  87. var sb strings.Builder
  88. for {
  89. r, size, err := rd.ReadRune()
  90. if err == io.EOF {
  91. break
  92. }
  93. if err != nil {
  94. // the string contains invalid UTF-16, bailing out
  95. return nil, ""
  96. }
  97. utf8Size, _ := sb.WriteRune(r)
  98. sPos += size
  99. utf8Pos += utf8Size
  100. pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
  101. }
  102. return pm, sb.String()
  103. }
  104. func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int {
  105. if p.regexpWrapper == nil {
  106. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  107. }
  108. if start != 0 {
  109. // Unfortunately Go's regexp library does not allow starting from an arbitrary position.
  110. // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
  111. // work correctly.
  112. p.createRegexp2()
  113. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  114. }
  115. return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  116. }
  117. func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int, sticky bool) [][]int {
  118. if p.regexpWrapper == nil {
  119. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  120. }
  121. if start == 0 {
  122. a, u := devirtualizeString(s)
  123. if u == nil {
  124. return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky)
  125. }
  126. if limit == 1 {
  127. result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode)
  128. if result == nil {
  129. return nil
  130. }
  131. return [][]int{result}
  132. }
  133. // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
  134. // input.
  135. if p.unicode {
  136. // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
  137. pm, str := buildUTF8PosMap(u)
  138. if pm != nil {
  139. res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
  140. for _, result := range res {
  141. for i, idx := range result {
  142. result[i] = pm.get(idx)
  143. }
  144. }
  145. return res
  146. }
  147. }
  148. }
  149. p.createRegexp2()
  150. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  151. }
  152. // clone creates a copy of the regexpPattern which can be used concurrently.
  153. func (p *regexpPattern) clone() *regexpPattern {
  154. ret := &regexpPattern{
  155. src: p.src,
  156. global: p.global,
  157. ignoreCase: p.ignoreCase,
  158. multiline: p.multiline,
  159. sticky: p.sticky,
  160. unicode: p.unicode,
  161. }
  162. if p.regexpWrapper != nil {
  163. ret.regexpWrapper = p.regexpWrapper.clone()
  164. }
  165. if p.regexp2Wrapper != nil {
  166. ret.regexp2Wrapper = p.regexp2Wrapper.clone()
  167. }
  168. return ret
  169. }
  170. type regexpObject struct {
  171. baseObject
  172. pattern *regexpPattern
  173. source valueString
  174. standard bool
  175. }
  176. func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode, doCache bool) (result []int) {
  177. if fullUnicode {
  178. return r.findSubmatchIndexUnicode(s, start, doCache)
  179. }
  180. return r.findSubmatchIndexUTF16(s, start, doCache)
  181. }
  182. func (r *regexp2Wrapper) findUTF16Cached(s valueString, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
  183. wrapped := r.rx
  184. cache := r.cache
  185. if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
  186. runes = cache.runes
  187. } else {
  188. runes = s.utf16Runes()
  189. cache = nil
  190. }
  191. match, err = wrapped.FindRunesMatchStartingAt(runes, start)
  192. if doCache && match != nil && err == nil {
  193. if cache == nil {
  194. if r.cache == nil {
  195. r.cache = new(regexp2MatchCache)
  196. }
  197. *r.cache = regexp2MatchCache{
  198. target: s,
  199. runes: runes,
  200. }
  201. }
  202. } else {
  203. r.cache = nil
  204. }
  205. return
  206. }
  207. func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int, doCache bool) (result []int) {
  208. match, _, err := r.findUTF16Cached(s, start, doCache)
  209. if err != nil {
  210. return
  211. }
  212. if match == nil {
  213. return
  214. }
  215. groups := match.Groups()
  216. result = make([]int, 0, len(groups)<<1)
  217. for _, group := range groups {
  218. if len(group.Captures) > 0 {
  219. result = append(result, group.Index, group.Index+group.Length)
  220. } else {
  221. result = append(result, -1, 0)
  222. }
  223. }
  224. return
  225. }
  226. func (r *regexp2Wrapper) findUnicodeCached(s valueString, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
  227. var (
  228. runes []rune
  229. mappedStart int
  230. splitPair bool
  231. savedRune rune
  232. )
  233. wrapped := r.rx
  234. cache := r.cache
  235. if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
  236. runes, posMap = cache.runes, cache.posMap
  237. mappedStart, splitPair = posMapReverseLookup(posMap, start)
  238. } else {
  239. posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.length(), start)
  240. cache = nil
  241. }
  242. if splitPair {
  243. // temporarily set the rune at mappedStart to the second code point of the pair
  244. _, second := utf16.EncodeRune(runes[mappedStart])
  245. savedRune, runes[mappedStart] = runes[mappedStart], second
  246. }
  247. match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
  248. if doCache && match != nil && err == nil {
  249. if splitPair {
  250. runes[mappedStart] = savedRune
  251. }
  252. if cache == nil {
  253. if r.cache == nil {
  254. r.cache = new(regexp2MatchCache)
  255. }
  256. *r.cache = regexp2MatchCache{
  257. target: s,
  258. runes: runes,
  259. posMap: posMap,
  260. }
  261. }
  262. } else {
  263. r.cache = nil
  264. }
  265. return
  266. }
  267. func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int, doCache bool) (result []int) {
  268. match, posMap, err := r.findUnicodeCached(s, start, doCache)
  269. if match == nil || err != nil {
  270. return
  271. }
  272. groups := match.Groups()
  273. result = make([]int, 0, len(groups)<<1)
  274. for _, group := range groups {
  275. if len(group.Captures) > 0 {
  276. result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
  277. } else {
  278. result = append(result, -1, 0)
  279. }
  280. }
  281. return
  282. }
  283. func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int {
  284. wrapped := r.rx
  285. match, runes, err := r.findUTF16Cached(s, start, false)
  286. if match == nil || err != nil {
  287. return nil
  288. }
  289. if limit < 0 {
  290. limit = len(runes) + 1
  291. }
  292. results := make([][]int, 0, limit)
  293. for match != nil {
  294. groups := match.Groups()
  295. result := make([]int, 0, len(groups)<<1)
  296. for _, group := range groups {
  297. if len(group.Captures) > 0 {
  298. startPos := group.Index
  299. endPos := group.Index + group.Length
  300. result = append(result, startPos, endPos)
  301. } else {
  302. result = append(result, -1, 0)
  303. }
  304. }
  305. if sticky && len(result) > 1 {
  306. if result[0] != start {
  307. break
  308. }
  309. start = result[1]
  310. }
  311. results = append(results, result)
  312. limit--
  313. if limit <= 0 {
  314. break
  315. }
  316. match, err = wrapped.FindNextMatch(match)
  317. if err != nil {
  318. return nil
  319. }
  320. }
  321. return results
  322. }
  323. func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
  324. posMap = make([]int, 0, l+1)
  325. curPos := 0
  326. runes = make([]rune, 0, l)
  327. startFound := false
  328. for {
  329. if !startFound {
  330. if curPos == start {
  331. mappedStart = len(runes)
  332. startFound = true
  333. }
  334. if curPos > start {
  335. // start position splits a surrogate pair
  336. mappedStart = len(runes) - 1
  337. splitPair = true
  338. startFound = true
  339. }
  340. }
  341. rn, size, err := rd.ReadRune()
  342. if err != nil {
  343. break
  344. }
  345. runes = append(runes, rn)
  346. posMap = append(posMap, curPos)
  347. curPos += size
  348. }
  349. posMap = append(posMap, curPos)
  350. return
  351. }
  352. func posMapReverseLookup(posMap []int, pos int) (int, bool) {
  353. mapped := sort.SearchInts(posMap, pos)
  354. if mapped < len(posMap) && posMap[mapped] != pos {
  355. return mapped - 1, true
  356. }
  357. return mapped, false
  358. }
  359. func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
  360. wrapped := r.rx
  361. if limit < 0 {
  362. limit = len(s) + 1
  363. }
  364. results := make([][]int, 0, limit)
  365. match, posMap, err := r.findUnicodeCached(s, start, false)
  366. if err != nil {
  367. return nil
  368. }
  369. for match != nil {
  370. groups := match.Groups()
  371. result := make([]int, 0, len(groups)<<1)
  372. for _, group := range groups {
  373. if len(group.Captures) > 0 {
  374. start := posMap[group.Index]
  375. end := posMap[group.Index+group.Length]
  376. result = append(result, start, end)
  377. } else {
  378. result = append(result, -1, 0)
  379. }
  380. }
  381. if sticky && len(result) > 1 {
  382. if result[0] != start {
  383. break
  384. }
  385. start = result[1]
  386. }
  387. results = append(results, result)
  388. match, err = wrapped.FindNextMatch(match)
  389. if err != nil {
  390. return nil
  391. }
  392. }
  393. return results
  394. }
  395. func (r *regexp2Wrapper) findAllSubmatchIndex(s valueString, start, limit int, sticky, fullUnicode bool) [][]int {
  396. a, u := devirtualizeString(s)
  397. if u != nil {
  398. if fullUnicode {
  399. return r.findAllSubmatchIndexUnicode(u, start, limit, sticky)
  400. }
  401. return r.findAllSubmatchIndexUTF16(u, start, limit, sticky)
  402. }
  403. return r.findAllSubmatchIndexUTF16(a, start, limit, sticky)
  404. }
  405. func (r *regexp2Wrapper) clone() *regexp2Wrapper {
  406. return &regexp2Wrapper{
  407. rx: r.rx,
  408. }
  409. }
  410. func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
  411. wrapped := (*regexp.Regexp)(r)
  412. results = wrapped.FindAllStringSubmatchIndex(s, limit)
  413. pos := 0
  414. if sticky {
  415. for i, result := range results {
  416. if len(result) > 1 {
  417. if result[0] != pos {
  418. return results[:i]
  419. }
  420. pos = result[1]
  421. }
  422. }
  423. }
  424. return
  425. }
  426. func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) []int {
  427. a, u := devirtualizeString(s)
  428. if u != nil {
  429. return r.findSubmatchIndexUnicode(u, fullUnicode)
  430. }
  431. return r.findSubmatchIndexASCII(string(a))
  432. }
  433. func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
  434. wrapped := (*regexp.Regexp)(r)
  435. return wrapped.FindStringSubmatchIndex(s)
  436. }
  437. func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
  438. wrapped := (*regexp.Regexp)(r)
  439. if fullUnicode {
  440. posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.length(), 0)
  441. res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
  442. for i, item := range res {
  443. if item >= 0 {
  444. res[i] = posMap[item]
  445. }
  446. }
  447. return res
  448. }
  449. return wrapped.FindReaderSubmatchIndex(s.utf16Reader())
  450. }
  451. func (r *regexpWrapper) clone() *regexpWrapper {
  452. return r
  453. }
  454. func (r *regexpObject) execResultToArray(target valueString, result []int) Value {
  455. captureCount := len(result) >> 1
  456. valueArray := make([]Value, captureCount)
  457. matchIndex := result[0]
  458. valueArray[0] = target.substring(result[0], result[1])
  459. lowerBound := 0
  460. for index := 1; index < captureCount; index++ {
  461. offset := index << 1
  462. if result[offset] >= 0 && result[offset+1] >= lowerBound {
  463. valueArray[index] = target.substring(result[offset], result[offset+1])
  464. lowerBound = result[offset]
  465. } else {
  466. valueArray[index] = _undefined
  467. }
  468. }
  469. match := r.val.runtime.newArrayValues(valueArray)
  470. match.self.setOwnStr("input", target, false)
  471. match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
  472. return match
  473. }
  474. func (r *regexpObject) getLastIndex() int64 {
  475. lastIndex := toLength(r.getStr("lastIndex", nil))
  476. if !r.pattern.global && !r.pattern.sticky {
  477. return 0
  478. }
  479. return lastIndex
  480. }
  481. func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
  482. if r.pattern.sticky {
  483. if firstResult == nil || int64(firstResult[0]) != index {
  484. r.setOwnStr("lastIndex", intToValue(0), true)
  485. return false
  486. }
  487. } else {
  488. if firstResult == nil {
  489. if r.pattern.global {
  490. r.setOwnStr("lastIndex", intToValue(0), true)
  491. }
  492. return false
  493. }
  494. }
  495. if r.pattern.global || r.pattern.sticky {
  496. r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
  497. }
  498. return true
  499. }
  500. func (r *regexpObject) execRegexp(target valueString) (match bool, result []int) {
  501. index := r.getLastIndex()
  502. if index >= 0 && index <= int64(target.length()) {
  503. result = r.pattern.findSubmatchIndex(target, int(index))
  504. }
  505. match = r.updateLastIndex(index, result, result)
  506. return
  507. }
  508. func (r *regexpObject) exec(target valueString) Value {
  509. match, result := r.execRegexp(target)
  510. if match {
  511. return r.execResultToArray(target, result)
  512. }
  513. return _null
  514. }
  515. func (r *regexpObject) test(target valueString) bool {
  516. match, _ := r.execRegexp(target)
  517. return match
  518. }
  519. func (r *regexpObject) clone() *regexpObject {
  520. r1 := r.val.runtime.newRegexpObject(r.prototype)
  521. r1.source = r.source
  522. r1.pattern = r.pattern
  523. return r1
  524. }
  525. func (r *regexpObject) init() {
  526. r.baseObject.init()
  527. r.standard = true
  528. r._putProp("lastIndex", intToValue(0), true, false, false)
  529. }
  530. func (r *regexpObject) setProto(proto *Object, throw bool) bool {
  531. res := r.baseObject.setProto(proto, throw)
  532. if res {
  533. r.standard = false
  534. }
  535. return res
  536. }
  537. func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
  538. res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
  539. if res {
  540. r.standard = false
  541. }
  542. return res
  543. }
  544. func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
  545. res := r.baseObject.defineOwnPropertySym(name, desc, throw)
  546. if res && r.standard {
  547. switch name {
  548. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  549. r.standard = false
  550. }
  551. }
  552. return res
  553. }
  554. func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
  555. res := r.baseObject.deleteStr(name, throw)
  556. if res {
  557. r.standard = false
  558. }
  559. return res
  560. }
  561. func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
  562. res := r.baseObject.setOwnStr(name, value, throw)
  563. if res && r.standard && name == "exec" {
  564. r.standard = false
  565. }
  566. return res
  567. }
  568. func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
  569. res := r.baseObject.setOwnSym(name, value, throw)
  570. if res && r.standard {
  571. switch name {
  572. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  573. r.standard = false
  574. }
  575. }
  576. return res
  577. }