regexp.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648
  1. package goja
  2. import (
  3. "fmt"
  4. "github.com/dlclark/regexp2"
  5. "github.com/dop251/goja/unistring"
  6. "io"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode/utf16"
  11. )
  12. type regexp2MatchCache struct {
  13. target valueString
  14. runes []rune
  15. posMap []int
  16. }
  17. // Not goroutine-safe. Use regexp2Wrapper.clone()
  18. type regexp2Wrapper struct {
  19. rx *regexp2.Regexp
  20. cache *regexp2MatchCache
  21. }
  22. type regexpWrapper regexp.Regexp
  23. type positionMapItem struct {
  24. src, dst int
  25. }
  26. type positionMap []positionMapItem
  27. func (m positionMap) get(src int) int {
  28. if src <= 0 {
  29. return src
  30. }
  31. res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
  32. if res >= len(m) || m[res].src != src {
  33. panic("index not found")
  34. }
  35. return m[res].dst
  36. }
  37. type arrayRuneReader struct {
  38. runes []rune
  39. pos int
  40. }
  41. func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
  42. if rd.pos < len(rd.runes) {
  43. r = rd.runes[rd.pos]
  44. size = 1
  45. rd.pos++
  46. } else {
  47. err = io.EOF
  48. }
  49. return
  50. }
  51. // Not goroutine-safe. Use regexpPattern.clone()
  52. type regexpPattern struct {
  53. src string
  54. global, ignoreCase, multiline, sticky, unicode bool
  55. regexpWrapper *regexpWrapper
  56. regexp2Wrapper *regexp2Wrapper
  57. }
  58. func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
  59. var opts regexp2.RegexOptions = regexp2.ECMAScript
  60. if multiline {
  61. opts |= regexp2.Multiline
  62. }
  63. if ignoreCase {
  64. opts |= regexp2.IgnoreCase
  65. }
  66. regexp2Pattern, err1 := regexp2.Compile(src, opts)
  67. if err1 != nil {
  68. return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
  69. }
  70. return &regexp2Wrapper{rx: regexp2Pattern}, nil
  71. }
  72. func (p *regexpPattern) createRegexp2() {
  73. if p.regexp2Wrapper != nil {
  74. return
  75. }
  76. rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
  77. if err != nil {
  78. // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
  79. panic(err)
  80. }
  81. p.regexp2Wrapper = rx
  82. }
  83. func buildUTF8PosMap(s valueString) (positionMap, string) {
  84. pm := make(positionMap, 0, s.length())
  85. rd := s.reader(0)
  86. sPos, utf8Pos := 0, 0
  87. var sb strings.Builder
  88. for {
  89. r, size, err := rd.ReadRune()
  90. if err == io.EOF {
  91. break
  92. }
  93. if err != nil {
  94. // the string contains invalid UTF-16, bailing out
  95. return nil, ""
  96. }
  97. utf8Size, _ := sb.WriteRune(r)
  98. sPos += size
  99. utf8Pos += utf8Size
  100. pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
  101. }
  102. return pm, sb.String()
  103. }
  104. func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int {
  105. if p.regexpWrapper == nil {
  106. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  107. }
  108. if start != 0 {
  109. // Unfortunately Go's regexp library does not allow starting from an arbitrary position.
  110. // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
  111. // work correctly.
  112. p.createRegexp2()
  113. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  114. }
  115. return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  116. }
  117. func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int, sticky bool) [][]int {
  118. if p.regexpWrapper == nil {
  119. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  120. }
  121. if start == 0 {
  122. if s, ok := s.(asciiString); ok {
  123. return p.regexpWrapper.findAllSubmatchIndex(s.String(), limit, sticky)
  124. }
  125. if limit == 1 {
  126. result := p.regexpWrapper.findSubmatchIndexUnicode(s.(unicodeString), p.unicode)
  127. if result == nil {
  128. return nil
  129. }
  130. return [][]int{result}
  131. }
  132. // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
  133. // input.
  134. if p.unicode {
  135. // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
  136. pm, str := buildUTF8PosMap(s)
  137. if pm != nil {
  138. res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
  139. for _, result := range res {
  140. for i, idx := range result {
  141. result[i] = pm.get(idx)
  142. }
  143. }
  144. return res
  145. }
  146. }
  147. }
  148. p.createRegexp2()
  149. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  150. }
  151. // clone creates a copy of the regexpPattern which can be used concurrently.
  152. func (p *regexpPattern) clone() *regexpPattern {
  153. ret := &regexpPattern{
  154. src: p.src,
  155. global: p.global,
  156. ignoreCase: p.ignoreCase,
  157. multiline: p.multiline,
  158. sticky: p.sticky,
  159. unicode: p.unicode,
  160. }
  161. if p.regexpWrapper != nil {
  162. ret.regexpWrapper = p.regexpWrapper.clone()
  163. }
  164. if p.regexp2Wrapper != nil {
  165. ret.regexp2Wrapper = p.regexp2Wrapper.clone()
  166. }
  167. return ret
  168. }
  169. type regexpObject struct {
  170. baseObject
  171. pattern *regexpPattern
  172. source valueString
  173. standard bool
  174. }
  175. func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode, doCache bool) (result []int) {
  176. if fullUnicode {
  177. return r.findSubmatchIndexUnicode(s, start, doCache)
  178. }
  179. return r.findSubmatchIndexUTF16(s, start, doCache)
  180. }
  181. func (r *regexp2Wrapper) findUTF16Cached(s valueString, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
  182. wrapped := r.rx
  183. cache := r.cache
  184. if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
  185. runes = cache.runes
  186. } else {
  187. runes = s.utf16Runes()
  188. cache = nil
  189. }
  190. match, err = wrapped.FindRunesMatchStartingAt(runes, start)
  191. if doCache && match != nil && err == nil {
  192. if cache == nil {
  193. if r.cache == nil {
  194. r.cache = new(regexp2MatchCache)
  195. }
  196. *r.cache = regexp2MatchCache{
  197. target: s,
  198. runes: runes,
  199. }
  200. }
  201. } else {
  202. r.cache = nil
  203. }
  204. return
  205. }
  206. func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int, doCache bool) (result []int) {
  207. match, _, err := r.findUTF16Cached(s, start, doCache)
  208. if err != nil {
  209. return
  210. }
  211. if match == nil {
  212. return
  213. }
  214. groups := match.Groups()
  215. result = make([]int, 0, len(groups)<<1)
  216. for _, group := range groups {
  217. if len(group.Captures) > 0 {
  218. result = append(result, group.Index, group.Index+group.Length)
  219. } else {
  220. result = append(result, -1, 0)
  221. }
  222. }
  223. return
  224. }
  225. func (r *regexp2Wrapper) findUnicodeCached(s valueString, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
  226. var (
  227. runes []rune
  228. mappedStart int
  229. splitPair bool
  230. savedRune rune
  231. )
  232. wrapped := r.rx
  233. cache := r.cache
  234. if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
  235. runes, posMap = cache.runes, cache.posMap
  236. mappedStart, splitPair = posMapReverseLookup(posMap, start)
  237. } else {
  238. posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
  239. cache = nil
  240. }
  241. if splitPair {
  242. // temporarily set the rune at mappedStart to the second code point of the pair
  243. _, second := utf16.EncodeRune(runes[mappedStart])
  244. savedRune, runes[mappedStart] = runes[mappedStart], second
  245. }
  246. match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
  247. if doCache && match != nil && err == nil {
  248. if splitPair {
  249. runes[mappedStart] = savedRune
  250. }
  251. if cache == nil {
  252. if r.cache == nil {
  253. r.cache = new(regexp2MatchCache)
  254. }
  255. *r.cache = regexp2MatchCache{
  256. target: s,
  257. runes: runes,
  258. posMap: posMap,
  259. }
  260. }
  261. } else {
  262. r.cache = nil
  263. }
  264. return
  265. }
  266. func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int, doCache bool) (result []int) {
  267. match, posMap, err := r.findUnicodeCached(s, start, doCache)
  268. if match == nil || err != nil {
  269. return
  270. }
  271. groups := match.Groups()
  272. result = make([]int, 0, len(groups)<<1)
  273. for _, group := range groups {
  274. if len(group.Captures) > 0 {
  275. result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
  276. } else {
  277. result = append(result, -1, 0)
  278. }
  279. }
  280. return
  281. }
  282. func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int {
  283. wrapped := r.rx
  284. match, runes, err := r.findUTF16Cached(s, start, false)
  285. if match == nil || err != nil {
  286. return nil
  287. }
  288. if limit < 0 {
  289. limit = len(runes) + 1
  290. }
  291. results := make([][]int, 0, limit)
  292. for match != nil {
  293. groups := match.Groups()
  294. result := make([]int, 0, len(groups)<<1)
  295. for _, group := range groups {
  296. if len(group.Captures) > 0 {
  297. startPos := group.Index
  298. endPos := group.Index + group.Length
  299. result = append(result, startPos, endPos)
  300. } else {
  301. result = append(result, -1, 0)
  302. }
  303. }
  304. if sticky && len(result) > 1 {
  305. if result[0] != start {
  306. break
  307. }
  308. start = result[1]
  309. }
  310. results = append(results, result)
  311. limit--
  312. if limit <= 0 {
  313. break
  314. }
  315. match, err = wrapped.FindNextMatch(match)
  316. if err != nil {
  317. return nil
  318. }
  319. }
  320. return results
  321. }
  322. func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
  323. posMap = make([]int, 0, l+1)
  324. curPos := 0
  325. runes = make([]rune, 0, l)
  326. startFound := false
  327. for {
  328. if !startFound {
  329. if curPos == start {
  330. mappedStart = len(runes)
  331. startFound = true
  332. }
  333. if curPos > start {
  334. // start position splits a surrogate pair
  335. mappedStart = len(runes) - 1
  336. splitPair = true
  337. startFound = true
  338. }
  339. }
  340. rn, size, err := rd.ReadRune()
  341. if err != nil {
  342. break
  343. }
  344. runes = append(runes, rn)
  345. posMap = append(posMap, curPos)
  346. curPos += size
  347. }
  348. posMap = append(posMap, curPos)
  349. return
  350. }
  351. func posMapReverseLookup(posMap []int, pos int) (int, bool) {
  352. mapped := sort.SearchInts(posMap, pos)
  353. if mapped < len(posMap) && posMap[mapped] != pos {
  354. return mapped - 1, true
  355. }
  356. return mapped, false
  357. }
  358. func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
  359. wrapped := r.rx
  360. if limit < 0 {
  361. limit = len(s) + 1
  362. }
  363. results := make([][]int, 0, limit)
  364. match, posMap, err := r.findUnicodeCached(s, start, false)
  365. if err != nil {
  366. return nil
  367. }
  368. for match != nil {
  369. groups := match.Groups()
  370. result := make([]int, 0, len(groups)<<1)
  371. for _, group := range groups {
  372. if len(group.Captures) > 0 {
  373. start := posMap[group.Index]
  374. end := posMap[group.Index+group.Length]
  375. result = append(result, start, end)
  376. } else {
  377. result = append(result, -1, 0)
  378. }
  379. }
  380. if sticky && len(result) > 1 {
  381. if result[0] != start {
  382. break
  383. }
  384. start = result[1]
  385. }
  386. results = append(results, result)
  387. match, err = wrapped.FindNextMatch(match)
  388. if err != nil {
  389. return nil
  390. }
  391. }
  392. return results
  393. }
  394. func (r *regexp2Wrapper) findAllSubmatchIndex(s valueString, start, limit int, sticky, fullUnicode bool) [][]int {
  395. switch s := s.(type) {
  396. case asciiString:
  397. return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
  398. case unicodeString:
  399. if fullUnicode {
  400. return r.findAllSubmatchIndexUnicode(s, start, limit, sticky)
  401. }
  402. return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
  403. default:
  404. panic("Unsupported string type")
  405. }
  406. }
  407. func (r *regexp2Wrapper) clone() *regexp2Wrapper {
  408. return &regexp2Wrapper{
  409. rx: r.rx,
  410. }
  411. }
  412. func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
  413. wrapped := (*regexp.Regexp)(r)
  414. results = wrapped.FindAllStringSubmatchIndex(s, limit)
  415. pos := 0
  416. if sticky {
  417. for i, result := range results {
  418. if len(result) > 1 {
  419. if result[0] != pos {
  420. return results[:i]
  421. }
  422. pos = result[1]
  423. }
  424. }
  425. }
  426. return
  427. }
  428. func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) []int {
  429. switch s := s.(type) {
  430. case asciiString:
  431. return r.findSubmatchIndexASCII(string(s))
  432. case unicodeString:
  433. return r.findSubmatchIndexUnicode(s, fullUnicode)
  434. default:
  435. panic("Unsupported string type")
  436. }
  437. }
  438. func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
  439. wrapped := (*regexp.Regexp)(r)
  440. return wrapped.FindStringSubmatchIndex(s)
  441. }
  442. func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
  443. wrapped := (*regexp.Regexp)(r)
  444. if fullUnicode {
  445. posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)
  446. res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
  447. for i, item := range res {
  448. if item >= 0 {
  449. res[i] = posMap[item]
  450. }
  451. }
  452. return res
  453. }
  454. return wrapped.FindReaderSubmatchIndex(s.utf16Reader(0))
  455. }
  456. func (r *regexpWrapper) clone() *regexpWrapper {
  457. return r
  458. }
  459. func (r *regexpObject) execResultToArray(target valueString, result []int) Value {
  460. captureCount := len(result) >> 1
  461. valueArray := make([]Value, captureCount)
  462. matchIndex := result[0]
  463. valueArray[0] = target.substring(result[0], result[1])
  464. lowerBound := 0
  465. for index := 1; index < captureCount; index++ {
  466. offset := index << 1
  467. if result[offset] >= 0 && result[offset+1] >= lowerBound {
  468. valueArray[index] = target.substring(result[offset], result[offset+1])
  469. lowerBound = result[offset]
  470. } else {
  471. valueArray[index] = _undefined
  472. }
  473. }
  474. match := r.val.runtime.newArrayValues(valueArray)
  475. match.self.setOwnStr("input", target, false)
  476. match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
  477. return match
  478. }
  479. func (r *regexpObject) getLastIndex() int64 {
  480. lastIndex := toLength(r.getStr("lastIndex", nil))
  481. if !r.pattern.global && !r.pattern.sticky {
  482. return 0
  483. }
  484. return lastIndex
  485. }
  486. func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
  487. if r.pattern.sticky {
  488. if firstResult == nil || int64(firstResult[0]) != index {
  489. r.setOwnStr("lastIndex", intToValue(0), true)
  490. return false
  491. }
  492. } else {
  493. if firstResult == nil {
  494. if r.pattern.global {
  495. r.setOwnStr("lastIndex", intToValue(0), true)
  496. }
  497. return false
  498. }
  499. }
  500. if r.pattern.global || r.pattern.sticky {
  501. r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
  502. }
  503. return true
  504. }
  505. func (r *regexpObject) execRegexp(target valueString) (match bool, result []int) {
  506. index := r.getLastIndex()
  507. if index >= 0 && index <= int64(target.length()) {
  508. result = r.pattern.findSubmatchIndex(target, int(index))
  509. }
  510. match = r.updateLastIndex(index, result, result)
  511. return
  512. }
  513. func (r *regexpObject) exec(target valueString) Value {
  514. match, result := r.execRegexp(target)
  515. if match {
  516. return r.execResultToArray(target, result)
  517. }
  518. return _null
  519. }
  520. func (r *regexpObject) test(target valueString) bool {
  521. match, _ := r.execRegexp(target)
  522. return match
  523. }
  524. func (r *regexpObject) clone() *regexpObject {
  525. r1 := r.val.runtime.newRegexpObject(r.prototype)
  526. r1.source = r.source
  527. r1.pattern = r.pattern
  528. return r1
  529. }
  530. func (r *regexpObject) init() {
  531. r.baseObject.init()
  532. r.standard = true
  533. r._putProp("lastIndex", intToValue(0), true, false, false)
  534. }
  535. func (r *regexpObject) setProto(proto *Object, throw bool) bool {
  536. res := r.baseObject.setProto(proto, throw)
  537. if res {
  538. r.standard = false
  539. }
  540. return res
  541. }
  542. func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
  543. res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
  544. if res {
  545. r.standard = false
  546. }
  547. return res
  548. }
  549. func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
  550. res := r.baseObject.defineOwnPropertySym(name, desc, throw)
  551. if res && r.standard {
  552. switch name {
  553. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  554. r.standard = false
  555. }
  556. }
  557. return res
  558. }
  559. func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
  560. res := r.baseObject.deleteStr(name, throw)
  561. if res {
  562. r.standard = false
  563. }
  564. return res
  565. }
  566. func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
  567. res := r.baseObject.setOwnStr(name, value, throw)
  568. if res && r.standard && name == "exec" {
  569. r.standard = false
  570. }
  571. return res
  572. }
  573. func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
  574. res := r.baseObject.setOwnSym(name, value, throw)
  575. if res && r.standard {
  576. switch name {
  577. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  578. r.standard = false
  579. }
  580. }
  581. return res
  582. }