regexp.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647
  1. package goja
  2. import (
  3. "fmt"
  4. "github.com/dlclark/regexp2"
  5. "github.com/dop251/goja/unistring"
  6. "io"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode/utf16"
  11. )
  12. type regexp2MatchCache struct {
  13. target String
  14. runes []rune
  15. posMap []int
  16. }
  17. // Not goroutine-safe. Use regexp2Wrapper.clone()
  18. type regexp2Wrapper struct {
  19. rx *regexp2.Regexp
  20. cache *regexp2MatchCache
  21. }
  22. type regexpWrapper regexp.Regexp
  23. type positionMapItem struct {
  24. src, dst int
  25. }
  26. type positionMap []positionMapItem
  27. func (m positionMap) get(src int) int {
  28. if src <= 0 {
  29. return src
  30. }
  31. res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
  32. if res >= len(m) || m[res].src != src {
  33. panic("index not found")
  34. }
  35. return m[res].dst
  36. }
  37. type arrayRuneReader struct {
  38. runes []rune
  39. pos int
  40. }
  41. func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
  42. if rd.pos < len(rd.runes) {
  43. r = rd.runes[rd.pos]
  44. size = 1
  45. rd.pos++
  46. } else {
  47. err = io.EOF
  48. }
  49. return
  50. }
  51. // Not goroutine-safe. Use regexpPattern.clone()
  52. type regexpPattern struct {
  53. src string
  54. global, ignoreCase, multiline, dotAll, sticky, unicode bool
  55. regexpWrapper *regexpWrapper
  56. regexp2Wrapper *regexp2Wrapper
  57. }
  58. func compileRegexp2(src string, multiline, dotAll, ignoreCase bool) (*regexp2Wrapper, error) {
  59. var opts regexp2.RegexOptions = regexp2.ECMAScript
  60. if multiline {
  61. opts |= regexp2.Multiline
  62. }
  63. if dotAll {
  64. opts |= regexp2.Singleline
  65. }
  66. if ignoreCase {
  67. opts |= regexp2.IgnoreCase
  68. }
  69. regexp2Pattern, err1 := regexp2.Compile(src, opts)
  70. if err1 != nil {
  71. return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
  72. }
  73. return &regexp2Wrapper{rx: regexp2Pattern}, nil
  74. }
  75. func (p *regexpPattern) createRegexp2() {
  76. if p.regexp2Wrapper != nil {
  77. return
  78. }
  79. rx, err := compileRegexp2(p.src, p.multiline, p.dotAll, p.ignoreCase)
  80. if err != nil {
  81. // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
  82. panic(err)
  83. }
  84. p.regexp2Wrapper = rx
  85. }
  86. func buildUTF8PosMap(s unicodeString) (positionMap, string) {
  87. pm := make(positionMap, 0, s.Length())
  88. rd := s.Reader()
  89. sPos, utf8Pos := 0, 0
  90. var sb strings.Builder
  91. for {
  92. r, size, err := rd.ReadRune()
  93. if err == io.EOF {
  94. break
  95. }
  96. if err != nil {
  97. // the string contains invalid UTF-16, bailing out
  98. return nil, ""
  99. }
  100. utf8Size, _ := sb.WriteRune(r)
  101. sPos += size
  102. utf8Pos += utf8Size
  103. pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
  104. }
  105. return pm, sb.String()
  106. }
  107. func (p *regexpPattern) findSubmatchIndex(s String, start int) []int {
  108. if p.regexpWrapper == nil {
  109. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  110. }
  111. if start != 0 {
  112. // Unfortunately Go's regexp library does not allow starting from an arbitrary position.
  113. // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
  114. // work correctly.
  115. p.createRegexp2()
  116. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  117. }
  118. return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  119. }
  120. func (p *regexpPattern) findAllSubmatchIndex(s String, start int, limit int, sticky bool) [][]int {
  121. if p.regexpWrapper == nil {
  122. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  123. }
  124. if start == 0 {
  125. a, u := devirtualizeString(s)
  126. if u == nil {
  127. return p.regexpWrapper.findAllSubmatchIndex(string(a), limit, sticky)
  128. }
  129. if limit == 1 {
  130. result := p.regexpWrapper.findSubmatchIndexUnicode(u, p.unicode)
  131. if result == nil {
  132. return nil
  133. }
  134. return [][]int{result}
  135. }
  136. // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
  137. // input.
  138. if p.unicode {
  139. // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
  140. pm, str := buildUTF8PosMap(u)
  141. if pm != nil {
  142. res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
  143. for _, result := range res {
  144. for i, idx := range result {
  145. result[i] = pm.get(idx)
  146. }
  147. }
  148. return res
  149. }
  150. }
  151. }
  152. p.createRegexp2()
  153. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  154. }
  155. // clone creates a copy of the regexpPattern which can be used concurrently.
  156. func (p *regexpPattern) clone() *regexpPattern {
  157. ret := &regexpPattern{
  158. src: p.src,
  159. global: p.global,
  160. ignoreCase: p.ignoreCase,
  161. multiline: p.multiline,
  162. dotAll: p.dotAll,
  163. sticky: p.sticky,
  164. unicode: p.unicode,
  165. }
  166. if p.regexpWrapper != nil {
  167. ret.regexpWrapper = p.regexpWrapper.clone()
  168. }
  169. if p.regexp2Wrapper != nil {
  170. ret.regexp2Wrapper = p.regexp2Wrapper.clone()
  171. }
  172. return ret
  173. }
  174. type regexpObject struct {
  175. baseObject
  176. pattern *regexpPattern
  177. source String
  178. standard bool
  179. }
  180. func (r *regexp2Wrapper) findSubmatchIndex(s String, start int, fullUnicode, doCache bool) (result []int) {
  181. if fullUnicode {
  182. return r.findSubmatchIndexUnicode(s, start, doCache)
  183. }
  184. return r.findSubmatchIndexUTF16(s, start, doCache)
  185. }
  186. func (r *regexp2Wrapper) findUTF16Cached(s String, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
  187. wrapped := r.rx
  188. cache := r.cache
  189. if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
  190. runes = cache.runes
  191. } else {
  192. runes = s.utf16Runes()
  193. cache = nil
  194. }
  195. match, err = wrapped.FindRunesMatchStartingAt(runes, start)
  196. if doCache && match != nil && err == nil {
  197. if cache == nil {
  198. if r.cache == nil {
  199. r.cache = new(regexp2MatchCache)
  200. }
  201. *r.cache = regexp2MatchCache{
  202. target: s,
  203. runes: runes,
  204. }
  205. }
  206. } else {
  207. r.cache = nil
  208. }
  209. return
  210. }
  211. func (r *regexp2Wrapper) findSubmatchIndexUTF16(s String, start int, doCache bool) (result []int) {
  212. match, _, err := r.findUTF16Cached(s, start, doCache)
  213. if err != nil {
  214. return
  215. }
  216. if match == nil {
  217. return
  218. }
  219. groups := match.Groups()
  220. result = make([]int, 0, len(groups)<<1)
  221. for _, group := range groups {
  222. if len(group.Captures) > 0 {
  223. result = append(result, group.Index, group.Index+group.Length)
  224. } else {
  225. result = append(result, -1, 0)
  226. }
  227. }
  228. return
  229. }
  230. func (r *regexp2Wrapper) findUnicodeCached(s String, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
  231. var (
  232. runes []rune
  233. mappedStart int
  234. splitPair bool
  235. savedRune rune
  236. )
  237. wrapped := r.rx
  238. cache := r.cache
  239. if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
  240. runes, posMap = cache.runes, cache.posMap
  241. mappedStart, splitPair = posMapReverseLookup(posMap, start)
  242. } else {
  243. posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), start)
  244. cache = nil
  245. }
  246. if splitPair {
  247. // temporarily set the rune at mappedStart to the second code point of the pair
  248. _, second := utf16.EncodeRune(runes[mappedStart])
  249. savedRune, runes[mappedStart] = runes[mappedStart], second
  250. }
  251. match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
  252. if doCache && match != nil && err == nil {
  253. if splitPair {
  254. runes[mappedStart] = savedRune
  255. }
  256. if cache == nil {
  257. if r.cache == nil {
  258. r.cache = new(regexp2MatchCache)
  259. }
  260. *r.cache = regexp2MatchCache{
  261. target: s,
  262. runes: runes,
  263. posMap: posMap,
  264. }
  265. }
  266. } else {
  267. r.cache = nil
  268. }
  269. return
  270. }
  271. func (r *regexp2Wrapper) findSubmatchIndexUnicode(s String, start int, doCache bool) (result []int) {
  272. match, posMap, err := r.findUnicodeCached(s, start, doCache)
  273. if match == nil || err != nil {
  274. return
  275. }
  276. groups := match.Groups()
  277. result = make([]int, 0, len(groups)<<1)
  278. for _, group := range groups {
  279. if len(group.Captures) > 0 {
  280. result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
  281. } else {
  282. result = append(result, -1, 0)
  283. }
  284. }
  285. return
  286. }
  287. func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s String, start, limit int, sticky bool) [][]int {
  288. wrapped := r.rx
  289. match, runes, err := r.findUTF16Cached(s, start, false)
  290. if match == nil || err != nil {
  291. return nil
  292. }
  293. if limit < 0 {
  294. limit = len(runes) + 1
  295. }
  296. results := make([][]int, 0, limit)
  297. for match != nil {
  298. groups := match.Groups()
  299. result := make([]int, 0, len(groups)<<1)
  300. for _, group := range groups {
  301. if len(group.Captures) > 0 {
  302. startPos := group.Index
  303. endPos := group.Index + group.Length
  304. result = append(result, startPos, endPos)
  305. } else {
  306. result = append(result, -1, 0)
  307. }
  308. }
  309. if sticky && len(result) > 1 {
  310. if result[0] != start {
  311. break
  312. }
  313. start = result[1]
  314. }
  315. results = append(results, result)
  316. limit--
  317. if limit <= 0 {
  318. break
  319. }
  320. match, err = wrapped.FindNextMatch(match)
  321. if err != nil {
  322. return nil
  323. }
  324. }
  325. return results
  326. }
  327. func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
  328. posMap = make([]int, 0, l+1)
  329. curPos := 0
  330. runes = make([]rune, 0, l)
  331. startFound := false
  332. for {
  333. if !startFound {
  334. if curPos == start {
  335. mappedStart = len(runes)
  336. startFound = true
  337. }
  338. if curPos > start {
  339. // start position splits a surrogate pair
  340. mappedStart = len(runes) - 1
  341. splitPair = true
  342. startFound = true
  343. }
  344. }
  345. rn, size, err := rd.ReadRune()
  346. if err != nil {
  347. break
  348. }
  349. runes = append(runes, rn)
  350. posMap = append(posMap, curPos)
  351. curPos += size
  352. }
  353. posMap = append(posMap, curPos)
  354. return
  355. }
  356. func posMapReverseLookup(posMap []int, pos int) (int, bool) {
  357. mapped := sort.SearchInts(posMap, pos)
  358. if mapped < len(posMap) && posMap[mapped] != pos {
  359. return mapped - 1, true
  360. }
  361. return mapped, false
  362. }
  363. func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
  364. wrapped := r.rx
  365. if limit < 0 {
  366. limit = len(s) + 1
  367. }
  368. results := make([][]int, 0, limit)
  369. match, posMap, err := r.findUnicodeCached(s, start, false)
  370. if err != nil {
  371. return nil
  372. }
  373. for match != nil {
  374. groups := match.Groups()
  375. result := make([]int, 0, len(groups)<<1)
  376. for _, group := range groups {
  377. if len(group.Captures) > 0 {
  378. start := posMap[group.Index]
  379. end := posMap[group.Index+group.Length]
  380. result = append(result, start, end)
  381. } else {
  382. result = append(result, -1, 0)
  383. }
  384. }
  385. if sticky && len(result) > 1 {
  386. if result[0] != start {
  387. break
  388. }
  389. start = result[1]
  390. }
  391. results = append(results, result)
  392. match, err = wrapped.FindNextMatch(match)
  393. if err != nil {
  394. return nil
  395. }
  396. }
  397. return results
  398. }
  399. func (r *regexp2Wrapper) findAllSubmatchIndex(s String, start, limit int, sticky, fullUnicode bool) [][]int {
  400. a, u := devirtualizeString(s)
  401. if u != nil {
  402. if fullUnicode {
  403. return r.findAllSubmatchIndexUnicode(u, start, limit, sticky)
  404. }
  405. return r.findAllSubmatchIndexUTF16(u, start, limit, sticky)
  406. }
  407. return r.findAllSubmatchIndexUTF16(a, start, limit, sticky)
  408. }
  409. func (r *regexp2Wrapper) clone() *regexp2Wrapper {
  410. return &regexp2Wrapper{
  411. rx: r.rx,
  412. }
  413. }
  414. func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
  415. wrapped := (*regexp.Regexp)(r)
  416. results = wrapped.FindAllStringSubmatchIndex(s, limit)
  417. pos := 0
  418. if sticky {
  419. for i, result := range results {
  420. if len(result) > 1 {
  421. if result[0] != pos {
  422. return results[:i]
  423. }
  424. pos = result[1]
  425. }
  426. }
  427. }
  428. return
  429. }
  430. func (r *regexpWrapper) findSubmatchIndex(s String, fullUnicode bool) []int {
  431. a, u := devirtualizeString(s)
  432. if u != nil {
  433. return r.findSubmatchIndexUnicode(u, fullUnicode)
  434. }
  435. return r.findSubmatchIndexASCII(string(a))
  436. }
  437. func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
  438. wrapped := (*regexp.Regexp)(r)
  439. return wrapped.FindStringSubmatchIndex(s)
  440. }
  441. func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
  442. wrapped := (*regexp.Regexp)(r)
  443. if fullUnicode {
  444. posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader()}, s.Length(), 0)
  445. res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
  446. for i, item := range res {
  447. if item >= 0 {
  448. res[i] = posMap[item]
  449. }
  450. }
  451. return res
  452. }
  453. return wrapped.FindReaderSubmatchIndex(s.utf16RuneReader())
  454. }
  455. func (r *regexpWrapper) clone() *regexpWrapper {
  456. return r
  457. }
  458. func (r *regexpObject) execResultToArray(target String, result []int) Value {
  459. captureCount := len(result) >> 1
  460. valueArray := make([]Value, captureCount)
  461. matchIndex := result[0]
  462. valueArray[0] = target.Substring(result[0], result[1])
  463. lowerBound := 0
  464. for index := 1; index < captureCount; index++ {
  465. offset := index << 1
  466. if result[offset] >= 0 && result[offset+1] >= lowerBound {
  467. valueArray[index] = target.Substring(result[offset], result[offset+1])
  468. lowerBound = result[offset]
  469. } else {
  470. valueArray[index] = _undefined
  471. }
  472. }
  473. match := r.val.runtime.newArrayValues(valueArray)
  474. match.self.setOwnStr("input", target, false)
  475. match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
  476. return match
  477. }
  478. func (r *regexpObject) getLastIndex() int64 {
  479. lastIndex := toLength(r.getStr("lastIndex", nil))
  480. if !r.pattern.global && !r.pattern.sticky {
  481. return 0
  482. }
  483. return lastIndex
  484. }
  485. func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
  486. if r.pattern.sticky {
  487. if firstResult == nil || int64(firstResult[0]) != index {
  488. r.setOwnStr("lastIndex", intToValue(0), true)
  489. return false
  490. }
  491. } else {
  492. if firstResult == nil {
  493. if r.pattern.global {
  494. r.setOwnStr("lastIndex", intToValue(0), true)
  495. }
  496. return false
  497. }
  498. }
  499. if r.pattern.global || r.pattern.sticky {
  500. r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
  501. }
  502. return true
  503. }
  504. func (r *regexpObject) execRegexp(target String) (match bool, result []int) {
  505. index := r.getLastIndex()
  506. if index >= 0 && index <= int64(target.Length()) {
  507. result = r.pattern.findSubmatchIndex(target, int(index))
  508. }
  509. match = r.updateLastIndex(index, result, result)
  510. return
  511. }
  512. func (r *regexpObject) exec(target String) Value {
  513. match, result := r.execRegexp(target)
  514. if match {
  515. return r.execResultToArray(target, result)
  516. }
  517. return _null
  518. }
  519. func (r *regexpObject) test(target String) bool {
  520. match, _ := r.execRegexp(target)
  521. return match
  522. }
  523. func (r *regexpObject) clone() *regexpObject {
  524. r1 := r.val.runtime.newRegexpObject(r.prototype)
  525. r1.source = r.source
  526. r1.pattern = r.pattern
  527. return r1
  528. }
  529. func (r *regexpObject) init() {
  530. r.baseObject.init()
  531. r.standard = true
  532. r._putProp("lastIndex", intToValue(0), true, false, false)
  533. }
  534. func (r *regexpObject) setProto(proto *Object, throw bool) bool {
  535. res := r.baseObject.setProto(proto, throw)
  536. if res {
  537. r.standard = false
  538. }
  539. return res
  540. }
  541. func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
  542. res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
  543. if res {
  544. r.standard = false
  545. }
  546. return res
  547. }
  548. func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
  549. res := r.baseObject.defineOwnPropertySym(name, desc, throw)
  550. if res && r.standard {
  551. switch name {
  552. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  553. r.standard = false
  554. }
  555. }
  556. return res
  557. }
  558. func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
  559. res := r.baseObject.deleteStr(name, throw)
  560. if res {
  561. r.standard = false
  562. }
  563. return res
  564. }
  565. func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
  566. res := r.baseObject.setOwnStr(name, value, throw)
  567. if res && r.standard && name == "exec" {
  568. r.standard = false
  569. }
  570. return res
  571. }
  572. func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
  573. res := r.baseObject.setOwnSym(name, value, throw)
  574. if res && r.standard {
  575. switch name {
  576. case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
  577. r.standard = false
  578. }
  579. }
  580. return res
  581. }