regexp.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597
  1. package goja
  2. import (
  3. "fmt"
  4. "github.com/dlclark/regexp2"
  5. "github.com/dop251/goja/unistring"
  6. "io"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode/utf16"
  11. )
  12. type regexp2MatchCache struct {
  13. target valueString
  14. runes []rune
  15. posMap []int
  16. }
  17. type regexp2Wrapper struct {
  18. rx *regexp2.Regexp
  19. cache *regexp2MatchCache
  20. }
  21. type regexpWrapper regexp.Regexp
  22. type positionMapItem struct {
  23. src, dst int
  24. }
  25. type positionMap []positionMapItem
  26. func (m positionMap) get(src int) int {
  27. if src == 0 {
  28. return 0
  29. }
  30. res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
  31. if res >= len(m) || m[res].src != src {
  32. panic("index not found")
  33. }
  34. return m[res].dst
  35. }
  36. type arrayRuneReader struct {
  37. runes []rune
  38. pos int
  39. }
  40. func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
  41. if rd.pos < len(rd.runes) {
  42. r = rd.runes[rd.pos]
  43. size = 1
  44. rd.pos++
  45. } else {
  46. err = io.EOF
  47. }
  48. return
  49. }
  50. type regexpPattern struct {
  51. src string
  52. global, ignoreCase, multiline, sticky, unicode bool
  53. regexpWrapper *regexpWrapper
  54. regexp2Wrapper *regexp2Wrapper
  55. }
  56. func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
  57. var opts regexp2.RegexOptions = regexp2.ECMAScript
  58. if multiline {
  59. opts |= regexp2.Multiline
  60. }
  61. if ignoreCase {
  62. opts |= regexp2.IgnoreCase
  63. }
  64. regexp2Pattern, err1 := regexp2.Compile(src, opts)
  65. if err1 != nil {
  66. return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
  67. }
  68. return &regexp2Wrapper{rx: regexp2Pattern}, nil
  69. }
  70. func (p *regexpPattern) createRegexp2() {
  71. if p.regexp2Wrapper != nil {
  72. return
  73. }
  74. rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
  75. if err != nil {
  76. // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
  77. panic(err)
  78. }
  79. p.regexp2Wrapper = rx
  80. }
  81. func buildUTF8PosMap(s valueString) (positionMap, string) {
  82. pm := make(positionMap, 0, s.length())
  83. rd := s.reader(0)
  84. sPos, utf8Pos := 0, 0
  85. var sb strings.Builder
  86. for {
  87. r, size, err := rd.ReadRune()
  88. if err == io.EOF {
  89. break
  90. }
  91. if err != nil {
  92. // the string contains invalid UTF-16, bailing out
  93. return nil, ""
  94. }
  95. utf8Size, _ := sb.WriteRune(r)
  96. sPos += size
  97. utf8Pos += utf8Size
  98. pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
  99. }
  100. return pm, sb.String()
  101. }
  102. func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int {
  103. if p.regexpWrapper == nil {
  104. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  105. }
  106. if start != 0 {
  107. // Unfortunately Go's regexp library does not allow starting from an arbitrary position.
  108. // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
  109. // work correctly.
  110. p.createRegexp2()
  111. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
  112. }
  113. return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  114. }
  115. func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int, sticky bool) [][]int {
  116. if p.regexpWrapper == nil {
  117. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  118. }
  119. if start == 0 {
  120. if s, ok := s.(asciiString); ok {
  121. return p.regexpWrapper.findAllSubmatchIndex(s.String(), limit, sticky)
  122. }
  123. if limit == 1 {
  124. result := p.regexpWrapper.findSubmatchIndexUnicode(s.(unicodeString), p.unicode)
  125. if result == nil {
  126. return nil
  127. }
  128. return [][]int{result}
  129. }
  130. // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
  131. // input.
  132. if p.unicode {
  133. // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
  134. pm, str := buildUTF8PosMap(s)
  135. if pm != nil {
  136. res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
  137. for _, result := range res {
  138. for i, idx := range result {
  139. result[i] = pm.get(idx)
  140. }
  141. }
  142. return res
  143. }
  144. }
  145. }
  146. p.createRegexp2()
  147. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  148. }
// regexpObject is the object wrapper around a compiled regexpPattern.
type regexpObject struct {
	baseObject
	// pattern is the compiled expression; clone() shares it with the copy.
	pattern *regexpPattern
	// source is copied unchanged to the new object by clone().
	source valueString
	// standard is set by init() and cleared as soon as the prototype is
	// replaced, an own property is defined or deleted, or "exec" is assigned.
	// NOTE(review): presumably used elsewhere to detect unmodified objects —
	// confirm against the callers.
	standard bool
}
  155. func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode, doCache bool) (result []int) {
  156. if fullUnicode {
  157. return r.findSubmatchIndexUnicode(s, start, doCache)
  158. }
  159. return r.findSubmatchIndexUTF16(s, start, doCache)
  160. }
  161. func (r *regexp2Wrapper) findUTF16Cached(s valueString, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
  162. wrapped := r.rx
  163. cache := r.cache
  164. if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
  165. runes = cache.runes
  166. } else {
  167. runes = s.utf16Runes()
  168. cache = nil
  169. }
  170. match, err = wrapped.FindRunesMatchStartingAt(runes, start)
  171. if doCache && match != nil && err == nil {
  172. if cache == nil {
  173. if r.cache == nil {
  174. r.cache = new(regexp2MatchCache)
  175. }
  176. *r.cache = regexp2MatchCache{
  177. target: s,
  178. runes: runes,
  179. }
  180. }
  181. } else {
  182. r.cache = nil
  183. }
  184. return
  185. }
  186. func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int, doCache bool) (result []int) {
  187. match, _, err := r.findUTF16Cached(s, start, doCache)
  188. if err != nil {
  189. return
  190. }
  191. if match == nil {
  192. return
  193. }
  194. groups := match.Groups()
  195. result = make([]int, 0, len(groups)<<1)
  196. for _, group := range groups {
  197. if len(group.Captures) > 0 {
  198. result = append(result, group.Index, group.Index+group.Length)
  199. } else {
  200. result = append(result, -1, 0)
  201. }
  202. }
  203. return
  204. }
  205. func (r *regexp2Wrapper) findUnicodeCached(s valueString, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
  206. var (
  207. runes []rune
  208. mappedStart int
  209. splitPair bool
  210. savedRune rune
  211. )
  212. wrapped := r.rx
  213. cache := r.cache
  214. if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
  215. runes, posMap = cache.runes, cache.posMap
  216. mappedStart, splitPair = posMapReverseLookup(posMap, start)
  217. } else {
  218. posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
  219. cache = nil
  220. }
  221. if splitPair {
  222. // temporarily set the rune at mappedStart to the second code point of the pair
  223. _, second := utf16.EncodeRune(runes[mappedStart])
  224. savedRune, runes[mappedStart] = runes[mappedStart], second
  225. }
  226. match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
  227. if doCache && match != nil && err == nil {
  228. if splitPair {
  229. runes[mappedStart] = savedRune
  230. }
  231. if cache == nil {
  232. if r.cache == nil {
  233. r.cache = new(regexp2MatchCache)
  234. }
  235. *r.cache = regexp2MatchCache{
  236. target: s,
  237. runes: runes,
  238. posMap: posMap,
  239. }
  240. }
  241. } else {
  242. r.cache = nil
  243. }
  244. return
  245. }
  246. func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int, doCache bool) (result []int) {
  247. match, posMap, err := r.findUnicodeCached(s, start, doCache)
  248. if match == nil || err != nil {
  249. return
  250. }
  251. groups := match.Groups()
  252. result = make([]int, 0, len(groups)<<1)
  253. for _, group := range groups {
  254. if len(group.Captures) > 0 {
  255. result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
  256. } else {
  257. result = append(result, -1, 0)
  258. }
  259. }
  260. return
  261. }
  262. func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int {
  263. wrapped := r.rx
  264. match, runes, err := r.findUTF16Cached(s, start, false)
  265. if match == nil || err != nil {
  266. return nil
  267. }
  268. if limit < 0 {
  269. limit = len(runes) + 1
  270. }
  271. results := make([][]int, 0, limit)
  272. for match != nil {
  273. groups := match.Groups()
  274. result := make([]int, 0, len(groups)<<1)
  275. for _, group := range groups {
  276. if len(group.Captures) > 0 {
  277. startPos := group.Index
  278. endPos := group.Index + group.Length
  279. result = append(result, startPos, endPos)
  280. } else {
  281. result = append(result, -1, 0)
  282. }
  283. }
  284. if sticky && len(result) > 1 {
  285. if result[0] != start {
  286. break
  287. }
  288. start = result[1]
  289. }
  290. results = append(results, result)
  291. limit--
  292. if limit <= 0 {
  293. break
  294. }
  295. match, err = wrapped.FindNextMatch(match)
  296. if err != nil {
  297. return nil
  298. }
  299. }
  300. return results
  301. }
  302. func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
  303. posMap = make([]int, 0, l+1)
  304. curPos := 0
  305. runes = make([]rune, 0, l)
  306. startFound := false
  307. for {
  308. if !startFound {
  309. if curPos == start {
  310. mappedStart = len(runes)
  311. startFound = true
  312. }
  313. if curPos > start {
  314. // start position splits a surrogate pair
  315. mappedStart = len(runes) - 1
  316. splitPair = true
  317. startFound = true
  318. }
  319. }
  320. rn, size, err := rd.ReadRune()
  321. if err != nil {
  322. break
  323. }
  324. runes = append(runes, rn)
  325. posMap = append(posMap, curPos)
  326. curPos += size
  327. }
  328. posMap = append(posMap, curPos)
  329. return
  330. }
  331. func posMapReverseLookup(posMap []int, pos int) (int, bool) {
  332. mapped := sort.SearchInts(posMap, pos)
  333. if mapped < len(posMap) && posMap[mapped] != pos {
  334. return mapped - 1, true
  335. }
  336. return mapped, false
  337. }
  338. func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
  339. wrapped := r.rx
  340. if limit < 0 {
  341. limit = len(s) + 1
  342. }
  343. results := make([][]int, 0, limit)
  344. match, posMap, err := r.findUnicodeCached(s, start, false)
  345. if err != nil {
  346. return nil
  347. }
  348. for match != nil {
  349. groups := match.Groups()
  350. result := make([]int, 0, len(groups)<<1)
  351. for _, group := range groups {
  352. if len(group.Captures) > 0 {
  353. start := posMap[group.Index]
  354. end := posMap[group.Index+group.Length]
  355. result = append(result, start, end)
  356. } else {
  357. result = append(result, -1, 0)
  358. }
  359. }
  360. if sticky && len(result) > 1 {
  361. if result[0] != start {
  362. break
  363. }
  364. start = result[1]
  365. }
  366. results = append(results, result)
  367. match, err = wrapped.FindNextMatch(match)
  368. if err != nil {
  369. return nil
  370. }
  371. }
  372. return results
  373. }
  374. func (r *regexp2Wrapper) findAllSubmatchIndex(s valueString, start, limit int, sticky, fullUnicode bool) [][]int {
  375. switch s := s.(type) {
  376. case asciiString:
  377. return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
  378. case unicodeString:
  379. if fullUnicode {
  380. return r.findAllSubmatchIndexUnicode(s, start, limit, sticky)
  381. }
  382. return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
  383. default:
  384. panic("Unsupported string type")
  385. }
  386. }
  387. func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
  388. wrapped := (*regexp.Regexp)(r)
  389. results = wrapped.FindAllStringSubmatchIndex(s, limit)
  390. pos := 0
  391. if sticky {
  392. for i, result := range results {
  393. if len(result) > 1 {
  394. if result[0] != pos {
  395. return results[:i]
  396. }
  397. pos = result[1]
  398. }
  399. }
  400. }
  401. return
  402. }
  403. func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) []int {
  404. switch s := s.(type) {
  405. case asciiString:
  406. return r.findSubmatchIndexASCII(string(s))
  407. case unicodeString:
  408. return r.findSubmatchIndexUnicode(s, fullUnicode)
  409. default:
  410. panic("Unsupported string type")
  411. }
  412. }
  413. func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
  414. wrapped := (*regexp.Regexp)(r)
  415. return wrapped.FindStringSubmatchIndex(s)
  416. }
  417. func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
  418. wrapped := (*regexp.Regexp)(r)
  419. if fullUnicode {
  420. posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)
  421. res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
  422. for i, item := range res {
  423. res[i] = posMap[item]
  424. }
  425. return res
  426. }
  427. return wrapped.FindReaderSubmatchIndex(s.utf16Reader(0))
  428. }
  429. func (r *regexpObject) execResultToArray(target valueString, result []int) Value {
  430. captureCount := len(result) >> 1
  431. valueArray := make([]Value, captureCount)
  432. matchIndex := result[0]
  433. lowerBound := matchIndex
  434. for index := 0; index < captureCount; index++ {
  435. offset := index << 1
  436. if result[offset] >= lowerBound {
  437. valueArray[index] = target.substring(result[offset], result[offset+1])
  438. lowerBound = result[offset]
  439. } else {
  440. valueArray[index] = _undefined
  441. }
  442. }
  443. match := r.val.runtime.newArrayValues(valueArray)
  444. match.self.setOwnStr("input", target, false)
  445. match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
  446. return match
  447. }
  448. func (r *regexpObject) getLastIndex() int64 {
  449. lastIndex := toLength(r.getStr("lastIndex", nil))
  450. if !r.pattern.global && !r.pattern.sticky {
  451. return 0
  452. }
  453. return lastIndex
  454. }
  455. func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
  456. if r.pattern.sticky {
  457. if firstResult == nil || int64(firstResult[0]) != index {
  458. r.setOwnStr("lastIndex", intToValue(0), true)
  459. return false
  460. }
  461. } else {
  462. if firstResult == nil {
  463. if r.pattern.global {
  464. r.setOwnStr("lastIndex", intToValue(0), true)
  465. }
  466. return false
  467. }
  468. }
  469. if r.pattern.global || r.pattern.sticky {
  470. r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
  471. }
  472. return true
  473. }
  474. func (r *regexpObject) execRegexp(target valueString) (match bool, result []int) {
  475. index := r.getLastIndex()
  476. if index >= 0 && index <= int64(target.length()) {
  477. result = r.pattern.findSubmatchIndex(target, int(index))
  478. }
  479. match = r.updateLastIndex(index, result, result)
  480. return
  481. }
  482. func (r *regexpObject) exec(target valueString) Value {
  483. match, result := r.execRegexp(target)
  484. if match {
  485. return r.execResultToArray(target, result)
  486. }
  487. return _null
  488. }
  489. func (r *regexpObject) test(target valueString) bool {
  490. match, _ := r.execRegexp(target)
  491. return match
  492. }
  493. func (r *regexpObject) clone() *Object {
  494. r1 := r.val.runtime.newRegexpObject(r.prototype)
  495. r1.source = r.source
  496. r1.pattern = r.pattern
  497. return r1.val
  498. }
// init initialises the embedded baseObject, marks the object as standard and
// installs the "lastIndex" property with an initial value of 0.
func (r *regexpObject) init() {
	r.baseObject.init()
	r.standard = true
	// NOTE(review): the three flags are presumably writable=true,
	// enumerable=false, configurable=false — confirm against _putProp.
	r._putProp("lastIndex", intToValue(0), true, false, false)
}
  504. func (r *regexpObject) setProto(proto *Object, throw bool) bool {
  505. res := r.baseObject.setProto(proto, throw)
  506. if res {
  507. r.standard = false
  508. }
  509. return res
  510. }
  511. func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
  512. res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
  513. if res {
  514. r.standard = false
  515. }
  516. return res
  517. }
  518. func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
  519. res := r.baseObject.deleteStr(name, throw)
  520. if res {
  521. r.standard = false
  522. }
  523. return res
  524. }
  525. func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
  526. if r.standard {
  527. if name == "exec" {
  528. res := r.baseObject.setOwnStr(name, value, throw)
  529. if res {
  530. r.standard = false
  531. }
  532. return res
  533. }
  534. }
  535. return r.baseObject.setOwnStr(name, value, throw)
  536. }