regexp.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504
  1. package goja
  2. import (
  3. "fmt"
  4. "github.com/dlclark/regexp2"
  5. "github.com/dop251/goja/unistring"
  6. "io"
  7. "regexp"
  8. "sort"
  9. "strings"
  10. "unicode/utf16"
  11. )
  12. type regexp2Wrapper regexp2.Regexp
  13. type regexpWrapper regexp.Regexp
  14. type positionMapItem struct {
  15. src, dst int
  16. }
  17. type positionMap []positionMapItem
  18. func (m positionMap) get(src int) int {
  19. if src == 0 {
  20. return 0
  21. }
  22. res := sort.Search(len(m), func(n int) bool { return m[n].src >= src })
  23. if res >= len(m) || m[res].src != src {
  24. panic("index not found")
  25. }
  26. return m[res].dst
  27. }
  28. type arrayRuneReader struct {
  29. runes []rune
  30. pos int
  31. }
  32. func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
  33. if rd.pos < len(rd.runes) {
  34. r = rd.runes[rd.pos]
  35. size = 1
  36. rd.pos++
  37. } else {
  38. err = io.EOF
  39. }
  40. return
  41. }
  42. type regexpPattern struct {
  43. src string
  44. global, ignoreCase, multiline, sticky, unicode bool
  45. regexpWrapper *regexpWrapper
  46. regexp2Wrapper *regexp2Wrapper
  47. }
  48. func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
  49. var opts regexp2.RegexOptions = regexp2.ECMAScript
  50. if multiline {
  51. opts |= regexp2.Multiline
  52. }
  53. if ignoreCase {
  54. opts |= regexp2.IgnoreCase
  55. }
  56. regexp2Pattern, err1 := regexp2.Compile(src, opts)
  57. if err1 != nil {
  58. return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
  59. }
  60. return (*regexp2Wrapper)(regexp2Pattern), nil
  61. }
  62. func (p *regexpPattern) createRegexp2() {
  63. if p.regexp2Wrapper != nil {
  64. return
  65. }
  66. rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
  67. if err != nil {
  68. // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
  69. panic(err)
  70. }
  71. p.regexp2Wrapper = rx
  72. }
  73. func buildUTF8PosMap(s valueString) (positionMap, string) {
  74. pm := make(positionMap, 0, s.length())
  75. rd := s.reader(0)
  76. sPos, utf8Pos := 0, 0
  77. var sb strings.Builder
  78. for {
  79. r, size, err := rd.ReadRune()
  80. if err == io.EOF {
  81. break
  82. }
  83. if err != nil {
  84. // the string contains invalid UTF-16, bailing out
  85. return nil, ""
  86. }
  87. utf8Size, _ := sb.WriteRune(r)
  88. sPos += size
  89. utf8Pos += utf8Size
  90. pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
  91. }
  92. return pm, sb.String()
  93. }
  94. func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int {
  95. if p.regexpWrapper == nil {
  96. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode)
  97. }
  98. if start != 0 {
  99. // Unfortunately Go's regexp library does not allow starting from an arbitrary position.
  100. // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
  101. // work correctly.
  102. p.createRegexp2()
  103. return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode)
  104. }
  105. return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  106. }
  107. func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int, sticky bool) [][]int {
  108. if p.regexpWrapper == nil {
  109. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  110. }
  111. if start == 0 {
  112. if s, ok := s.(asciiString); ok {
  113. return p.regexpWrapper.findAllSubmatchIndex(s.String(), limit, sticky)
  114. }
  115. if limit == 1 {
  116. result := p.regexpWrapper.findSubmatchIndex(s, p.unicode)
  117. if result == nil {
  118. return nil
  119. }
  120. return [][]int{result}
  121. }
  122. // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
  123. // input.
  124. if p.unicode {
  125. // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
  126. pm, str := buildUTF8PosMap(s)
  127. if pm != nil {
  128. res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
  129. for _, result := range res {
  130. for i, idx := range result {
  131. result[i] = pm.get(idx)
  132. }
  133. }
  134. return res
  135. }
  136. }
  137. }
  138. p.createRegexp2()
  139. return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
  140. }
  141. type regexpObject struct {
  142. baseObject
  143. pattern *regexpPattern
  144. source valueString
  145. standard bool
  146. }
  147. func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode bool) (result []int) {
  148. if fullUnicode {
  149. return r.findSubmatchIndexUnicode(s, start)
  150. }
  151. return r.findSubmatchIndexUTF16(s, start)
  152. }
  153. func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int) (result []int) {
  154. wrapped := (*regexp2.Regexp)(r)
  155. match, err := wrapped.FindRunesMatchStartingAt(s.utf16Runes(), start)
  156. if err != nil {
  157. return
  158. }
  159. if match == nil {
  160. return
  161. }
  162. groups := match.Groups()
  163. result = make([]int, 0, len(groups)<<1)
  164. for _, group := range groups {
  165. if len(group.Captures) > 0 {
  166. result = append(result, group.Index, group.Index+group.Length)
  167. } else {
  168. result = append(result, -1, 0)
  169. }
  170. }
  171. return
  172. }
  173. func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int) (result []int) {
  174. wrapped := (*regexp2.Regexp)(r)
  175. posMap, runes, mappedStart := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
  176. match, err := wrapped.FindRunesMatchStartingAt(runes, mappedStart)
  177. if err != nil {
  178. return
  179. }
  180. if match == nil {
  181. return
  182. }
  183. groups := match.Groups()
  184. result = make([]int, 0, len(groups)<<1)
  185. for _, group := range groups {
  186. if len(group.Captures) > 0 {
  187. result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
  188. } else {
  189. result = append(result, -1, 0)
  190. }
  191. }
  192. return
  193. }
  194. func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int {
  195. wrapped := (*regexp2.Regexp)(r)
  196. runes := s.utf16Runes()
  197. match, err := wrapped.FindRunesMatchStartingAt(runes, start)
  198. if err != nil {
  199. return nil
  200. }
  201. if limit < 0 {
  202. limit = len(runes) + 1
  203. }
  204. results := make([][]int, 0, limit)
  205. for match != nil {
  206. groups := match.Groups()
  207. result := make([]int, 0, len(groups)<<1)
  208. for _, group := range groups {
  209. if len(group.Captures) > 0 {
  210. startPos := group.Index
  211. endPos := group.Index + group.Length
  212. result = append(result, startPos, endPos)
  213. } else {
  214. result = append(result, -1, 0)
  215. }
  216. }
  217. if sticky && len(result) > 1 {
  218. if result[0] != start {
  219. break
  220. }
  221. start = result[1]
  222. }
  223. results = append(results, result)
  224. limit--
  225. if limit <= 0 {
  226. break
  227. }
  228. match, err = wrapped.FindNextMatch(match)
  229. if err != nil {
  230. return nil
  231. }
  232. }
  233. return results
  234. }
  235. func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int) {
  236. posMap = make([]int, 0, l+1)
  237. curPos := 0
  238. runes = make([]rune, 0, l)
  239. startFound := false
  240. for {
  241. if !startFound {
  242. if curPos == start {
  243. mappedStart = len(runes)
  244. startFound = true
  245. }
  246. if curPos > start {
  247. // start position splits a surrogate pair
  248. mappedStart = len(runes) - 1
  249. _, second := utf16.EncodeRune(runes[mappedStart])
  250. runes[mappedStart] = second
  251. startFound = true
  252. }
  253. }
  254. rn, size, err := rd.ReadRune()
  255. if err != nil {
  256. break
  257. }
  258. runes = append(runes, rn)
  259. posMap = append(posMap, curPos)
  260. curPos += size
  261. }
  262. posMap = append(posMap, curPos)
  263. return
  264. }
  265. func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
  266. wrapped := (*regexp2.Regexp)(r)
  267. if limit < 0 {
  268. limit = len(s) + 1
  269. }
  270. results := make([][]int, 0, limit)
  271. posMap, runes, mappedStart := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
  272. match, err := wrapped.FindRunesMatchStartingAt(runes, mappedStart)
  273. if err != nil {
  274. return nil
  275. }
  276. for match != nil {
  277. groups := match.Groups()
  278. result := make([]int, 0, len(groups)<<1)
  279. for _, group := range groups {
  280. if len(group.Captures) > 0 {
  281. start := posMap[group.Index]
  282. end := posMap[group.Index+group.Length]
  283. result = append(result, start, end)
  284. } else {
  285. result = append(result, -1, 0)
  286. }
  287. }
  288. if sticky && len(result) > 1 {
  289. if result[0] != start {
  290. break
  291. }
  292. start = result[1]
  293. }
  294. results = append(results, result)
  295. match, err = wrapped.FindNextMatch(match)
  296. if err != nil {
  297. return nil
  298. }
  299. }
  300. return results
  301. }
  302. func (r *regexp2Wrapper) findAllSubmatchIndex(s valueString, start, limit int, sticky, fullUnicode bool) [][]int {
  303. switch s := s.(type) {
  304. case asciiString:
  305. return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
  306. case unicodeString:
  307. if fullUnicode {
  308. return r.findAllSubmatchIndexUnicode(s, start, limit, sticky)
  309. }
  310. return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
  311. default:
  312. panic("Unsupported string type")
  313. }
  314. }
  315. func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
  316. wrapped := (*regexp.Regexp)(r)
  317. results = wrapped.FindAllStringSubmatchIndex(s, limit)
  318. pos := 0
  319. if sticky {
  320. for i, result := range results {
  321. if len(result) > 1 {
  322. if result[0] != pos {
  323. return results[:i]
  324. }
  325. pos = result[1]
  326. }
  327. }
  328. }
  329. return
  330. }
  331. func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) (result []int) {
  332. wrapped := (*regexp.Regexp)(r)
  333. if fullUnicode {
  334. posMap, runes, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)
  335. res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
  336. for i, item := range res {
  337. res[i] = posMap[item]
  338. }
  339. return res
  340. }
  341. return wrapped.FindReaderSubmatchIndex(s.utf16Reader(0))
  342. }
  343. func (r *regexpObject) execResultToArray(target valueString, result []int) Value {
  344. captureCount := len(result) >> 1
  345. valueArray := make([]Value, captureCount)
  346. matchIndex := result[0]
  347. lowerBound := matchIndex
  348. for index := 0; index < captureCount; index++ {
  349. offset := index << 1
  350. if result[offset] >= lowerBound {
  351. valueArray[index] = target.substring(result[offset], result[offset+1])
  352. lowerBound = result[offset]
  353. } else {
  354. valueArray[index] = _undefined
  355. }
  356. }
  357. match := r.val.runtime.newArrayValues(valueArray)
  358. match.self.setOwnStr("input", target, false)
  359. match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
  360. return match
  361. }
  362. func (r *regexpObject) getLastIndex() int64 {
  363. lastIndex := toLength(r.getStr("lastIndex", nil))
  364. if !r.pattern.global && !r.pattern.sticky {
  365. return 0
  366. }
  367. return lastIndex
  368. }
  369. func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
  370. if r.pattern.sticky {
  371. if firstResult == nil || int64(firstResult[0]) != index {
  372. r.setOwnStr("lastIndex", intToValue(0), true)
  373. return false
  374. }
  375. } else {
  376. if firstResult == nil {
  377. if r.pattern.global {
  378. r.setOwnStr("lastIndex", intToValue(0), true)
  379. }
  380. return false
  381. }
  382. }
  383. if r.pattern.global || r.pattern.sticky {
  384. r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
  385. }
  386. return true
  387. }
  388. func (r *regexpObject) execRegexp(target valueString) (match bool, result []int) {
  389. index := r.getLastIndex()
  390. if index >= 0 && index <= int64(target.length()) {
  391. result = r.pattern.findSubmatchIndex(target, int(index))
  392. }
  393. match = r.updateLastIndex(index, result, result)
  394. return
  395. }
  396. func (r *regexpObject) exec(target valueString) Value {
  397. match, result := r.execRegexp(target)
  398. if match {
  399. return r.execResultToArray(target, result)
  400. }
  401. return _null
  402. }
  403. func (r *regexpObject) test(target valueString) bool {
  404. match, _ := r.execRegexp(target)
  405. return match
  406. }
  407. func (r *regexpObject) clone() *Object {
  408. r1 := r.val.runtime.newRegexpObject(r.prototype)
  409. r1.source = r.source
  410. r1.pattern = r.pattern
  411. return r1.val
  412. }
  413. func (r *regexpObject) init() {
  414. r.baseObject.init()
  415. r.standard = true
  416. r._putProp("lastIndex", intToValue(0), true, false, false)
  417. }
  418. func (r *regexpObject) setProto(proto *Object, throw bool) bool {
  419. res := r.baseObject.setProto(proto, throw)
  420. if res {
  421. r.standard = false
  422. }
  423. return res
  424. }
  425. func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
  426. res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
  427. if res {
  428. r.standard = false
  429. }
  430. return res
  431. }
  432. func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
  433. res := r.baseObject.deleteStr(name, throw)
  434. if res {
  435. r.standard = false
  436. }
  437. return res
  438. }
  439. func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
  440. if r.standard {
  441. if name == "exec" {
  442. res := r.baseObject.setOwnStr(name, value, throw)
  443. if res {
  444. r.standard = false
  445. }
  446. return res
  447. }
  448. }
  449. return r.baseObject.setOwnStr(name, value, throw)
  450. }