regexp.go 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. package parser
  2. import (
  3. "fmt"
  4. "strconv"
  5. "strings"
  6. "unicode/utf8"
  7. )
  8. const (
  9. WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff"
  10. Re2Dot = "[^\r\n\u2028\u2029]"
  11. )
  12. type regexpParseError struct {
  13. offset int
  14. err string
  15. }
  16. type RegexpErrorIncompatible struct {
  17. regexpParseError
  18. }
  19. type RegexpSyntaxError struct {
  20. regexpParseError
  21. }
  22. func (s regexpParseError) Error() string {
  23. return s.err
  24. }
  25. type _RegExp_parser struct {
  26. str string
  27. length int
  28. chr rune // The current character
  29. chrOffset int // The offset of current character
  30. offset int // The offset after current character (may be greater than 1)
  31. err error
  32. goRegexp strings.Builder
  33. passOffset int
  34. }
  35. // TransformRegExp transforms a JavaScript pattern into a Go "regexp" pattern.
  36. //
  37. // re2 (Go) cannot do backtracking, so the presence of a lookahead (?=) (?!) or
  38. // backreference (\1, \2, ...) will cause an error.
  39. //
  40. // re2 (Go) has a different definition for \s: [\t\n\f\r ].
  41. // The JavaScript definition, on the other hand, also includes \v, Unicode "Separator, Space", etc.
  42. //
  43. // If the pattern is valid, but incompatible (contains a lookahead or backreference),
  44. // then this function returns an empty string an error of type RegexpErrorIncompatible.
  45. //
  46. // If the pattern is invalid (not valid even in JavaScript), then this function
  47. // returns an empty string and a generic error.
  48. func TransformRegExp(pattern string) (transformed string, err error) {
  49. if pattern == "" {
  50. return "", nil
  51. }
  52. parser := _RegExp_parser{
  53. str: pattern,
  54. length: len(pattern),
  55. }
  56. err = parser.parse()
  57. if err != nil {
  58. return "", err
  59. }
  60. return parser.ResultString(), nil
  61. }
  62. func (self *_RegExp_parser) ResultString() string {
  63. if self.passOffset != -1 {
  64. return self.str[:self.passOffset]
  65. }
  66. return self.goRegexp.String()
  67. }
  68. func (self *_RegExp_parser) parse() (err error) {
  69. self.read() // Pull in the first character
  70. self.scan()
  71. return self.err
  72. }
  73. func (self *_RegExp_parser) read() {
  74. if self.offset < self.length {
  75. self.chrOffset = self.offset
  76. chr, width := rune(self.str[self.offset]), 1
  77. if chr >= utf8.RuneSelf { // !ASCII
  78. chr, width = utf8.DecodeRuneInString(self.str[self.offset:])
  79. if chr == utf8.RuneError && width == 1 {
  80. self.error(true, "Invalid UTF-8 character")
  81. return
  82. }
  83. }
  84. self.offset += width
  85. self.chr = chr
  86. } else {
  87. self.chrOffset = self.length
  88. self.chr = -1 // EOF
  89. }
  90. }
  91. func (self *_RegExp_parser) stopPassing() {
  92. self.goRegexp.Grow(3 * len(self.str) / 2)
  93. self.goRegexp.WriteString(self.str[:self.passOffset])
  94. self.passOffset = -1
  95. }
  96. func (self *_RegExp_parser) write(p []byte) {
  97. if self.passOffset != -1 {
  98. self.stopPassing()
  99. }
  100. self.goRegexp.Write(p)
  101. }
  102. func (self *_RegExp_parser) writeByte(b byte) {
  103. if self.passOffset != -1 {
  104. self.stopPassing()
  105. }
  106. self.goRegexp.WriteByte(b)
  107. }
  108. func (self *_RegExp_parser) writeString(s string) {
  109. if self.passOffset != -1 {
  110. self.stopPassing()
  111. }
  112. self.goRegexp.WriteString(s)
  113. }
  114. func (self *_RegExp_parser) scan() {
  115. for self.chr != -1 {
  116. switch self.chr {
  117. case '\\':
  118. self.read()
  119. self.scanEscape(false)
  120. case '(':
  121. self.pass()
  122. self.scanGroup()
  123. case '[':
  124. self.scanBracket()
  125. case ')':
  126. self.error(true, "Unmatched ')'")
  127. return
  128. case '.':
  129. self.writeString(Re2Dot)
  130. self.read()
  131. default:
  132. self.pass()
  133. }
  134. }
  135. }
  136. // (...)
  137. func (self *_RegExp_parser) scanGroup() {
  138. str := self.str[self.chrOffset:]
  139. if len(str) > 1 { // A possibility of (?= or (?!
  140. if str[0] == '?' {
  141. ch := str[1]
  142. switch {
  143. case ch == '=' || ch == '!':
  144. self.error(false, "re2: Invalid (%s) <lookahead>", self.str[self.chrOffset:self.chrOffset+2])
  145. return
  146. case ch == '<':
  147. self.error(false, "re2: Invalid (%s) <lookbehind>", self.str[self.chrOffset:self.chrOffset+2])
  148. return
  149. case ch != ':':
  150. self.error(true, "Invalid group")
  151. return
  152. }
  153. }
  154. }
  155. for self.chr != -1 && self.chr != ')' {
  156. switch self.chr {
  157. case '\\':
  158. self.read()
  159. self.scanEscape(false)
  160. case '(':
  161. self.pass()
  162. self.scanGroup()
  163. case '[':
  164. self.scanBracket()
  165. case '.':
  166. self.writeString(Re2Dot)
  167. self.read()
  168. default:
  169. self.pass()
  170. continue
  171. }
  172. }
  173. if self.chr != ')' {
  174. self.error(true, "Unterminated group")
  175. return
  176. }
  177. self.pass()
  178. }
  179. // [...]
  180. func (self *_RegExp_parser) scanBracket() {
  181. str := self.str[self.chrOffset:]
  182. if strings.HasPrefix(str, "[]") {
  183. // [] -- Empty character class
  184. self.writeString("[^\u0000-\U0001FFFF]")
  185. self.offset += 1
  186. self.read()
  187. return
  188. }
  189. if strings.HasPrefix(str, "[^]") {
  190. self.writeString("[\u0000-\U0001FFFF]")
  191. self.offset += 2
  192. self.read()
  193. return
  194. }
  195. self.pass()
  196. for self.chr != -1 {
  197. if self.chr == ']' {
  198. break
  199. } else if self.chr == '\\' {
  200. self.read()
  201. self.scanEscape(true)
  202. continue
  203. }
  204. self.pass()
  205. }
  206. if self.chr != ']' {
  207. self.error(true, "Unterminated character class")
  208. return
  209. }
  210. self.pass()
  211. }
  212. // \...
  213. func (self *_RegExp_parser) scanEscape(inClass bool) {
  214. offset := self.chrOffset
  215. var length, base uint32
  216. switch self.chr {
  217. case '0', '1', '2', '3', '4', '5', '6', '7':
  218. var value int64
  219. size := 0
  220. for {
  221. digit := int64(digitValue(self.chr))
  222. if digit >= 8 {
  223. // Not a valid digit
  224. break
  225. }
  226. value = value*8 + digit
  227. self.read()
  228. size += 1
  229. }
  230. if size == 1 { // The number of characters read
  231. if value != 0 {
  232. // An invalid backreference
  233. self.error(false, "re2: Invalid \\%d <backreference>", value)
  234. return
  235. }
  236. self.passString(offset-1, self.chrOffset)
  237. return
  238. }
  239. tmp := []byte{'\\', 'x', '0', 0}
  240. if value >= 16 {
  241. tmp = tmp[0:2]
  242. } else {
  243. tmp = tmp[0:3]
  244. }
  245. tmp = strconv.AppendInt(tmp, value, 16)
  246. self.write(tmp)
  247. return
  248. case '8', '9':
  249. self.read()
  250. self.error(false, "re2: Invalid \\%s <backreference>", self.str[offset:self.chrOffset])
  251. return
  252. case 'x':
  253. self.read()
  254. length, base = 2, 16
  255. case 'u':
  256. self.read()
  257. if self.chr == '{' {
  258. self.read()
  259. length, base = 0, 16
  260. } else {
  261. length, base = 4, 16
  262. }
  263. case 'b':
  264. if inClass {
  265. self.write([]byte{'\\', 'x', '0', '8'})
  266. self.read()
  267. return
  268. }
  269. fallthrough
  270. case 'B':
  271. fallthrough
  272. case 'd', 'D', 'w', 'W':
  273. // This is slightly broken, because ECMAScript
  274. // includes \v in \s, \S, while re2 does not
  275. fallthrough
  276. case '\\':
  277. fallthrough
  278. case 'f', 'n', 'r', 't', 'v':
  279. self.passString(offset-1, self.offset)
  280. self.read()
  281. return
  282. case 'c':
  283. self.read()
  284. var value int64
  285. if 'a' <= self.chr && self.chr <= 'z' {
  286. value = int64(self.chr - 'a' + 1)
  287. } else if 'A' <= self.chr && self.chr <= 'Z' {
  288. value = int64(self.chr - 'A' + 1)
  289. } else {
  290. self.writeByte('c')
  291. return
  292. }
  293. tmp := []byte{'\\', 'x', '0', 0}
  294. if value >= 16 {
  295. tmp = tmp[0:2]
  296. } else {
  297. tmp = tmp[0:3]
  298. }
  299. tmp = strconv.AppendInt(tmp, value, 16)
  300. self.write(tmp)
  301. self.read()
  302. return
  303. case 's':
  304. if inClass {
  305. self.writeString(WhitespaceChars)
  306. } else {
  307. self.writeString("[" + WhitespaceChars + "]")
  308. }
  309. self.read()
  310. return
  311. case 'S':
  312. if inClass {
  313. self.error(false, "S in class")
  314. return
  315. } else {
  316. self.writeString("[^" + WhitespaceChars + "]")
  317. }
  318. self.read()
  319. return
  320. default:
  321. // $ is an identifier character, so we have to have
  322. // a special case for it here
  323. if self.chr == '$' || self.chr < utf8.RuneSelf && !isIdentifierPart(self.chr) {
  324. // A non-identifier character needs escaping
  325. self.passString(offset-1, self.offset)
  326. self.read()
  327. return
  328. }
  329. // Unescape the character for re2
  330. self.pass()
  331. return
  332. }
  333. // Otherwise, we're a \u.... or \x...
  334. valueOffset := self.chrOffset
  335. if length > 0 {
  336. for length := length; length > 0; length-- {
  337. digit := uint32(digitValue(self.chr))
  338. if digit >= base {
  339. // Not a valid digit
  340. goto skip
  341. }
  342. self.read()
  343. }
  344. } else {
  345. for self.chr != '}' && self.chr != -1 {
  346. digit := uint32(digitValue(self.chr))
  347. if digit >= base {
  348. // Not a valid digit
  349. goto skip
  350. }
  351. self.read()
  352. }
  353. }
  354. if length == 4 || length == 0 {
  355. self.write([]byte{
  356. '\\',
  357. 'x',
  358. '{',
  359. })
  360. self.passString(valueOffset, self.chrOffset)
  361. if length != 0 {
  362. self.writeByte('}')
  363. }
  364. } else if length == 2 {
  365. self.passString(offset-1, valueOffset+2)
  366. } else {
  367. // Should never, ever get here...
  368. self.error(true, "re2: Illegal branch in scanEscape")
  369. return
  370. }
  371. return
  372. skip:
  373. self.passString(offset, self.chrOffset)
  374. }
  375. func (self *_RegExp_parser) pass() {
  376. if self.passOffset == self.chrOffset {
  377. self.passOffset = self.offset
  378. } else {
  379. if self.passOffset != -1 {
  380. self.stopPassing()
  381. }
  382. if self.chr != -1 {
  383. self.goRegexp.WriteRune(self.chr)
  384. }
  385. }
  386. self.read()
  387. }
  388. func (self *_RegExp_parser) passString(start, end int) {
  389. if self.passOffset == start {
  390. self.passOffset = end
  391. return
  392. }
  393. if self.passOffset != -1 {
  394. self.stopPassing()
  395. }
  396. self.goRegexp.WriteString(self.str[start:end])
  397. }
  398. func (self *_RegExp_parser) error(fatal bool, msg string, msgValues ...interface{}) {
  399. if self.err != nil {
  400. return
  401. }
  402. e := regexpParseError{
  403. offset: self.offset,
  404. err: fmt.Sprintf(msg, msgValues...),
  405. }
  406. if fatal {
  407. self.err = RegexpSyntaxError{e}
  408. } else {
  409. self.err = RegexpErrorIncompatible{e}
  410. }
  411. self.offset = self.length
  412. self.chr = -1
  413. }