regexp.go 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. package parser
  2. import (
  3. "bytes"
  4. "fmt"
  5. "strconv"
  6. "strings"
  7. )
  8. const (
  9. WhitespaceChars = " \f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff"
  10. )
  11. type _RegExp_parser struct {
  12. str string
  13. length int
  14. chr rune // The current character
  15. chrOffset int // The offset of current character
  16. offset int // The offset after current character (may be greater than 1)
  17. errors []error
  18. invalid bool // The input is an invalid JavaScript RegExp
  19. goRegexp *bytes.Buffer
  20. }
  21. // TransformRegExp transforms a JavaScript pattern into a Go "regexp" pattern.
  22. //
  23. // re2 (Go) cannot do backtracking, so the presence of a lookahead (?=) (?!) or
  24. // backreference (\1, \2, ...) will cause an error.
  25. //
  26. // re2 (Go) has a different definition for \s: [\t\n\f\r ].
  27. // The JavaScript definition, on the other hand, also includes \v, Unicode "Separator, Space", etc.
  28. //
  29. // If the pattern is invalid (not valid even in JavaScript), then this function
  30. // returns the empty string and an error.
  31. //
  32. // If the pattern is valid, but incompatible (contains a lookahead or backreference),
  33. // then this function returns the transformation (a non-empty string) AND an error.
  34. func TransformRegExp(pattern string) (string, error) {
  35. if pattern == "" {
  36. return "", nil
  37. }
  38. // TODO If without \, if without (?=, (?!, then another shortcut
  39. parser := _RegExp_parser{
  40. str: pattern,
  41. length: len(pattern),
  42. goRegexp: bytes.NewBuffer(make([]byte, 0, 3*len(pattern)/2)),
  43. }
  44. parser.read() // Pull in the first character
  45. parser.scan()
  46. var err error
  47. if len(parser.errors) > 0 {
  48. err = parser.errors[0]
  49. }
  50. if parser.invalid {
  51. return "", err
  52. }
  53. // Might not be re2 compatible, but is still a valid JavaScript RegExp
  54. return parser.goRegexp.String(), err
  55. }
  56. func (self *_RegExp_parser) scan() {
  57. for self.chr != -1 {
  58. switch self.chr {
  59. case '\\':
  60. self.read()
  61. self.scanEscape(false)
  62. case '(':
  63. self.pass()
  64. self.scanGroup()
  65. case '[':
  66. self.scanBracket()
  67. case ')':
  68. self.error(-1, "Unmatched ')'")
  69. self.invalid = true
  70. self.pass()
  71. case '.':
  72. self.goRegexp.WriteString("[^\\r\\n]")
  73. self.read()
  74. default:
  75. self.pass()
  76. }
  77. }
  78. }
  79. // (...)
  80. func (self *_RegExp_parser) scanGroup() {
  81. str := self.str[self.chrOffset:]
  82. if len(str) > 1 { // A possibility of (?= or (?!
  83. if str[0] == '?' {
  84. if str[1] == '=' || str[1] == '!' {
  85. self.error(-1, "re2: Invalid (%s) <lookahead>", self.str[self.chrOffset:self.chrOffset+2])
  86. }
  87. }
  88. }
  89. for self.chr != -1 && self.chr != ')' {
  90. switch self.chr {
  91. case '\\':
  92. self.read()
  93. self.scanEscape(false)
  94. case '(':
  95. self.pass()
  96. self.scanGroup()
  97. case '[':
  98. self.scanBracket()
  99. case '.':
  100. self.goRegexp.WriteString("[^\\r\\n]")
  101. self.read()
  102. default:
  103. self.pass()
  104. continue
  105. }
  106. }
  107. if self.chr != ')' {
  108. self.error(-1, "Unterminated group")
  109. self.invalid = true
  110. return
  111. }
  112. self.pass()
  113. }
  114. // [...]
  115. func (self *_RegExp_parser) scanBracket() {
  116. str := self.str[self.chrOffset:]
  117. if strings.HasPrefix(str, "[]") {
  118. // [] -- Empty character class
  119. self.goRegexp.WriteString("[^\u0000-uffff]")
  120. self.offset += 1
  121. self.read()
  122. return
  123. }
  124. if strings.HasPrefix(str, "[^]") {
  125. self.goRegexp.WriteString("[\u0000-\uffff]")
  126. self.offset += 2
  127. self.read()
  128. return
  129. }
  130. self.pass()
  131. for self.chr != -1 {
  132. if self.chr == ']' {
  133. break
  134. } else if self.chr == '\\' {
  135. self.read()
  136. self.scanEscape(true)
  137. continue
  138. }
  139. self.pass()
  140. }
  141. if self.chr != ']' {
  142. self.error(-1, "Unterminated character class")
  143. self.invalid = true
  144. return
  145. }
  146. self.pass()
  147. }
  148. // \...
  149. func (self *_RegExp_parser) scanEscape(inClass bool) {
  150. offset := self.chrOffset
  151. var length, base uint32
  152. switch self.chr {
  153. case '0', '1', '2', '3', '4', '5', '6', '7':
  154. var value int64
  155. size := 0
  156. for {
  157. digit := int64(digitValue(self.chr))
  158. if digit >= 8 {
  159. // Not a valid digit
  160. break
  161. }
  162. value = value*8 + digit
  163. self.read()
  164. size += 1
  165. }
  166. if size == 1 { // The number of characters read
  167. _, err := self.goRegexp.Write([]byte{'\\', byte(value) + '0'})
  168. if err != nil {
  169. self.errors = append(self.errors, err)
  170. }
  171. if value != 0 {
  172. // An invalid backreference
  173. self.error(-1, "re2: Invalid \\%d <backreference>", value)
  174. }
  175. return
  176. }
  177. tmp := []byte{'\\', 'x', '0', 0}
  178. if value >= 16 {
  179. tmp = tmp[0:2]
  180. } else {
  181. tmp = tmp[0:3]
  182. }
  183. tmp = strconv.AppendInt(tmp, value, 16)
  184. _, err := self.goRegexp.Write(tmp)
  185. if err != nil {
  186. self.errors = append(self.errors, err)
  187. }
  188. return
  189. case '8', '9':
  190. size := 0
  191. for {
  192. digit := digitValue(self.chr)
  193. if digit >= 10 {
  194. // Not a valid digit
  195. break
  196. }
  197. self.read()
  198. size += 1
  199. }
  200. err := self.goRegexp.WriteByte('\\')
  201. if err != nil {
  202. self.errors = append(self.errors, err)
  203. }
  204. _, err = self.goRegexp.WriteString(self.str[offset:self.chrOffset])
  205. if err != nil {
  206. self.errors = append(self.errors, err)
  207. }
  208. self.error(-1, "re2: Invalid \\%s <backreference>", self.str[offset:self.chrOffset])
  209. return
  210. case 'x':
  211. self.read()
  212. length, base = 2, 16
  213. case 'u':
  214. self.read()
  215. length, base = 4, 16
  216. case 'b':
  217. if inClass {
  218. _, err := self.goRegexp.Write([]byte{'\\', 'x', '0', '8'})
  219. if err != nil {
  220. self.errors = append(self.errors, err)
  221. }
  222. self.read()
  223. return
  224. }
  225. fallthrough
  226. case 'B':
  227. fallthrough
  228. case 'd', 'D', 'w', 'W':
  229. // This is slightly broken, because ECMAScript
  230. // includes \v in \s, \S, while re2 does not
  231. fallthrough
  232. case '\\':
  233. fallthrough
  234. case 'f', 'n', 'r', 't', 'v':
  235. err := self.goRegexp.WriteByte('\\')
  236. if err != nil {
  237. self.errors = append(self.errors, err)
  238. }
  239. self.pass()
  240. return
  241. case 'c':
  242. self.read()
  243. var value int64
  244. if 'a' <= self.chr && self.chr <= 'z' {
  245. value = int64(self.chr) - 'a' + 1
  246. } else if 'A' <= self.chr && self.chr <= 'Z' {
  247. value = int64(self.chr) - 'A' + 1
  248. } else {
  249. err := self.goRegexp.WriteByte('c')
  250. if err != nil {
  251. self.errors = append(self.errors, err)
  252. }
  253. return
  254. }
  255. tmp := []byte{'\\', 'x', '0', 0}
  256. if value >= 16 {
  257. tmp = tmp[0:2]
  258. } else {
  259. tmp = tmp[0:3]
  260. }
  261. tmp = strconv.AppendInt(tmp, value, 16)
  262. _, err := self.goRegexp.Write(tmp)
  263. if err != nil {
  264. self.errors = append(self.errors, err)
  265. }
  266. self.read()
  267. return
  268. case 's':
  269. if inClass {
  270. self.goRegexp.WriteString(WhitespaceChars)
  271. } else {
  272. self.goRegexp.WriteString("[" + WhitespaceChars + "]")
  273. }
  274. self.read()
  275. return
  276. case 'S':
  277. if inClass {
  278. self.error(self.chrOffset, "S in class")
  279. self.invalid = true
  280. return
  281. } else {
  282. self.goRegexp.WriteString("[^" + WhitespaceChars + "]")
  283. }
  284. self.read()
  285. return
  286. default:
  287. // $ is an identifier character, so we have to have
  288. // a special case for it here
  289. if self.chr == '$' || !isIdentifierPart(self.chr) {
  290. // A non-identifier character needs escaping
  291. err := self.goRegexp.WriteByte('\\')
  292. if err != nil {
  293. self.errors = append(self.errors, err)
  294. }
  295. } else {
  296. // Unescape the character for re2
  297. }
  298. self.pass()
  299. return
  300. }
  301. // Otherwise, we're a \u.... or \x...
  302. valueOffset := self.chrOffset
  303. var value uint32
  304. {
  305. length := length
  306. for ; length > 0; length-- {
  307. digit := uint32(digitValue(self.chr))
  308. if digit >= base {
  309. // Not a valid digit
  310. goto skip
  311. }
  312. value = value*base + digit
  313. self.read()
  314. }
  315. }
  316. if length == 4 {
  317. _, err := self.goRegexp.Write([]byte{
  318. '\\',
  319. 'x',
  320. '{',
  321. self.str[valueOffset+0],
  322. self.str[valueOffset+1],
  323. self.str[valueOffset+2],
  324. self.str[valueOffset+3],
  325. '}',
  326. })
  327. if err != nil {
  328. self.errors = append(self.errors, err)
  329. }
  330. } else if length == 2 {
  331. _, err := self.goRegexp.Write([]byte{
  332. '\\',
  333. 'x',
  334. self.str[valueOffset+0],
  335. self.str[valueOffset+1],
  336. })
  337. if err != nil {
  338. self.errors = append(self.errors, err)
  339. }
  340. } else {
  341. // Should never, ever get here...
  342. self.error(-1, "re2: Illegal branch in scanEscape")
  343. goto skip
  344. }
  345. return
  346. skip:
  347. _, err := self.goRegexp.WriteString(self.str[offset:self.chrOffset])
  348. if err != nil {
  349. self.errors = append(self.errors, err)
  350. }
  351. }
  352. func (self *_RegExp_parser) pass() {
  353. if self.chr != -1 {
  354. _, err := self.goRegexp.WriteRune(self.chr)
  355. if err != nil {
  356. self.errors = append(self.errors, err)
  357. }
  358. }
  359. self.read()
  360. }
  361. // TODO Better error reporting, use the offset, etc.
  362. func (self *_RegExp_parser) error(offset int, msg string, msgValues ...interface{}) error {
  363. err := fmt.Errorf(msg, msgValues...)
  364. self.errors = append(self.errors, err)
  365. return err
  366. }