// parser.odin
  1. package encoding_json
  2. import "core:mem"
  3. import "core:unicode/utf8"
  4. import "core:unicode/utf16"
  5. import "core:strconv"
// Parser carries the tokenizer plus one token of lookahead while building a
// Value tree from JSON/JSON5/SJSON input.
Parser :: struct {
	tok:            Tokenizer,     // tokenizer over the input text
	prev_token:     Token,         // token consumed most recently (set by advance_token)
	curr_token:     Token,         // lookahead token the parse procedures inspect
	spec:           Specification, // which JSON dialect is being parsed
	allocator:      mem.Allocator, // allocator used for all parsed values and keys
	parse_integers: bool,          // forwarded to the tokenizer; enables Integer tokens
}
  14. make_parser :: proc(data: []byte, spec := DEFAULT_SPECIFICATION, parse_integers := false, allocator := context.allocator) -> Parser {
  15. return make_parser_from_string(string(data), spec, parse_integers, allocator)
  16. }
  17. make_parser_from_string :: proc(data: string, spec := DEFAULT_SPECIFICATION, parse_integers := false, allocator := context.allocator) -> Parser {
  18. p: Parser
  19. p.tok = make_tokenizer(data, spec, parse_integers)
  20. p.spec = spec
  21. p.allocator = allocator
  22. assert(p.allocator.procedure != nil)
  23. advance_token(&p)
  24. return p
  25. }
  26. parse :: proc(data: []byte, spec := DEFAULT_SPECIFICATION, parse_integers := false, allocator := context.allocator, loc := #caller_location) -> (Value, Error) {
  27. return parse_string(string(data), spec, parse_integers, allocator, loc)
  28. }
  29. parse_string :: proc(data: string, spec := DEFAULT_SPECIFICATION, parse_integers := false, allocator := context.allocator, loc := #caller_location) -> (Value, Error) {
  30. context.allocator = allocator
  31. p := make_parser_from_string(data, spec, parse_integers, allocator)
  32. switch p.spec {
  33. case .JSON:
  34. return parse_object(&p, loc)
  35. case .JSON5:
  36. return parse_value(&p, loc)
  37. case .SJSON:
  38. #partial switch p.curr_token.kind {
  39. case .Ident, .String:
  40. return parse_object_body(&p, .EOF, loc)
  41. }
  42. return parse_value(&p, loc)
  43. }
  44. return parse_object(&p, loc)
  45. }
  46. token_end_pos :: proc(tok: Token) -> Pos {
  47. end := tok.pos
  48. end.offset += len(tok.text)
  49. return end
  50. }
  51. advance_token :: proc(p: ^Parser) -> (Token, Error) {
  52. err: Error
  53. p.prev_token = p.curr_token
  54. p.curr_token, err = get_token(&p.tok)
  55. return p.prev_token, err
  56. }
  57. allow_token :: proc(p: ^Parser, kind: Token_Kind) -> bool {
  58. if p.curr_token.kind == kind {
  59. advance_token(p)
  60. return true
  61. }
  62. return false
  63. }
  64. expect_token :: proc(p: ^Parser, kind: Token_Kind) -> Error {
  65. prev := p.curr_token
  66. advance_token(p)
  67. if prev.kind == kind {
  68. return nil
  69. }
  70. return .Unexpected_Token
  71. }
  72. parse_colon :: proc(p: ^Parser) -> (err: Error) {
  73. colon_err := expect_token(p, .Colon)
  74. if colon_err == nil {
  75. return nil
  76. }
  77. return .Expected_Colon_After_Key
  78. }
  79. parse_comma :: proc(p: ^Parser) -> (do_break: bool) {
  80. switch p.spec {
  81. case .JSON5, .MJSON:
  82. if allow_token(p, .Comma) {
  83. return false
  84. }
  85. return false
  86. case .JSON:
  87. if !allow_token(p, .Comma) {
  88. return true
  89. }
  90. }
  91. return false
  92. }
// parse_value parses the single value starting at the current token and
// advances past it: null/true/false, numbers, strings, objects, arrays,
// and — outside strict JSON — bare idents (MJSON), Infinity and NaN.
// On an unusable token it consumes it and returns .Unexpected_Token.
parse_value :: proc(p: ^Parser, loc := #caller_location) -> (value: Value, err: Error) {
	err = .None
	token := p.curr_token
	#partial switch token.kind {
	case .Null:
		advance_token(p)
		value = Null{}
		return
	case .False:
		advance_token(p)
		value = Boolean(false)
		return
	case .True:
		advance_token(p)
		value = Boolean(true)
		return
	case .Integer:
		advance_token(p)
		// Parse error deliberately ignored; the tokenizer already vetted the text.
		i, _ := strconv.parse_i64(token.text)
		value = Integer(i)
		return
	case .Float:
		advance_token(p)
		f, _ := strconv.parse_f64(token.text)
		value = Float(f)
		return
	case .Ident:
		// A bare identifier is only a legal value in MJSON; in other specs
		// fall through to the error path below.
		if p.spec == .MJSON {
			advance_token(p)
			return string(token.text), nil
		}
	case .String:
		advance_token(p)
		return unquote_string(token, p.spec, p.allocator, loc)
	case .Open_Brace:
		return parse_object(p, loc)
	case .Open_Bracket:
		return parse_array(p, loc)
	case:
		// Non-strict specs additionally accept Infinity and NaN literals.
		if p.spec != .JSON {
			switch {
			case allow_token(p, .Infinity):
				// IEEE-754 bit pattern for +inf; the token text keeps a
				// leading '-' for the negative form, flipping the sign bit.
				inf: u64 = 0x7ff0000000000000
				if token.text[0] == '-' {
					inf = 0xfff0000000000000
				}
				value = transmute(f64)inf
				return
			case allow_token(p, .NaN):
				// NaN bit pattern with the sign taken from the token text.
				nan: u64 = 0x7ff7ffffffffffff
				if token.text[0] == '-' {
					nan = 0xfff7ffffffffffff
				}
				value = transmute(f64)nan
				return
			}
		}
	}
	err = .Unexpected_Token
	advance_token(p)
	return
}
// parse_array parses `[ elem, elem, ... ]` into an Array allocated from the
// parser's allocator. On any error, elements parsed so far and the backing
// array are freed by the deferred cleanup before the error is returned.
parse_array :: proc(p: ^Parser, loc := #caller_location) -> (value: Value, err: Error) {
	err = .None
	expect_token(p, .Open_Bracket) or_return
	array: Array
	array.allocator = p.allocator
	// Runs after any or_return below: release partial results on failure.
	defer if err != nil {
		for elem in array {
			destroy_value(elem, loc=loc)
		}
		delete(array, loc)
	}
	for p.curr_token.kind != .Close_Bracket {
		elem := parse_value(p, loc) or_return
		append(&array, elem, loc)
		// In strict JSON a missing comma means the array must be ending.
		if parse_comma(p) {
			break
		}
	}
	expect_token(p, .Close_Bracket) or_return
	value = array
	return
}
  177. @(private)
  178. bytes_make :: proc(size, alignment: int, allocator: mem.Allocator, loc := #caller_location) -> (bytes: []byte, err: Error) {
  179. b, berr := mem.alloc_bytes(size, alignment, allocator, loc)
  180. if berr != nil {
  181. if berr == .Out_Of_Memory {
  182. err = .Out_Of_Memory
  183. } else {
  184. err = .Invalid_Allocator
  185. }
  186. }
  187. bytes = b
  188. return
  189. }
  190. clone_string :: proc(s: string, allocator: mem.Allocator, loc := #caller_location) -> (str: string, err: Error) {
  191. n := len(s)
  192. b := bytes_make(n+1, 1, allocator, loc) or_return
  193. copy(b, s)
  194. if len(b) > n {
  195. b[n] = 0
  196. str = string(b[:n])
  197. }
  198. return
  199. }
  200. parse_object_key :: proc(p: ^Parser, key_allocator: mem.Allocator, loc := #caller_location) -> (key: string, err: Error) {
  201. tok := p.curr_token
  202. if p.spec != .JSON {
  203. if allow_token(p, .Ident) {
  204. return clone_string(tok.text, key_allocator, loc)
  205. }
  206. }
  207. if tok_err := expect_token(p, .String); tok_err != nil {
  208. err = .Expected_String_For_Object_Key
  209. return
  210. }
  211. return unquote_string(tok, p.spec, key_allocator, loc)
  212. }
// parse_object_body parses `key: value` pairs until `end_token` is reached
// (Close_Brace for braced objects, EOF for bare SJSON documents), returning
// the accumulated Object. On error, every key and value inserted so far is
// freed by the deferred cleanup, and a duplicate key yields
// .Duplicate_Object_Key.
parse_object_body :: proc(p: ^Parser, end_token: Token_Kind, loc := #caller_location) -> (obj: Object, err: Error) {
	obj = make(Object, allocator=p.allocator, loc=loc)
	// Runs after any or_return/early return below: release partial contents.
	defer if err != nil {
		for key, elem in obj {
			delete(key, p.allocator, loc)
			destroy_value(elem, loc=loc)
		}
		delete(obj, loc)
	}
	for p.curr_token.kind != end_token {
		key := parse_object_key(p, p.allocator, loc) or_return
		parse_colon(p) or_return
		elem := parse_value(p, loc) or_return
		if key in obj {
			err = .Duplicate_Object_Key
			// The freshly cloned key is not in the map yet, so the deferred
			// cleanup would miss it; free it here.
			delete(key, p.allocator, loc)
			return
		}
		// NOTE(gonz): There are code paths for which this traversal ends up
		// inserting empty key/values into the object and for those we do not
		// want to allocate anything
		// NOTE(review): when key == "" the parsed `elem` is dropped without
		// destroy_value — presumably harmless for those code paths; confirm.
		if key != "" {
			reserve_error := reserve(&obj, len(obj) + 1, loc)
			if reserve_error == mem.Allocator_Error.Out_Of_Memory {
				return nil, .Out_Of_Memory
			}
			obj[key] = elem
		}
		// In strict JSON a missing comma means the object must be ending.
		if parse_comma(p) {
			break
		}
	}
	return obj, .None
}
  247. parse_object :: proc(p: ^Parser, loc := #caller_location) -> (value: Value, err: Error) {
  248. expect_token(p, .Open_Brace) or_return
  249. obj := parse_object_body(p, .Close_Brace, loc) or_return
  250. expect_token(p, .Close_Brace) or_return
  251. return obj, .None
  252. }
// IMPORTANT NOTE(bill): unquote_string assumes a mostly valid string
// unquote_string strips the surrounding quotes from a .String token and
// decodes escape sequences (\", \\, \b, \f, \n, \r, \t, /, \uXXXX with
// UTF-16 surrogate pairs, plus — outside strict JSON — \0, \v and \xHH),
// returning a string allocated from `allocator`. Malformed escapes simply
// truncate the output at that point rather than erroring.
unquote_string :: proc(token: Token, spec: Specification, allocator := context.allocator, loc := #caller_location) -> (value: string, err: Error) {
	// Decode a `\xHH` escape (including the backslash) into a rune; -1 on
	// malformed input.
	get_u2_rune :: proc(s: string) -> rune {
		if len(s) < 4 || s[0] != '\\' || s[1] != 'x' {
			return -1
		}
		r: rune
		for c in s[2:4] {
			x: rune
			switch c {
			case '0'..='9': x = c - '0'
			case 'a'..='f': x = c - 'a' + 10
			case 'A'..='F': x = c - 'A' + 10
			case: return -1
			}
			r = r*16 + x
		}
		return r
	}
	// Decode a `\uHHHH` escape (including the backslash) into a rune; -1 on
	// malformed input.
	get_u4_rune :: proc(s: string) -> rune {
		if len(s) < 6 || s[0] != '\\' || s[1] != 'u' {
			return -1
		}
		r: rune
		for c in s[2:6] {
			x: rune
			switch c {
			case '0'..='9': x = c - '0'
			case 'a'..='f': x = c - 'a' + 10
			case 'A'..='F': x = c - 'A' + 10
			case: return -1
			}
			r = r*16 + x
		}
		return r
	}
	if token.kind != .String {
		return "", nil
	}
	s := token.text
	if len(s) <= 2 {
		// Nothing between the quotes (or not even a full quote pair).
		return "", nil
	}
	quote := s[0]
	if s[0] != s[len(s)-1] {
		// Invalid string
		return "", nil
	}
	// Drop the surrounding quotes.
	s = s[1:len(s)-1]
	// Fast path: scan up to the first byte needing special handling
	// (escape, embedded quote, control byte, or invalid UTF-8).
	i := 0
	for i < len(s) {
		c := s[i]
		if c == '\\' || c == quote || c < ' ' {
			break
		}
		if c < utf8.RUNE_SELF {
			i += 1
			continue
		}
		r, w := utf8.decode_rune_in_string(s[i:])
		if r == utf8.RUNE_ERROR && w == 1 {
			break
		}
		i += w
	}
	if i == len(s) {
		// No escapes at all: a plain copy suffices.
		return clone_string(s, allocator, loc)
	}
	// Every escape decodes to at most one rune, and each rune re-encodes into
	// no more bytes than its source, so len(s) plus slack is enough.
	b := bytes_make(len(s) + 2*utf8.UTF_MAX, 1, allocator) or_return
	w := copy(b, s[0:i])
	if len(b) == 0 && allocator.data == nil {
		// `unmarshal_count_array` calls us with a nil allocator
		return string(b[:w]), nil
	}
	// Slow path: `i` reads from s, `w` writes into b; any malformed escape
	// breaks out of the loop, truncating the result.
	loop: for i < len(s) {
		c := s[i]
		switch {
		case c == '\\':
			i += 1
			if i >= len(s) {
				break loop
			}
			switch s[i] {
			case: break loop
			case '"', '\'', '\\', '/':
				b[w] = s[i]
				i += 1
				w += 1
			case 'b':
				b[w] = '\b'
				i += 1
				w += 1
			case 'f':
				b[w] = '\f'
				i += 1
				w += 1
			case 'r':
				b[w] = '\r'
				i += 1
				w += 1
			case 't':
				b[w] = '\t'
				i += 1
				w += 1
			case 'n':
				b[w] = '\n'
				i += 1
				w += 1
			case 'u':
				i -= 1 // Include the \u in the check for sanity sake
				r := get_u4_rune(s[i:])
				if r < 0 {
					break loop
				}
				i += 6
				// If this is a surrogate pair, decode as such by taking the next rune too.
				if r >= utf8.SURROGATE_MIN && r <= utf8.SURROGATE_HIGH_MAX && len(s) > i + 2 && s[i:i+2] == "\\u" {
					r2 := get_u4_rune(s[i:])
					if r2 >= utf8.SURROGATE_LOW_MIN && r2 <= utf8.SURROGATE_MAX {
						i += 6
						r = utf16.decode_surrogate_pair(r, r2)
					}
				}
				buf, buf_width := utf8.encode_rune(r)
				copy(b[w:], buf[:buf_width])
				w += buf_width
			case '0':
				// \0 is a JSON5/SJSON extension; invalid in strict JSON.
				if spec != .JSON {
					b[w] = '\x00'
					i += 1
					w += 1
				} else {
					break loop
				}
			case 'v':
				// \v is a JSON5/SJSON extension; invalid in strict JSON.
				if spec != .JSON {
					b[w] = '\v'
					i += 1
					w += 1
				} else {
					break loop
				}
			case 'x':
				// \xHH is a JSON5/SJSON extension; invalid in strict JSON.
				if spec != .JSON {
					i -= 1 // Include the \x in the check for sanity sake
					r := get_u2_rune(s[i:])
					if r < 0 {
						break loop
					}
					i += 4
					buf, buf_width := utf8.encode_rune(r)
					copy(b[w:], buf[:buf_width])
					w += buf_width
				} else {
					break loop
				}
			}
		case c == quote, c < ' ':
			// Unescaped quote or control byte terminates the string early.
			break loop
		case c < utf8.RUNE_SELF:
			b[w] = c
			i += 1
			w += 1
		case:
			// Multi-byte UTF-8: decode then re-encode (normalizes errors).
			r, width := utf8.decode_rune_in_string(s[i:])
			i += width
			buf, buf_width := utf8.encode_rune(r)
			assert(buf_width <= width)
			copy(b[w:], buf[:buf_width])
			w += buf_width
		}
	}
	return string(b[:w]), nil
}