// tokenizer.odin
  1. package json
  2. import "core:unicode/utf8"
// A single lexical token: its source position, category, and raw text.
Token :: struct {
	using pos: Pos,  // position where the token starts
	kind: Kind,      // token category (see Kind)
	text: string,    // slice of the tokenizer's input covering this token
}
// The category of a lexical token produced by get_token.
Kind :: enum {
	Invalid,        // malformed or unrecognized input
	Null,           // "null"
	False,          // "false"
	True,           // "true"
	Infinity,       // "Infinity" / "-Infinity" (JSON5 only)
	NaN,            // "NaN" / "-NaN" (JSON5 only)
	Ident,          // bare identifier that is not a keyword
	Integer,        // number without fraction or exponent
	Float,          // number with a fractional part
	String,         // quoted string literal (quotes included in text)
	Colon,          // ':'
	Comma,          // ','
	Open_Brace,     // '{'
	Close_Brace,    // '}'
	Open_Bracket,   // '['
	Close_Bracket,  // ']'
}
// Streaming tokenizer state over an in-memory byte buffer.
Tokenizer :: struct {
	using pos: Pos,        // current offset/line/column in the input
	data: []byte,          // entire input being tokenized
	r: rune, // current rune
	w: int, // current rune width in bytes
	curr_line_offset: int, // byte offset of the start of the current line
	spec: Specification,   // which dialect rules to apply (JSON or JSON5)
}
  34. make_tokenizer :: proc(data: []byte, spec := Specification.JSON) -> Tokenizer {
  35. t := Tokenizer{pos = {line=1}, data = data, spec = spec};
  36. next_rune(&t);
  37. if t.r == utf8.RUNE_BOM {
  38. next_rune(&t);
  39. }
  40. return t;
  41. }
// Advances the tokenizer by one rune and returns it.
// Returns utf8.RUNE_EOF once the whole buffer has been consumed; in that
// case t.r and t.w keep their previous values.
next_rune :: proc(t: ^Tokenizer) -> rune #no_bounds_check {
	if t.offset >= len(t.data) {
		return utf8.RUNE_EOF;
	}
	// Step over the previously decoded rune, then decode the next one.
	t.offset += t.w;
	t.r, t.w = utf8.decode_rune(t.data[t.offset:]);
	// Column is the byte distance from the start of the current line.
	t.pos.column = t.offset - t.curr_line_offset;
	return t.r;
}
// Scans and returns the next token from the input.
// Whitespace is skipped first; for JSON5, comments are skipped as well
// (handled via recursive calls).  On end of input `err` is Error.EOF;
// malformed input sets another Error value and usually leaves
// token.kind as Kind.Invalid.
get_token :: proc(t: ^Tokenizer) -> (token: Token, err: Error) {
	// Consume decimal digits; stops at the first non-digit rune.
	skip_digits :: proc(t: ^Tokenizer) {
		for t.offset < len(t.data) {
			if '0' <= t.r && t.r <= '9' {
				// Okay
			} else {
				return;
			}
			next_rune(t);
		}
	}
	// Consume hexadecimal digits.  Note: unlike skip_digits this advances
	// BEFORE checking, so the current rune is skipped unconditionally.
	skip_hex_digits :: proc(t: ^Tokenizer) {
		for t.offset < len(t.data) {
			next_rune(t);
			switch t.r {
			case '0'..'9', 'a'..'f', 'A'..'F':
				// Okay
			case:
				return;
			}
		}
	}
	// Consume one escape sequence after a backslash.  Returns true only
	// for a recognized single-character escape; a \uXXXX escape returns
	// false even when well-formed (the result is ignored by the caller —
	// is_valid_string_literal gives the final verdict).
	// NOTE(review): name looks like a typo for "scan_escape".
	scan_espace :: proc(t: ^Tokenizer) -> bool {
		switch t.r {
		case '"', '\'', '\\', '/', 'b', 'n', 'r', 't', 'f':
			next_rune(t);
			return true;
		case 'u':
			// Expect 4 hexadecimal digits
			for i := 0; i < 4; i += 1 {
				r := next_rune(t);
				switch r {
				case '0'..'9', 'a'..'f', 'A'..'F':
					// Okay
				case:
					return false;
				}
			}
		case:
			// Ignore the next rune regardless
			next_rune(t);
		}
		return false;
	}
	// Skip whitespace, maintaining the line/column bookkeeping on '\n'.
	skip_whitespace :: proc(t: ^Tokenizer) -> rune {
		loop: for t.offset < len(t.data) {
			switch t.r {
			case ' ', '\t', '\v', '\f', '\r':
				next_rune(t);
			case '\n':
				t.line += 1;
				t.curr_line_offset = t.offset;
				t.pos.column = 1;
				next_rune(t);
			case:
				if t.spec == Specification.JSON5 {
					// JSON5 additionally treats LS, PS and BOM as whitespace
					switch t.r {
					case 0x2028, 0x2029, 0xFEFF:
						next_rune(t);
						continue loop;
					}
				}
				break loop;
			}
		}
		return t.r;
	}
	// Advance past the next '\n' (used for // single-line comments).
	skip_to_next_line :: proc(t: ^Tokenizer) {
		for t.offset < len(t.data) {
			r := next_rune(t);
			if r == '\n' {
				return;
			}
		}
	}
	// Consume an identifier tail: [A-Za-z0-9_]*.
	skip_alphanum :: proc(t: ^Tokenizer) {
		for t.offset < len(t.data) {
			switch next_rune(t) {
			case 'A'..'Z', 'a'..'z', '0'..'9', '_':
				continue;
			}
			return;
		}
	}

	skip_whitespace(t);
	token.pos = t.pos;
	token.kind = Kind.Invalid;

	// The token is dispatched on the rune at its start; t.r is already the
	// rune AFTER curr_rune for the rest of this procedure.
	curr_rune := t.r;
	next_rune(t);

	block: switch curr_rune {
	case utf8.RUNE_ERROR:
		err = Error.Illegal_Character;
	case utf8.RUNE_EOF, '\x00':
		err = Error.EOF;

	case 'A'..'Z', 'a'..'z', '_':
		token.kind = Kind.Ident;
		skip_alphanum(t);
		// Keyword recognition: null/false/true, plus Infinity/NaN in JSON5.
		switch str := string(t.data[token.offset:t.offset]); str {
		case "null": token.kind = Kind.Null;
		case "false": token.kind = Kind.False;
		case "true": token.kind = Kind.True;
		case:
			if t.spec == Specification.JSON5 do switch str {
			case "Infinity": token.kind = Kind.Infinity;
			case "NaN": token.kind = Kind.NaN;
			}
		}

	case '+':
		// A leading '+' sign is only legal in JSON5.
		// NOTE(review): `err` is never cleared on the JSON5 fallthrough
		// path, so "+1" appears to still carry Illegal_Character — confirm
		// how callers interpret `err` alongside a valid token.kind.
		err = Error.Illegal_Character;
		if t.spec != Specification.JSON5 {
			break;
		}
		fallthrough;
	case '-':
		switch t.r {
		case '0'..'9':
			// Okay
		case:
			// Illegal use of +/-
			err = Error.Illegal_Character;
			if t.spec == Specification.JSON5 {
				// JSON5 also permits -Infinity and -NaN
				if t.r == 'I' || t.r == 'N' {
					skip_alphanum(t);
				}
				switch string(t.data[token.offset:t.offset]) {
				case "-Infinity": token.kind = Kind.Infinity;
				case "-NaN": token.kind = Kind.NaN;
				}
			}
			break block;
		}
		fallthrough;
	case '0'..'9':
		token.kind = Kind.Integer;
		if t.spec == Specification.JSON5 { // Hexadecimal Numbers
			if curr_rune == '0' && (t.r == 'x' || t.r == 'X') {
				next_rune(t);
				skip_hex_digits(t);
				break;
			}
		}
		skip_digits(t);
		// Optional fractional part switches the token to Float.
		if t.r == '.' {
			token.kind = Kind.Float;
			next_rune(t);
			skip_digits(t);
		}
		// Optional exponent with optional sign.
		if t.r == 'e' || t.r == 'E' {
			switch r := next_rune(t); r {
			case '+', '-':
				next_rune(t);
			}
			skip_digits(t);
		}
		// Re-validate the full scanned lexeme as a number.
		str := string(t.data[token.offset:t.offset]);
		if !is_valid_number(str, t.spec) {
			err = Error.Invalid_Number;
		}

	case '.':
		// A leading decimal point is only legal in JSON5.
		// NOTE(review): `err` stays Illegal_Character and token.kind is
		// never set to Float here, so a valid JSON5 ".5" still reports an
		// error — confirm intended behavior.
		err = Error.Illegal_Character;
		if t.spec == Specification.JSON5 { // Allow leading decimal point
			skip_digits(t);
			if t.r == 'e' || t.r == 'E' {
				switch r := next_rune(t); r {
				case '+', '-':
					next_rune(t);
				}
				skip_digits(t);
			}
			str := string(t.data[token.offset:t.offset]);
			if !is_valid_number(str, t.spec) {
				err = Error.Invalid_Number;
			}
		}

	case '\'':
		// Single-quoted strings are only legal in JSON5.
		// NOTE(review): `err` is never cleared on the fallthrough — see '+'.
		err = Error.Illegal_Character;
		if t.spec != Specification.JSON5 {
			break;
		}
		fallthrough;
	case '"':
		token.kind = Kind.String;
		quote := curr_rune;
		// Scan to the matching close quote; newline or a negative rune
		// (decode error/EOF sentinel) means the string never terminated.
		for t.offset < len(t.data) {
			r := t.r;
			if r == '\n' || r < 0 {
				err = Error.String_Not_Terminated;
				break;
			}
			next_rune(t);
			if r == quote {
				break;
			}
			if r == '\\' {
				scan_espace(t);
			}
		}
		// Final validation of the full literal (quotes included).
		str := string(t.data[token.offset : t.offset]);
		if !is_valid_string_literal(str, t.spec) {
			err = Error.Invalid_String;
		}

	case ',': token.kind = Kind.Comma;
	case ':': token.kind = Kind.Colon;
	case '{': token.kind = Kind.Open_Brace;
	case '}': token.kind = Kind.Close_Brace;
	case '[': token.kind = Kind.Open_Bracket;
	case ']': token.kind = Kind.Close_Bracket;

	case '/':
		// Comments are only legal in JSON5.
		err = Error.Illegal_Character;
		if t.spec == Specification.JSON5 {
			switch t.r {
			case '/':
				// Single-line comments
				skip_to_next_line(t);
				return get_token(t);
			case '*':
				// None-nested multi-line comments
				for t.offset < len(t.data) {
					next_rune(t);
					if t.r == '*' {
						next_rune(t);
						if t.r == '/' {
							next_rune(t);
							return get_token(t);
						}
					}
				}
				// Ran out of input before "*/"
				err = Error.EOF;
			}
		}

	case: err = Error.Illegal_Character;
	}

	token.text = string(t.data[token.offset : t.offset]);
	return;
}
  286. is_valid_number :: proc(str: string, spec: Specification) -> bool {
  287. s := str;
  288. if s == "" {
  289. return false;
  290. }
  291. if s[0] == '-' {
  292. s = s[1:];
  293. if s == "" {
  294. return false;
  295. }
  296. } else if spec == Specification.JSON5 {
  297. if s[0] == '+' { // Allow positive sign
  298. s = s[1:];
  299. if s == "" {
  300. return false;
  301. }
  302. }
  303. }
  304. switch s[0] {
  305. case '0':
  306. s = s[1:];
  307. case '1'..'9':
  308. s = s[1:];
  309. for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:];
  310. case '.':
  311. if spec == Specification.JSON5 { // Allow leading decimal point
  312. s = s[1:];
  313. } else {
  314. return false;
  315. }
  316. case:
  317. return false;
  318. }
  319. if spec == Specification.JSON5 {
  320. if len(s) == 1 && s[0] == '.' { // Allow trailing decimal point
  321. return true;
  322. }
  323. }
  324. if len(s) >= 2 && s[0] == '.' && '0' <= s[1] && s[1] <= '9' {
  325. s = s[2:];
  326. for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:];
  327. }
  328. if len(s) >= 2 && (s[0] == 'e' || s[0] == 'E') {
  329. s = s[1:];
  330. switch s[0] {
  331. case '+', '-':
  332. s = s[1:];
  333. if s == "" {
  334. return false;
  335. }
  336. }
  337. for len(s) > 0 && '0' <= s[0] && s[0] <= '9' do s = s[1:];
  338. }
  339. // The string should be empty now to be valid
  340. return s == "";
  341. }
  342. is_valid_string_literal :: proc(str: string, spec: Specification) -> bool {
  343. s := str;
  344. if len(s) < 2 {
  345. return false;
  346. }
  347. quote := s[0];
  348. if s[0] != s[len(s)-1] {
  349. return false;
  350. }
  351. if s[0] != '"' || s[len(s)-1] != '"' {
  352. if spec == Specification.JSON5 {
  353. if s[0] != '\'' || s[len(s)-1] != '\'' {
  354. return false;
  355. }
  356. } else {
  357. return false;
  358. }
  359. }
  360. s = s[1 : len(s)-1];
  361. i := 0;
  362. for i < len(s) {
  363. c := s[i];
  364. switch {
  365. case c == '\\':
  366. i += 1;
  367. if i >= len(s) {
  368. return false;
  369. }
  370. switch s[i] {
  371. case '"', '\'', '\\', '/', 'b', 'n', 'r', 't', 'f':
  372. i += 1;
  373. case 'u':
  374. if i >= len(s) {
  375. return false;
  376. }
  377. hex := s[i+1:];
  378. if len(hex) < 4 {
  379. return false;
  380. }
  381. hex = hex[:4];
  382. i += 5;
  383. for j := 0; j < 4; j += 1 {
  384. c2 := hex[j];
  385. switch c2 {
  386. case '0'..'9', 'a'..'z', 'A'..'Z':
  387. // Okay
  388. case:
  389. return false;
  390. }
  391. }
  392. case: return false;
  393. }
  394. case c == quote, c < ' ':
  395. return false;
  396. case c < utf8.RUNE_SELF:
  397. i += 1;
  398. case:
  399. r, width := utf8.decode_rune_in_string(s[i:]);
  400. if r == utf8.RUNE_ERROR && width == 1 {
  401. return false;
  402. }
  403. i += width;
  404. }
  405. }
  406. if i == len(s) {
  407. return true;
  408. }
  409. return true;
  410. }