// tb_parser.cpp
  1. // ================================================================================
  2. // == This file is a part of Turbo Badger. (C) 2011-2014, Emil Segerås ==
  3. // == See tb_core.h for more information. ==
  4. // ================================================================================
  5. #include "parser/tb_parser.h"
  6. #include "tb_tempbuffer.h"
  7. #include "utf8/utf8.h"
  8. #include <assert.h>
  9. #include <ctype.h>
  10. namespace tb {
  11. // == Util functions ====================================================================
  /** Return true if c is an ASCII hexadecimal digit (0-9, a-f or A-F). */
  static bool is_hex(char c)
  {
      if (c >= '0' && c <= '9')
          return true;
      if (c >= 'a' && c <= 'f')
          return true;
      return c >= 'A' && c <= 'F';
  }
  16. static uint32 parse_hex(char *&src, int max_count)
  17. {
  18. uint32 hex = 0;
  19. for (int i = 0; i < max_count; i++)
  20. {
  21. char c = *src;
  22. if (!is_hex(c))
  23. break;
  24. hex <<= 4;
  25. hex |= isdigit(c) ? c - '0' : tolower(c) - 'a' + 10;
  26. src++;
  27. }
  28. return hex;
  29. }
  30. void UnescapeString(char *str)
  31. {
  32. // fast forward to any escape sequence
  33. while (*str && *str != '\\')
  34. str++;
  35. char *dst = str, *src = str;
  36. while (*src)
  37. {
  38. if (*src == '\\')
  39. {
  40. bool code_found = true;
  41. switch (src[1])
  42. {
  43. case 'a': *dst = '\a'; break;
  44. case 'b': *dst = '\b'; break;
  45. case 'f': *dst = '\f'; break;
  46. case 'n': *dst = '\n'; break;
  47. case 'r': *dst = '\r'; break;
  48. case 't': *dst = '\t'; break;
  49. case 'v': *dst = '\v'; break;
  50. case '0': *dst = '\0'; break;
  51. case '\"': *dst = '\"'; break;
  52. case '\'': *dst = '\''; break;
  53. case '\\': *dst = '\\'; break;
  54. case 'x': // \xXX
  55. case 'u': // \uXXXX
  56. {
  57. // This should be safe. A utf-8 character can be at most 4 bytes,
  58. // and we have 4 bytes to use for \xXX and 6 for \uXXXX.
  59. src += 2;
  60. if (UCS4 hex = parse_hex(src, src[1] == 'x' ? 2 : 4))
  61. dst += utf8::encode(hex, dst);
  62. continue;
  63. }
  64. default:
  65. code_found = false;
  66. }
  67. if (code_found)
  68. {
  69. src += 2;
  70. dst++;
  71. continue;
  72. }
  73. }
  74. *dst = *src;
  75. dst++;
  76. src++;
  77. }
  78. *dst = 0;
  79. }
  /** Return true if the first character of str is a space or a tab. */
  bool is_white_space(const char *str)
  {
      const char first = *str;
      return first == ' ' || first == '\t';
  }
  91. /** Return true if the given string starts with a color.
  92. Ex: #ffdd00, #fd0 */
  93. bool is_start_of_color(const char *str)
  94. {
  95. if (*str++ != '#')
  96. return false;
  97. int digit_count = 0;
  98. while (is_hex(*str))
  99. {
  100. str++;
  101. digit_count++;
  102. }
  103. return digit_count == 8 || digit_count == 6 || digit_count == 4 || digit_count == 3;
  104. }
  /** Return true if the given string may be a node reference, such as
      language strings or TBNodeRefTree references.
      The string must start with '@', and the token that follows (up to the
      first space or the end of the string) must not contain a colon. */
  bool is_start_of_reference(const char *str)
  {
      if (str[0] != '@')
          return false;

      for (const char *p = str + 1; *p != 0 && *p != ' '; p++)
      {
          // A colon in the token means it is a key, not a value.
          if (*p == ':')
              return false;
      }
      return true;
  }
  120. /** Check if the line is a comment or empty space. If it is, consume the leading
  121. whitespace from line. */
  122. bool is_space_or_comment(char *&line)
  123. {
  124. char *tmp = line;
  125. while (is_white_space(tmp))
  126. tmp++;
  127. if (*tmp == '#' || *tmp == 0)
  128. {
  129. line = tmp;
  130. return true;
  131. }
  132. return false;
  133. }
  134. bool is_pending_multiline(const char *str)
  135. {
  136. while (is_white_space(str))
  137. str++;
  138. return str[0] == '\\' && str[1] == 0;
  139. }
  /** Return true if buf points at an unescaped quote character of the given type.
      buf_start is the beginning of the buffer and limits how far back the search
      for backslashes goes. The quote counts as escaped when it is directly
      preceded by an odd number of backslashes. */
  bool IsEndQuote(const char *buf_start, const char *buf, const char quote_type)
  {
      if (*buf != quote_type)
          return false;

      // Count the unbroken run of backslashes right before the quote.
      int backslash_run = 0;
      for (const char *p = buf; p > buf_start && p[-1] == '\\'; --p)
          backslash_run++;

      // An even run means every backslash pairs up with another one.
      return (backslash_run % 2) == 0;
  }
  149. // == Parser ============================================================================
  150. TBParser::STATUS TBParser::Read(TBParserStream *stream, TBParserTarget *target)
  151. {
  152. TBTempBuffer line, work;
  153. if (!line.Reserve(1024) || !work.Reserve(1024))
  154. return STATUS_OUT_OF_MEMORY;
  155. current_indent = 0;
  156. current_line_nr = 1;
  157. pending_multiline = false;
  158. multi_line_sub_level = 0;
  159. while (int read_len = stream->GetMoreData((char *)work.GetData(), work.GetCapacity()))
  160. {
  161. char *buf = work.GetData();
  162. // Skip BOM (BYTE ORDER MARK) character, often in the beginning of UTF-8 documents.
  163. if (current_line_nr == 1 && read_len > 3 &&
  164. (uint8)buf[0] == 239 &&
  165. (uint8)buf[1] == 187 &&
  166. (uint8)buf[2] == 191)
  167. {
  168. read_len -= 3;
  169. buf += 3;
  170. }
  171. int line_pos = 0;
  172. while (true)
  173. {
  174. // Find line end
  175. int line_start = line_pos;
  176. while (line_pos < read_len && buf[line_pos] != '\n')
  177. line_pos++;
  178. if (line_pos < read_len)
  179. {
  180. // We have a line
  181. // Skip preceding \r (if we have one)
  182. int line_len = line_pos - line_start;
  183. if (!line.Append(buf + line_start, line_len))
  184. return STATUS_OUT_OF_MEMORY;
  185. // Strip away trailing '\r' if the line has it
  186. char *linebuf = line.GetData();
  187. int linebuf_len = line.GetAppendPos();
  188. if (linebuf_len > 0 && linebuf[linebuf_len - 1] == '\r')
  189. linebuf[linebuf_len - 1] = 0;
  190. // Terminate the line string
  191. if (!line.Append("", 1))
  192. return STATUS_OUT_OF_MEMORY;
  193. // Handle line
  194. OnLine(line.GetData(), target);
  195. current_line_nr++;
  196. line.ResetAppendPos();
  197. line_pos++; // Skip this \n
  198. // Find next line
  199. continue;
  200. }
  201. // No more lines here so push the rest and break for more data
  202. if (!line.Append(buf + line_start, read_len - line_start))
  203. return STATUS_OUT_OF_MEMORY;
  204. break;
  205. }
  206. }
  207. if (line.GetAppendPos())
  208. {
  209. if (!line.Append("", 1))
  210. return STATUS_OUT_OF_MEMORY;
  211. OnLine(line.GetData(), target);
  212. current_line_nr++;
  213. }
  214. return STATUS_OK;
  215. }
  216. void TBParser::OnLine(char *line, TBParserTarget *target)
  217. {
  218. if (is_space_or_comment(line))
  219. {
  220. if (*line == '#')
  221. target->OnComment(current_line_nr, line + 1);
  222. return;
  223. }
  224. if (pending_multiline)
  225. {
  226. OnMultiline(line, target);
  227. return;
  228. }
  229. // Check indent
  230. int indent = 0;
  231. while (line[indent] == '\t' && line[indent] != 0)
  232. indent++;
  233. line += indent;
  234. if (indent - current_indent > 1)
  235. {
  236. target->OnError(current_line_nr, "Indentation error. (Line skipped)");
  237. return;
  238. }
  239. if (indent > current_indent)
  240. {
  241. // FIX: Report indentation error if more than 1 higher!
  242. assert(indent - current_indent == 1);
  243. target->Enter();
  244. current_indent++;
  245. }
  246. else if (indent < current_indent)
  247. {
  248. while (indent < current_indent)
  249. {
  250. target->Leave();
  251. current_indent--;
  252. }
  253. }
  254. if (*line == 0)
  255. return;
  256. else
  257. {
  258. char *token = line;
  259. // Read line while consuming it and copy over to token buf
  260. while (!is_white_space(line) && *line != 0)
  261. line++;
  262. int token_len = line - token;
  263. // Consume any white space after the token
  264. while (is_white_space(line))
  265. line++;
  266. bool is_compact_line = token_len && token[token_len - 1] == ':';
  267. TBValue value;
  268. if (is_compact_line)
  269. {
  270. token_len--;
  271. token[token_len] = 0;
  272. // Check if the first argument is not a child but the value for this token
  273. if (*line == '[' || *line == '\"' || *line == '\'' ||
  274. is_start_of_number(line) ||
  275. is_start_of_color(line) ||
  276. is_start_of_reference(line))
  277. {
  278. ConsumeValue(value, line);
  279. if (pending_multiline)
  280. {
  281. // The value wrapped to the next line, so we should remember the token and continue.
  282. multi_line_token.Set(token);
  283. return;
  284. }
  285. }
  286. }
  287. else if (token[token_len])
  288. {
  289. token[token_len] = 0;
  290. UnescapeString(line);
  291. value.SetFromStringAuto(line, TBValue::SET_AS_STATIC);
  292. }
  293. target->OnToken(current_line_nr, token, value);
  294. if (is_compact_line)
  295. OnCompactLine(line, target);
  296. }
  297. }
  298. void TBParser::OnCompactLine(char *line, TBParserTarget *target)
  299. {
  300. target->Enter();
  301. while (*line)
  302. {
  303. // consume any whitespace
  304. while (is_white_space(line))
  305. line++;
  306. // Find token
  307. char *token = line;
  308. while (*line != ':' && *line != 0)
  309. line++;
  310. if (!*line)
  311. break; // Syntax error, expected token
  312. *line++ = 0;
  313. // consume any whitespace
  314. while (is_white_space(line))
  315. line++;
  316. TBValue v;
  317. ConsumeValue(v, line);
  318. if (pending_multiline)
  319. {
  320. // The value wrapped to the next line, so we should remember the token and continue.
  321. multi_line_token.Set(token);
  322. // Since we need to call target->Leave when the multiline is ready, set multi_line_sub_level.
  323. multi_line_sub_level = 1;
  324. return;
  325. }
  326. // Ready
  327. target->OnToken(current_line_nr, token, v);
  328. }
  329. target->Leave();
  330. }
  331. void TBParser::OnMultiline(char *line, TBParserTarget *target)
  332. {
  333. // consume any whitespace
  334. while (is_white_space(line))
  335. line++;
  336. TBValue value;
  337. ConsumeValue(value, line);
  338. if (!pending_multiline)
  339. {
  340. // Ready with all lines
  341. value.SetString(multi_line_value.GetData(), TBValue::SET_AS_STATIC);
  342. target->OnToken(current_line_nr, multi_line_token, value);
  343. if (multi_line_sub_level)
  344. target->Leave();
  345. // Reset
  346. multi_line_value.SetAppendPos(0);
  347. multi_line_sub_level = 0;
  348. }
  349. }
  350. void TBParser::ConsumeValue(TBValue &dst_value, char *&line)
  351. {
  352. // Find value (As quoted string, or as auto)
  353. char *value = line;
  354. if (*line == '\"' || *line == '\'')
  355. {
  356. const char quote_type = *line;
  357. // Consume starting quote
  358. line++;
  359. value++;
  360. // Find ending quote or end
  361. while (!IsEndQuote(value, line, quote_type) && *line != 0)
  362. line++;
  363. // Terminate away the quote
  364. if (*line == quote_type)
  365. *line++ = 0;
  366. // consume any whitespace
  367. while (is_white_space(line))
  368. line++;
  369. // consume any comma
  370. if (*line == ',')
  371. line++;
  372. UnescapeString(value);
  373. dst_value.SetString(value, TBValue::SET_AS_STATIC);
  374. }
  375. else
  376. {
  377. // Find next comma or end
  378. while (*line != ',' && *line != 0)
  379. line++;
  380. // Terminate away the comma
  381. if (*line == ',')
  382. *line++ = 0;
  383. UnescapeString(value);
  384. dst_value.SetFromStringAuto(value, TBValue::SET_AS_STATIC);
  385. }
  386. // Check if we still have pending value data on the following line and set pending_multiline.
  387. bool continuing_multiline = pending_multiline;
  388. pending_multiline = is_pending_multiline(line);
  389. // Append the multi line value to the buffer.
  390. if (continuing_multiline || pending_multiline)
  391. multi_line_value.AppendString(dst_value.GetString());
  392. }
  393. }; // namespace tb