2
0

gdscript_tokenizer_buffer.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491
  1. /**************************************************************************/
  2. /* gdscript_tokenizer_buffer.cpp */
  3. /**************************************************************************/
  4. /* This file is part of: */
  5. /* GODOT ENGINE */
  6. /* https://godotengine.org */
  7. /**************************************************************************/
  8. /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
  9. /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
  10. /* */
  11. /* Permission is hereby granted, free of charge, to any person obtaining */
  12. /* a copy of this software and associated documentation files (the */
  13. /* "Software"), to deal in the Software without restriction, including */
  14. /* without limitation the rights to use, copy, modify, merge, publish, */
  15. /* distribute, sublicense, and/or sell copies of the Software, and to */
  16. /* permit persons to whom the Software is furnished to do so, subject to */
  17. /* the following conditions: */
  18. /* */
  19. /* The above copyright notice and this permission notice shall be */
  20. /* included in all copies or substantial portions of the Software. */
  21. /* */
  22. /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
  23. /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
  24. /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
  25. /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
  26. /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
  27. /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
  28. /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
  29. /**************************************************************************/
  30. #include "gdscript_tokenizer_buffer.h"
  31. #include "core/io/compression.h"
  32. #include "core/io/marshalls.h"
// Serializes a single token into r_buffer starting at p_start.
// Identifiers/annotations and literals/errors are interned: the token word
// stores an index (above TOKEN_BITS) into r_identifiers_map or
// r_constants_map. Slots are assigned in first-seen order so the maps can
// later be flattened into the arrays written in the file header.
// Returns the number of bytes written (5 for the compact form, 8 for the
// wide form); the caller advances its write position by this amount.
int GDScriptTokenizerBuffer::_token_to_binary(const Token &p_token, Vector<uint8_t> &r_buffer, int p_start, HashMap<StringName, uint32_t> &r_identifiers_map, HashMap<Variant, uint32_t, VariantHasher, VariantComparator> &r_constants_map) {
	int pos = p_start;
	// Low bits hold the token type; an interned index may be OR'd in above TOKEN_BITS.
	int token_type = p_token.type & TOKEN_MASK;

	switch (p_token.type) {
		case GDScriptTokenizer::Token::ANNOTATION:
		case GDScriptTokenizer::Token::IDENTIFIER: {
			// Add identifier to map.
			int identifier_pos;
			StringName id = p_token.get_identifier();
			if (r_identifiers_map.has(id)) {
				identifier_pos = r_identifiers_map[id];
			} else {
				// First occurrence: assign the next sequential slot.
				identifier_pos = r_identifiers_map.size();
				r_identifiers_map[id] = identifier_pos;
			}
			token_type |= identifier_pos << TOKEN_BITS;
		} break;
		case GDScriptTokenizer::Token::ERROR:
		case GDScriptTokenizer::Token::LITERAL: {
			// Add literal to map.
			int constant_pos;
			if (r_constants_map.has(p_token.literal)) {
				constant_pos = r_constants_map[p_token.literal];
			} else {
				constant_pos = r_constants_map.size();
				r_constants_map[p_token.literal] = constant_pos;
			}
			token_type |= constant_pos << TOKEN_BITS;
		} break;
		default:
			break;
	}

	// Encode token.
	// NOTE(review): this condition is true for any token whose masked type
	// bits are non-zero, making the wide 8-byte form the overwhelmingly
	// common case; if the intent was "index payload present" it would read
	// `token_type & ~TOKEN_MASK`. The decoder honors either form via
	// TOKEN_BYTE_MASK, so this only affects size — verify against the
	// format version before changing.
	int token_len;
	if (token_type & TOKEN_MASK) {
		// Wide form: 4-byte token word (flagged with TOKEN_BYTE_MASK) + 4-byte line.
		token_len = 8;
		r_buffer.resize(pos + token_len);
		encode_uint32(token_type | TOKEN_BYTE_MASK, &r_buffer.write[pos]);
		pos += 4;
	} else {
		// Compact form: 1-byte token + 4-byte line.
		token_len = 5;
		r_buffer.resize(pos + token_len);
		r_buffer.write[pos] = token_type;
		pos++;
	}
	encode_uint32(p_token.start_line, &r_buffer.write[pos]);
	return token_len;
}
  81. GDScriptTokenizer::Token GDScriptTokenizerBuffer::_binary_to_token(const uint8_t *p_buffer) {
  82. Token token;
  83. const uint8_t *b = p_buffer;
  84. uint32_t token_type = decode_uint32(b);
  85. token.type = (Token::Type)(token_type & TOKEN_MASK);
  86. if (token_type & TOKEN_BYTE_MASK) {
  87. b += 4;
  88. } else {
  89. b++;
  90. }
  91. token.start_line = decode_uint32(b);
  92. token.end_line = token.start_line;
  93. token.literal = token.get_name();
  94. if (token.type == Token::CONST_NAN) {
  95. token.literal = String("NAN"); // Special case since name and notation are different.
  96. }
  97. switch (token.type) {
  98. case GDScriptTokenizer::Token::ANNOTATION:
  99. case GDScriptTokenizer::Token::IDENTIFIER: {
  100. // Get name from map.
  101. int identifier_pos = token_type >> TOKEN_BITS;
  102. if (unlikely(identifier_pos >= identifiers.size())) {
  103. Token error;
  104. error.type = Token::ERROR;
  105. error.literal = "Identifier index out of bounds.";
  106. return error;
  107. }
  108. token.literal = identifiers[identifier_pos];
  109. } break;
  110. case GDScriptTokenizer::Token::ERROR:
  111. case GDScriptTokenizer::Token::LITERAL: {
  112. // Get literal from map.
  113. int constant_pos = token_type >> TOKEN_BITS;
  114. if (unlikely(constant_pos >= constants.size())) {
  115. Token error;
  116. error.type = Token::ERROR;
  117. error.literal = "Constant index out of bounds.";
  118. return error;
  119. }
  120. token.literal = constants[constant_pos];
  121. } break;
  122. default:
  123. break;
  124. }
  125. return token;
  126. }
  127. Error GDScriptTokenizerBuffer::set_code_buffer(const Vector<uint8_t> &p_buffer) {
  128. const uint8_t *buf = p_buffer.ptr();
  129. ERR_FAIL_COND_V(p_buffer.size() < 12 || p_buffer[0] != 'G' || p_buffer[1] != 'D' || p_buffer[2] != 'S' || p_buffer[3] != 'C', ERR_INVALID_DATA);
  130. int version = decode_uint32(&buf[4]);
  131. ERR_FAIL_COND_V_MSG(version != TOKENIZER_VERSION, ERR_INVALID_DATA, "Binary GDScript is not compatible with this engine version.");
  132. int decompressed_size = decode_uint32(&buf[8]);
  133. Vector<uint8_t> contents;
  134. if (decompressed_size == 0) {
  135. contents = p_buffer.slice(12);
  136. } else {
  137. contents.resize(decompressed_size);
  138. const int64_t result = Compression::decompress(contents.ptrw(), contents.size(), &buf[12], p_buffer.size() - 12, Compression::MODE_ZSTD);
  139. ERR_FAIL_COND_V_MSG(result != decompressed_size, ERR_INVALID_DATA, "Error decompressing GDScript tokenizer buffer.");
  140. }
  141. int total_len = contents.size();
  142. buf = contents.ptr();
  143. uint32_t identifier_count = decode_uint32(&buf[0]);
  144. uint32_t constant_count = decode_uint32(&buf[4]);
  145. uint32_t token_line_count = decode_uint32(&buf[8]);
  146. uint32_t token_count = decode_uint32(&buf[12]);
  147. const uint8_t *b = &buf[16];
  148. total_len -= 16;
  149. identifiers.resize(identifier_count);
  150. for (uint32_t i = 0; i < identifier_count; i++) {
  151. uint32_t len = decode_uint32(b);
  152. total_len -= 4;
  153. ERR_FAIL_COND_V((len * 4u) > (uint32_t)total_len, ERR_INVALID_DATA);
  154. b += 4;
  155. Vector<uint32_t> cs;
  156. cs.resize(len);
  157. for (uint32_t j = 0; j < len; j++) {
  158. uint8_t tmp[4];
  159. for (uint32_t k = 0; k < 4; k++) {
  160. tmp[k] = b[j * 4 + k] ^ 0xb6;
  161. }
  162. cs.write[j] = decode_uint32(tmp);
  163. }
  164. String s = String::utf32(Span(reinterpret_cast<const char32_t *>(cs.ptr()), len));
  165. b += len * 4;
  166. total_len -= len * 4;
  167. identifiers.write[i] = s;
  168. }
  169. constants.resize(constant_count);
  170. for (uint32_t i = 0; i < constant_count; i++) {
  171. Variant v;
  172. int len;
  173. Error err = decode_variant(v, b, total_len, &len, false);
  174. if (err) {
  175. return err;
  176. }
  177. b += len;
  178. total_len -= len;
  179. constants.write[i] = v;
  180. }
  181. for (uint32_t i = 0; i < token_line_count; i++) {
  182. ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
  183. uint32_t token_index = decode_uint32(b);
  184. b += 4;
  185. uint32_t line = decode_uint32(b);
  186. b += 4;
  187. total_len -= 8;
  188. token_lines[token_index] = line;
  189. }
  190. for (uint32_t i = 0; i < token_line_count; i++) {
  191. ERR_FAIL_COND_V(total_len < 8, ERR_INVALID_DATA);
  192. uint32_t token_index = decode_uint32(b);
  193. b += 4;
  194. uint32_t column = decode_uint32(b);
  195. b += 4;
  196. total_len -= 8;
  197. token_columns[token_index] = column;
  198. }
  199. tokens.resize(token_count);
  200. for (uint32_t i = 0; i < token_count; i++) {
  201. int token_len = 5;
  202. if ((*b) & TOKEN_BYTE_MASK) {
  203. token_len = 8;
  204. }
  205. ERR_FAIL_COND_V(total_len < token_len, ERR_INVALID_DATA);
  206. Token token = _binary_to_token(b);
  207. b += token_len;
  208. ERR_FAIL_INDEX_V(token.type, Token::TK_MAX, ERR_INVALID_DATA);
  209. tokens.write[i] = token;
  210. total_len -= token_len;
  211. }
  212. ERR_FAIL_COND_V(total_len > 0, ERR_INVALID_DATA);
  213. return OK;
  214. }
// Tokenizes p_code with GDScriptTokenizerText and serializes the resulting
// token stream into the binary 'GDSC' format (optionally Zstandard
// compressed) that set_code_buffer() can load back.
// Written layout: 12-byte header (magic + version + decompressed size),
// then counts, identifier table, constant table, line/column maps, tokens.
// Returns the encoded buffer, or an empty Vector on failure.
Vector<uint8_t> GDScriptTokenizerBuffer::parse_code_string(const String &p_code, CompressMode p_compress_mode) {
	HashMap<StringName, uint32_t> identifier_map;
	HashMap<Variant, uint32_t, VariantHasher, VariantComparator> constant_map;
	Vector<uint8_t> token_buffer;
	HashMap<uint32_t, uint32_t> token_lines;
	HashMap<uint32_t, uint32_t> token_columns;

	GDScriptTokenizerText tokenizer;
	tokenizer.set_source_code(p_code);
	tokenizer.set_multiline_mode(true); // Ignore whitespace tokens.

	Token current = tokenizer.scan();
	int token_pos = 0;
	int last_token_line = 0;
	int token_counter = 0;
	while (current.type != Token::TK_EOF) {
		int token_len = _token_to_binary(current, token_buffer, token_pos, identifier_map, constant_map);
		token_pos += token_len;
		// Record line/column only for tokens that begin a new source line;
		// scan() later reconstructs NEWLINE/INDENT/DEDENT from these entries.
		if (token_counter > 0 && current.start_line > last_token_line) {
			token_lines[token_counter] = current.start_line;
			token_columns[token_counter] = current.start_column;
		}
		last_token_line = current.end_line;

		current = tokenizer.scan();
		token_counter++;
	}

	// Reverse maps.
	// Flatten the interning maps into arrays ordered by the slots assigned
	// in _token_to_binary, so decoded indices resolve correctly.
	Vector<StringName> rev_identifier_map;
	rev_identifier_map.resize(identifier_map.size());
	for (const KeyValue<StringName, uint32_t> &E : identifier_map) {
		rev_identifier_map.write[E.value] = E.key;
	}
	Vector<Variant> rev_constant_map;
	rev_constant_map.resize(constant_map.size());
	for (const KeyValue<Variant, uint32_t> &E : constant_map) {
		rev_constant_map.write[E.value] = E.key;
	}
	// line -> token index, used to find entries to drop below.
	HashMap<uint32_t, uint32_t> rev_token_lines;
	for (const KeyValue<uint32_t, uint32_t> &E : token_lines) {
		rev_token_lines[E.value] = E.key;
	}

	// Remove continuation lines from map.
	// Lines that only continue a wrapped statement must not trigger
	// NEWLINE/indent synthesis when the buffer is scanned.
	for (int line : tokenizer.get_continuation_lines()) {
		if (rev_token_lines.has(line)) {
			token_lines.erase(rev_token_lines[line]);
			token_columns.erase(rev_token_lines[line]);
		}
	}

	Vector<uint8_t> contents;
	contents.resize(16);
	encode_uint32(identifier_map.size(), &contents.write[0]);
	encode_uint32(constant_map.size(), &contents.write[4]);
	encode_uint32(token_lines.size(), &contents.write[8]);
	encode_uint32(token_counter, &contents.write[12]);

	int buf_pos = 16;

	// Save identifiers.
	// Each identifier: uint32 length + UTF-32 code units, every byte XOR'd
	// with 0xb6 as light obfuscation (undone by set_code_buffer()).
	for (const StringName &id : rev_identifier_map) {
		String s = id.operator String();
		int len = s.length();

		contents.resize(buf_pos + (len + 1) * 4);

		encode_uint32(len, &contents.write[buf_pos]);
		buf_pos += 4;

		for (int i = 0; i < len; i++) {
			uint8_t tmp[4];
			encode_uint32(s[i], tmp);

			for (int b = 0; b < 4; b++) {
				contents.write[buf_pos + b] = tmp[b] ^ 0xb6;
			}

			buf_pos += 4;
		}
	}

	// Save constants.
	for (const Variant &v : rev_constant_map) {
		int len;
		// Objects cannot be constant, never encode objects.
		// First call sizes the Variant, second call writes it.
		Error err = encode_variant(v, nullptr, len, false);
		ERR_FAIL_COND_V_MSG(err != OK, Vector<uint8_t>(), "Error when trying to encode Variant.");
		contents.resize(buf_pos + len);
		encode_variant(v, &contents.write[buf_pos], len, false);
		buf_pos += len;
	}

	// Save lines and columns.
	// 16 bytes per line entry: 8 for the (index, line) pair plus 8 for the
	// matching (index, column) pair — both maps hold the same keys.
	contents.resize(buf_pos + token_lines.size() * 16);
	for (const KeyValue<uint32_t, uint32_t> &e : token_lines) {
		encode_uint32(e.key, &contents.write[buf_pos]);
		buf_pos += 4;
		encode_uint32(e.value, &contents.write[buf_pos]);
		buf_pos += 4;
	}
	for (const KeyValue<uint32_t, uint32_t> &e : token_columns) {
		encode_uint32(e.key, &contents.write[buf_pos]);
		buf_pos += 4;
		encode_uint32(e.value, &contents.write[buf_pos]);
		buf_pos += 4;
	}

	// Store tokens.
	contents.append_array(token_buffer);

	Vector<uint8_t> buf;

	// Save header.
	buf.resize(12);
	buf.write[0] = 'G';
	buf.write[1] = 'D';
	buf.write[2] = 'S';
	buf.write[3] = 'C';
	encode_uint32(TOKENIZER_VERSION, &buf.write[4]);

	switch (p_compress_mode) {
		case COMPRESS_NONE:
			// Size field of 0 signals an uncompressed payload to the loader.
			encode_uint32(0u, &buf.write[8]);
			buf.append_array(contents);
			break;

		case COMPRESS_ZSTD: {
			encode_uint32(contents.size(), &buf.write[8]);
			Vector<uint8_t> compressed;
			const int64_t max_size = Compression::get_max_compressed_buffer_size(contents.size(), Compression::MODE_ZSTD);
			compressed.resize(max_size);

			const int64_t compressed_size = Compression::compress(compressed.ptrw(), contents.ptr(), contents.size(), Compression::MODE_ZSTD);
			ERR_FAIL_COND_V_MSG(compressed_size < 0, Vector<uint8_t>(), "Error compressing GDScript tokenizer buffer.");
			compressed.resize(compressed_size);

			buf.append_array(compressed);
		} break;
	}

	return buf;
}
// Buffer tokenizers keep no completion-cursor state; always returns 0.
int GDScriptTokenizerBuffer::get_cursor_line() const {
	return 0;
}
// Buffer tokenizers keep no completion-cursor state; always returns 0.
int GDScriptTokenizerBuffer::get_cursor_column() const {
	return 0;
}
// No-op: cursor positions are not tracked when scanning a pre-tokenized buffer.
void GDScriptTokenizerBuffer::set_cursor_position(int p_line, int p_column) {
}
// Toggles multiline mode; while enabled, scan() skips the NEWLINE/INDENT/
// DEDENT synthesis (see the !multiline_mode branch there).
void GDScriptTokenizerBuffer::set_multiline_mode(bool p_state) {
	multiline_mode = p_state;
}
// Always false: there is no cursor to be past in buffer mode.
bool GDScriptTokenizerBuffer::is_past_cursor() const {
	return false;
}
// Saves the current indentation stack; restored by the matching
// pop_expression_indented_block() call.
void GDScriptTokenizerBuffer::push_expression_indented_block() {
	indent_stack_stack.push_back(indent_stack);
}
// Restores the indentation stack saved by the matching
// push_expression_indented_block() call. Errors out (leaving state
// untouched) if there is nothing to pop.
void GDScriptTokenizerBuffer::pop_expression_indented_block() {
	ERR_FAIL_COND(indent_stack_stack.is_empty());
	indent_stack = indent_stack_stack.back()->get();
	indent_stack_stack.pop_back();
}
// Returns the next token from the pre-parsed buffer, synthesizing NEWLINE,
// INDENT and DEDENT tokens from the stored token_lines/token_columns maps
// (the binary format does not store those tokens explicitly).
GDScriptTokenizer::Token GDScriptTokenizerBuffer::scan() {
	// Add final newline.
	if (current >= tokens.size() && !last_token_was_newline) {
		Token newline;
		newline.type = Token::NEWLINE;
		newline.start_line = current_line;
		newline.end_line = current_line;
		last_token_was_newline = true;
		return newline;
	}

	// Resolve pending indentation change.
	// One INDENT/DEDENT token is emitted per call until the counter is zero.
	if (pending_indents > 0) {
		pending_indents--;
		Token indent;
		indent.type = Token::INDENT;
		indent.start_line = current_line;
		indent.end_line = current_line;
		return indent;
	} else if (pending_indents < 0) {
		pending_indents++;
		Token dedent;
		dedent.type = Token::DEDENT;
		dedent.start_line = current_line;
		dedent.end_line = current_line;
		return dedent;
	}

	if (current >= tokens.size()) {
		if (!indent_stack.is_empty()) {
			// Close any still-open blocks before EOF; the recursive call
			// emits the queued DEDENTs one at a time.
			pending_indents -= indent_stack.size();
			indent_stack.clear();
			return scan();
		}
		Token eof;
		eof.type = Token::TK_EOF;
		return eof;
	};

	if (!last_token_was_newline && token_lines.has(current)) {
		// This token starts a new source line: update the current line and
		// derive indentation changes from its stored (1-based) column.
		current_line = token_lines[current];
		uint32_t current_column = token_columns[current];

		// Check if there's a need to indent/dedent.
		if (!multiline_mode) {
			uint32_t previous_indent = 0;
			if (!indent_stack.is_empty()) {
				previous_indent = indent_stack.back()->get();
			}
			if (current_column - 1 > previous_indent) {
				pending_indents++;
				indent_stack.push_back(current_column - 1);
			} else {
				// Pop levels until the stored indentation matches (or the
				// stack empties), queueing one DEDENT per popped level.
				while (current_column - 1 < previous_indent) {
					pending_indents--;
					indent_stack.pop_back();
					if (indent_stack.is_empty()) {
						break;
					}
					previous_indent = indent_stack.back()->get();
				}
			}
			// Emit the NEWLINE now; queued INDENT/DEDENTs follow on the
			// next calls via the pending_indents branch above.
			Token newline;
			newline.type = Token::NEWLINE;
			newline.start_line = current_line;
			newline.end_line = current_line;
			last_token_was_newline = true;
			return newline;
		}
	}

	last_token_was_newline = false;

	Token token = tokens[current++];
	return token;
}