2
0

gdscript_tokenizer.cpp 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555
  1. /**************************************************************************/
  2. /* gdscript_tokenizer.cpp */
  3. /**************************************************************************/
  4. /* This file is part of: */
  5. /* GODOT ENGINE */
  6. /* https://godotengine.org */
  7. /**************************************************************************/
  8. /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
  9. /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
  10. /* */
  11. /* Permission is hereby granted, free of charge, to any person obtaining */
  12. /* a copy of this software and associated documentation files (the */
  13. /* "Software"), to deal in the Software without restriction, including */
  14. /* without limitation the rights to use, copy, modify, merge, publish, */
  15. /* distribute, sublicense, and/or sell copies of the Software, and to */
  16. /* permit persons to whom the Software is furnished to do so, subject to */
  17. /* the following conditions: */
  18. /* */
  19. /* The above copyright notice and this permission notice shall be */
  20. /* included in all copies or substantial portions of the Software. */
  21. /* */
  22. /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
  23. /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
  24. /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
  25. /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
  26. /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
  27. /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
  28. /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
  29. /**************************************************************************/
  30. #include "gdscript_tokenizer.h"
  31. #include "core/error/error_macros.h"
  32. #include "core/string/char_utils.h"
  33. #ifdef TOOLS_ENABLED
  34. #include "editor/editor_settings.h"
  35. #endif
  36. #ifdef DEBUG_ENABLED
  37. #include "servers/text_server.h"
  38. #endif
  39. static const char *token_names[] = {
  40. "Empty", // EMPTY,
  41. // Basic
  42. "Annotation", // ANNOTATION
  43. "Identifier", // IDENTIFIER,
  44. "Literal", // LITERAL,
  45. // Comparison
  46. "<", // LESS,
  47. "<=", // LESS_EQUAL,
  48. ">", // GREATER,
  49. ">=", // GREATER_EQUAL,
  50. "==", // EQUAL_EQUAL,
  51. "!=", // BANG_EQUAL,
  52. // Logical
  53. "and", // AND,
  54. "or", // OR,
  55. "not", // NOT,
  56. "&&", // AMPERSAND_AMPERSAND,
  57. "||", // PIPE_PIPE,
  58. "!", // BANG,
  59. // Bitwise
  60. "&", // AMPERSAND,
  61. "|", // PIPE,
  62. "~", // TILDE,
  63. "^", // CARET,
  64. "<<", // LESS_LESS,
  65. ">>", // GREATER_GREATER,
  66. // Math
  67. "+", // PLUS,
  68. "-", // MINUS,
  69. "*", // STAR,
  70. "**", // STAR_STAR,
  71. "/", // SLASH,
  72. "%", // PERCENT,
  73. // Assignment
  74. "=", // EQUAL,
  75. "+=", // PLUS_EQUAL,
  76. "-=", // MINUS_EQUAL,
  77. "*=", // STAR_EQUAL,
  78. "**=", // STAR_STAR_EQUAL,
  79. "/=", // SLASH_EQUAL,
  80. "%=", // PERCENT_EQUAL,
  81. "<<=", // LESS_LESS_EQUAL,
  82. ">>=", // GREATER_GREATER_EQUAL,
  83. "&=", // AMPERSAND_EQUAL,
  84. "|=", // PIPE_EQUAL,
  85. "^=", // CARET_EQUAL,
  86. // Control flow
  87. "if", // IF,
  88. "elif", // ELIF,
  89. "else", // ELSE,
  90. "for", // FOR,
  91. "while", // WHILE,
  92. "break", // BREAK,
  93. "continue", // CONTINUE,
  94. "pass", // PASS,
  95. "return", // RETURN,
  96. "match", // MATCH,
  97. // Keywords
  98. "as", // AS,
  99. "assert", // ASSERT,
  100. "await", // AWAIT,
  101. "breakpoint", // BREAKPOINT,
  102. "class", // CLASS,
  103. "class_name", // CLASS_NAME,
  104. "const", // CONST,
  105. "enum", // ENUM,
  106. "extends", // EXTENDS,
  107. "func", // FUNC,
  108. "in", // IN,
  109. "is", // IS,
  110. "namespace", // NAMESPACE
  111. "preload", // PRELOAD,
  112. "self", // SELF,
  113. "signal", // SIGNAL,
  114. "static", // STATIC,
  115. "super", // SUPER,
  116. "trait", // TRAIT,
  117. "var", // VAR,
  118. "void", // VOID,
  119. "yield", // YIELD,
  120. // Punctuation
  121. "[", // BRACKET_OPEN,
  122. "]", // BRACKET_CLOSE,
  123. "{", // BRACE_OPEN,
  124. "}", // BRACE_CLOSE,
  125. "(", // PARENTHESIS_OPEN,
  126. ")", // PARENTHESIS_CLOSE,
  127. ",", // COMMA,
  128. ";", // SEMICOLON,
  129. ".", // PERIOD,
  130. "..", // PERIOD_PERIOD,
  131. ":", // COLON,
  132. "$", // DOLLAR,
  133. "->", // FORWARD_ARROW,
  134. "_", // UNDERSCORE,
  135. // Whitespace
  136. "Newline", // NEWLINE,
  137. "Indent", // INDENT,
  138. "Dedent", // DEDENT,
  139. // Constants
  140. "PI", // CONST_PI,
  141. "TAU", // CONST_TAU,
  142. "INF", // CONST_INF,
  143. "NaN", // CONST_NAN,
  144. // Error message improvement
  145. "VCS conflict marker", // VCS_CONFLICT_MARKER,
  146. "`", // BACKTICK,
  147. "?", // QUESTION_MARK,
  148. // Special
  149. "Error", // ERROR,
  150. "End of file", // EOF,
  151. };
  152. // Avoid desync.
  153. static_assert(sizeof(token_names) / sizeof(token_names[0]) == GDScriptTokenizer::Token::TK_MAX, "Amount of token names don't match the amount of token types.");
  154. const char *GDScriptTokenizer::Token::get_name() const {
  155. ERR_FAIL_INDEX_V_MSG(type, TK_MAX, "<error>", "Using token type out of the enum.");
  156. return token_names[type];
  157. }
  158. bool GDScriptTokenizer::Token::is_identifier() const {
  159. // Note: Most keywords should not be recognized as identifiers.
  160. // These are only exceptions for stuff that already is on the engine's API.
  161. switch (type) {
  162. case IDENTIFIER:
  163. case MATCH: // Used in String.match().
  164. case CONST_INF: // Used in Vector{2,3,4}.INF
  165. return true;
  166. default:
  167. return false;
  168. }
  169. }
  170. bool GDScriptTokenizer::Token::is_node_name() const {
  171. // This is meant to allow keywords with the $ notation, but not as general identifiers.
  172. switch (type) {
  173. case IDENTIFIER:
  174. case AND:
  175. case AS:
  176. case ASSERT:
  177. case AWAIT:
  178. case BREAK:
  179. case BREAKPOINT:
  180. case CLASS_NAME:
  181. case CLASS:
  182. case CONST:
  183. case CONTINUE:
  184. case ELIF:
  185. case ELSE:
  186. case ENUM:
  187. case EXTENDS:
  188. case FOR:
  189. case FUNC:
  190. case IF:
  191. case IN:
  192. case IS:
  193. case MATCH:
  194. case NAMESPACE:
  195. case NOT:
  196. case OR:
  197. case PASS:
  198. case PRELOAD:
  199. case RETURN:
  200. case SELF:
  201. case SIGNAL:
  202. case STATIC:
  203. case SUPER:
  204. case TRAIT:
  205. case UNDERSCORE:
  206. case VAR:
  207. case VOID:
  208. case WHILE:
  209. case YIELD:
  210. return true;
  211. default:
  212. return false;
  213. }
  214. }
  215. String GDScriptTokenizer::get_token_name(Token::Type p_token_type) {
  216. ERR_FAIL_INDEX_V_MSG(p_token_type, Token::TK_MAX, "<error>", "Using token type out of the enum.");
  217. return token_names[p_token_type];
  218. }
  219. void GDScriptTokenizer::set_source_code(const String &p_source_code) {
  220. source = p_source_code;
  221. if (source.is_empty()) {
  222. _source = U"";
  223. } else {
  224. _source = source.ptr();
  225. }
  226. _current = _source;
  227. line = 1;
  228. column = 1;
  229. length = p_source_code.length();
  230. position = 0;
  231. }
  232. void GDScriptTokenizer::set_cursor_position(int p_line, int p_column) {
  233. cursor_line = p_line;
  234. cursor_column = p_column;
  235. }
  236. void GDScriptTokenizer::set_multiline_mode(bool p_state) {
  237. multiline_mode = p_state;
  238. }
  239. void GDScriptTokenizer::push_expression_indented_block() {
  240. indent_stack_stack.push_back(indent_stack);
  241. }
  242. void GDScriptTokenizer::pop_expression_indented_block() {
  243. ERR_FAIL_COND(indent_stack_stack.size() == 0);
  244. indent_stack = indent_stack_stack.back()->get();
  245. indent_stack_stack.pop_back();
  246. }
  247. int GDScriptTokenizer::get_cursor_line() const {
  248. return cursor_line;
  249. }
  250. int GDScriptTokenizer::get_cursor_column() const {
  251. return cursor_column;
  252. }
  253. bool GDScriptTokenizer::is_past_cursor() const {
  254. if (line < cursor_line) {
  255. return false;
  256. }
  257. if (line > cursor_line) {
  258. return true;
  259. }
  260. if (column < cursor_column) {
  261. return false;
  262. }
  263. return true;
  264. }
  265. char32_t GDScriptTokenizer::_advance() {
  266. if (unlikely(_is_at_end())) {
  267. return '\0';
  268. }
  269. _current++;
  270. column++;
  271. position++;
  272. if (column > rightmost_column) {
  273. rightmost_column = column;
  274. }
  275. if (unlikely(_is_at_end())) {
  276. // Add extra newline even if it's not there, to satisfy the parser.
  277. newline(true);
  278. // Also add needed unindent.
  279. check_indent();
  280. }
  281. return _peek(-1);
  282. }
  283. void GDScriptTokenizer::push_paren(char32_t p_char) {
  284. paren_stack.push_back(p_char);
  285. }
  286. bool GDScriptTokenizer::pop_paren(char32_t p_expected) {
  287. if (paren_stack.is_empty()) {
  288. return false;
  289. }
  290. char32_t actual = paren_stack.back()->get();
  291. paren_stack.pop_back();
  292. return actual == p_expected;
  293. }
  294. GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
  295. Token error = error_stack.back()->get();
  296. error_stack.pop_back();
  297. return error;
  298. }
  299. GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
  300. Token token(p_type);
  301. token.start_line = start_line;
  302. token.end_line = line;
  303. token.start_column = start_column;
  304. token.end_column = column;
  305. token.leftmost_column = leftmost_column;
  306. token.rightmost_column = rightmost_column;
  307. token.source = String(_start, _current - _start);
  308. if (p_type != Token::ERROR && cursor_line > -1) {
  309. // Also count whitespace after token.
  310. int offset = 0;
  311. while (_peek(offset) == ' ' || _peek(offset) == '\t') {
  312. offset++;
  313. }
  314. int last_column = column + offset;
  315. // Check cursor position in token.
  316. if (start_line == line) {
  317. // Single line token.
  318. if (cursor_line == start_line && cursor_column >= start_column && cursor_column <= last_column) {
  319. token.cursor_position = cursor_column - start_column;
  320. if (cursor_column == start_column) {
  321. token.cursor_place = CURSOR_BEGINNING;
  322. } else if (cursor_column < column) {
  323. token.cursor_place = CURSOR_MIDDLE;
  324. } else {
  325. token.cursor_place = CURSOR_END;
  326. }
  327. }
  328. } else {
  329. // Multi line token.
  330. if (cursor_line == start_line && cursor_column >= start_column) {
  331. // Is in first line.
  332. token.cursor_position = cursor_column - start_column;
  333. if (cursor_column == start_column) {
  334. token.cursor_place = CURSOR_BEGINNING;
  335. } else {
  336. token.cursor_place = CURSOR_MIDDLE;
  337. }
  338. } else if (cursor_line == line && cursor_column <= last_column) {
  339. // Is in last line.
  340. token.cursor_position = cursor_column - start_column;
  341. if (cursor_column < column) {
  342. token.cursor_place = CURSOR_MIDDLE;
  343. } else {
  344. token.cursor_place = CURSOR_END;
  345. }
  346. } else if (cursor_line > start_line && cursor_line < line) {
  347. // Is in middle line.
  348. token.cursor_position = CURSOR_MIDDLE;
  349. }
  350. }
  351. }
  352. return token;
  353. }
  354. GDScriptTokenizer::Token GDScriptTokenizer::make_literal(const Variant &p_literal) {
  355. Token token = make_token(Token::LITERAL);
  356. token.literal = p_literal;
  357. return token;
  358. }
  359. GDScriptTokenizer::Token GDScriptTokenizer::make_identifier(const StringName &p_identifier) {
  360. Token identifier = make_token(Token::IDENTIFIER);
  361. identifier.literal = p_identifier;
  362. return identifier;
  363. }
  364. GDScriptTokenizer::Token GDScriptTokenizer::make_error(const String &p_message) {
  365. Token error = make_token(Token::ERROR);
  366. error.literal = p_message;
  367. return error;
  368. }
  369. void GDScriptTokenizer::push_error(const String &p_message) {
  370. Token error = make_error(p_message);
  371. error_stack.push_back(error);
  372. }
  373. void GDScriptTokenizer::push_error(const Token &p_error) {
  374. error_stack.push_back(p_error);
  375. }
  376. GDScriptTokenizer::Token GDScriptTokenizer::make_paren_error(char32_t p_paren) {
  377. if (paren_stack.is_empty()) {
  378. return make_error(vformat("Closing \"%c\" doesn't have an opening counterpart.", p_paren));
  379. }
  380. Token error = make_error(vformat("Closing \"%c\" doesn't match the opening \"%c\".", p_paren, paren_stack.back()->get()));
  381. paren_stack.pop_back(); // Remove opening one anyway.
  382. return error;
  383. }
  384. GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, Token::Type p_double_type) {
  385. const char32_t *next = _current + 1;
  386. int chars = 2; // Two already matched.
  387. // Test before consuming characters, since we don't want to consume more than needed.
  388. while (*next == p_test) {
  389. chars++;
  390. next++;
  391. }
  392. if (chars >= 7) {
  393. // It is a VCS conflict marker.
  394. while (chars > 1) {
  395. // Consume all characters (first was already consumed by scan()).
  396. _advance();
  397. chars--;
  398. }
  399. return make_token(Token::VCS_CONFLICT_MARKER);
  400. } else {
  401. // It is only a regular double character token, so we consume the second character.
  402. _advance();
  403. return make_token(p_double_type);
  404. }
  405. }
  406. GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
  407. if (is_unicode_identifier_start(_peek())) {
  408. _advance(); // Consume start character.
  409. } else {
  410. push_error("Expected annotation identifier after \"@\".");
  411. }
  412. while (is_unicode_identifier_continue(_peek())) {
  413. // Consume all identifier characters.
  414. _advance();
  415. }
  416. Token annotation = make_token(Token::ANNOTATION);
  417. annotation.literal = StringName(annotation.source);
  418. return annotation;
  419. }
  // X-macro listing every keyword together with its token type.
  // KEYWORD_GROUP(c) opens the group of keywords starting with character `c`,
  // and KEYWORD(text, type) declares one keyword inside the current group.
  // Users of the list supply both macros; the entry order is part of the
  // contract because make_keyword_list() turns it into array indices.
  #define KEYWORDS(KEYWORD_GROUP, KEYWORD)     \
      KEYWORD_GROUP('a')                       \
      KEYWORD("as", Token::AS)                 \
      KEYWORD("and", Token::AND)               \
      KEYWORD("assert", Token::ASSERT)         \
      KEYWORD("await", Token::AWAIT)           \
      KEYWORD_GROUP('b')                       \
      KEYWORD("break", Token::BREAK)           \
      KEYWORD("breakpoint", Token::BREAKPOINT) \
      KEYWORD_GROUP('c')                       \
      KEYWORD("class", Token::CLASS)           \
      KEYWORD("class_name", Token::CLASS_NAME) \
      KEYWORD("const", Token::CONST)           \
      KEYWORD("continue", Token::CONTINUE)     \
      KEYWORD_GROUP('e')                       \
      KEYWORD("elif", Token::ELIF)             \
      KEYWORD("else", Token::ELSE)             \
      KEYWORD("enum", Token::ENUM)             \
      KEYWORD("extends", Token::EXTENDS)       \
      KEYWORD_GROUP('f')                       \
      KEYWORD("for", Token::FOR)               \
      KEYWORD("func", Token::FUNC)             \
      KEYWORD_GROUP('i')                       \
      KEYWORD("if", Token::IF)                 \
      KEYWORD("in", Token::IN)                 \
      KEYWORD("is", Token::IS)                 \
      KEYWORD_GROUP('m')                       \
      KEYWORD("match", Token::MATCH)           \
      KEYWORD_GROUP('n')                       \
      KEYWORD("namespace", Token::NAMESPACE)   \
      KEYWORD("not", Token::NOT)               \
      KEYWORD_GROUP('o')                       \
      KEYWORD("or", Token::OR)                 \
      KEYWORD_GROUP('p')                       \
      KEYWORD("pass", Token::PASS)             \
      KEYWORD("preload", Token::PRELOAD)       \
      KEYWORD_GROUP('r')                       \
      KEYWORD("return", Token::RETURN)         \
      KEYWORD_GROUP('s')                       \
      KEYWORD("self", Token::SELF)             \
      KEYWORD("signal", Token::SIGNAL)         \
      KEYWORD("static", Token::STATIC)         \
      KEYWORD("super", Token::SUPER)           \
      KEYWORD_GROUP('t')                       \
      KEYWORD("trait", Token::TRAIT)           \
      KEYWORD_GROUP('v')                       \
      KEYWORD("var", Token::VAR)               \
      KEYWORD("void", Token::VOID)             \
      KEYWORD_GROUP('w')                       \
      KEYWORD("while", Token::WHILE)           \
      KEYWORD_GROUP('y')                       \
      KEYWORD("yield", Token::YIELD)           \
      KEYWORD_GROUP('I')                       \
      KEYWORD("INF", Token::CONST_INF)         \
      KEYWORD_GROUP('N')                       \
      KEYWORD("NAN", Token::CONST_NAN)         \
      KEYWORD_GROUP('P')                       \
      KEYWORD("PI", Token::CONST_PI)           \
      KEYWORD_GROUP('T')                       \
      KEYWORD("TAU", Token::CONST_TAU)

  // Shortest and longest keyword lengths in the list above. Identifiers
  // outside this range are never compared against the keywords.
  #define MIN_KEYWORD_LENGTH 2
  #define MAX_KEYWORD_LENGTH 10
  #ifdef DEBUG_ENABLED
  // Fills `keyword_list` with the text of every keyword from KEYWORDS, in
  // list order. Only built when DEBUG_ENABLED is defined.
  void GDScriptTokenizer::make_keyword_list() {
  // Group markers contribute nothing; each keyword contributes its text.
  #define KEYWORD_GROUP_IGNORE(group)
  #define KEYWORD_LINE(keyword, token_type) keyword,
      keyword_list = {
          KEYWORDS(KEYWORD_GROUP_IGNORE, KEYWORD_LINE)
      };
  #undef KEYWORD_GROUP_IGNORE
  #undef KEYWORD_LINE
  }
  #endif // DEBUG_ENABLED
  493. GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
  494. // Consume all identifier characters.
  495. while (is_unicode_identifier_continue(_peek())) {
  496. _advance();
  497. }
  498. int len = _current - _start;
  499. if (len == 1 && _peek(-1) == '_') {
  500. // Lone underscore.
  501. return make_token(Token::UNDERSCORE);
  502. }
  503. String name(_start, len);
  504. if (len < MIN_KEYWORD_LENGTH || len > MAX_KEYWORD_LENGTH) {
  505. // Cannot be a keyword, as the length doesn't match any.
  506. return make_identifier(name);
  507. }
  508. // Define some helper macros for the switch case.
  509. #define KEYWORD_GROUP_CASE(char) \
  510. break; \
  511. case char:
  512. #define KEYWORD(keyword, token_type) \
  513. { \
  514. const int keyword_length = sizeof(keyword) - 1; \
  515. static_assert(keyword_length <= MAX_KEYWORD_LENGTH, "There's a keyword longer than the defined maximum length"); \
  516. static_assert(keyword_length >= MIN_KEYWORD_LENGTH, "There's a keyword shorter than the defined minimum length"); \
  517. if (keyword_length == len && name == keyword) { \
  518. return make_token(token_type); \
  519. } \
  520. }
  521. // Find if it's a keyword.
  522. switch (_start[0]) {
  523. default:
  524. KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  525. break;
  526. }
  527. // Check if it's a special literal
  528. if (len == 4) {
  529. if (name == "true") {
  530. return make_literal(true);
  531. } else if (name == "null") {
  532. return make_literal(Variant());
  533. }
  534. } else if (len == 5) {
  535. if (name == "false") {
  536. return make_literal(false);
  537. }
  538. }
  539. // Not a keyword, so must be an identifier.
  540. Token id = make_identifier(name);
  541. #ifdef DEBUG_ENABLED
  542. // Additional checks for identifiers but only in debug and if it's available in TextServer.
  543. if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
  544. int64_t confusable = TS->is_confusable(name, keyword_list);
  545. if (confusable >= 0) {
  546. push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable]));
  547. }
  548. }
  549. #endif // DEBUG_ENABLED
  550. return id;
  551. #undef KEYWORD_GROUP_CASE
  552. #undef KEYWORD
  553. }
  554. #undef MAX_KEYWORD_LENGTH
  555. #undef MIN_KEYWORD_LENGTH
  556. #undef KEYWORDS
  557. void GDScriptTokenizer::newline(bool p_make_token) {
  558. // Don't overwrite previous newline, nor create if we want a line continuation.
  559. if (p_make_token && !pending_newline && !line_continuation) {
  560. Token newline(Token::NEWLINE);
  561. newline.start_line = line;
  562. newline.end_line = line;
  563. newline.start_column = column - 1;
  564. newline.end_column = column;
  565. newline.leftmost_column = newline.start_column;
  566. newline.rightmost_column = newline.end_column;
  567. pending_newline = true;
  568. last_newline = newline;
  569. }
  570. // Increment line/column counters.
  571. line++;
  572. column = 1;
  573. leftmost_column = 1;
  574. }
  575. GDScriptTokenizer::Token GDScriptTokenizer::number() {
  576. int base = 10;
  577. bool has_decimal = false;
  578. bool has_exponent = false;
  579. bool has_error = false;
  580. bool (*digit_check_func)(char32_t) = is_digit;
  581. if (_peek(-1) == '.') {
  582. has_decimal = true;
  583. } else if (_peek(-1) == '0') {
  584. if (_peek() == 'x') {
  585. // Hexadecimal.
  586. base = 16;
  587. digit_check_func = is_hex_digit;
  588. _advance();
  589. } else if (_peek() == 'b') {
  590. // Binary.
  591. base = 2;
  592. digit_check_func = is_binary_digit;
  593. _advance();
  594. }
  595. }
  596. // Allow '_' to be used in a number, for readability.
  597. bool previous_was_underscore = false;
  598. while (digit_check_func(_peek()) || is_underscore(_peek())) {
  599. if (is_underscore(_peek())) {
  600. if (previous_was_underscore) {
  601. Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
  602. error.start_column = column;
  603. error.leftmost_column = column;
  604. error.end_column = column + 1;
  605. error.rightmost_column = column + 1;
  606. push_error(error);
  607. }
  608. previous_was_underscore = true;
  609. } else {
  610. previous_was_underscore = false;
  611. }
  612. _advance();
  613. }
  614. // It might be a ".." token (instead of decimal point) so we check if it's not.
  615. if (_peek() == '.' && _peek(1) != '.') {
  616. if (base == 10 && !has_decimal) {
  617. has_decimal = true;
  618. } else if (base == 10) {
  619. Token error = make_error("Cannot use a decimal point twice in a number.");
  620. error.start_column = column;
  621. error.leftmost_column = column;
  622. error.end_column = column + 1;
  623. error.rightmost_column = column + 1;
  624. push_error(error);
  625. has_error = true;
  626. } else if (base == 16) {
  627. Token error = make_error("Cannot use a decimal point in a hexadecimal number.");
  628. error.start_column = column;
  629. error.leftmost_column = column;
  630. error.end_column = column + 1;
  631. error.rightmost_column = column + 1;
  632. push_error(error);
  633. has_error = true;
  634. } else {
  635. Token error = make_error("Cannot use a decimal point in a binary number.");
  636. error.start_column = column;
  637. error.leftmost_column = column;
  638. error.end_column = column + 1;
  639. error.rightmost_column = column + 1;
  640. push_error(error);
  641. has_error = true;
  642. }
  643. if (!has_error) {
  644. _advance();
  645. // Consume decimal digits.
  646. while (is_digit(_peek()) || is_underscore(_peek())) {
  647. _advance();
  648. }
  649. }
  650. }
  651. if (base == 10) {
  652. if (_peek() == 'e' || _peek() == 'E') {
  653. has_exponent = true;
  654. _advance();
  655. if (_peek() == '+' || _peek() == '-') {
  656. // Exponent sign.
  657. _advance();
  658. }
  659. // Consume exponent digits.
  660. if (!is_digit(_peek())) {
  661. Token error = make_error(R"(Expected exponent value after "e".)");
  662. error.start_column = column;
  663. error.leftmost_column = column;
  664. error.end_column = column + 1;
  665. error.rightmost_column = column + 1;
  666. push_error(error);
  667. }
  668. previous_was_underscore = false;
  669. while (is_digit(_peek()) || is_underscore(_peek())) {
  670. if (is_underscore(_peek())) {
  671. if (previous_was_underscore) {
  672. Token error = make_error(R"(Only one underscore can be used as a numeric separator.)");
  673. error.start_column = column;
  674. error.leftmost_column = column;
  675. error.end_column = column + 1;
  676. error.rightmost_column = column + 1;
  677. push_error(error);
  678. }
  679. previous_was_underscore = true;
  680. } else {
  681. previous_was_underscore = false;
  682. }
  683. _advance();
  684. }
  685. }
  686. }
  687. // Detect extra decimal point.
  688. if (!has_error && has_decimal && _peek() == '.' && _peek(1) != '.') {
  689. Token error = make_error("Cannot use a decimal point twice in a number.");
  690. error.start_column = column;
  691. error.leftmost_column = column;
  692. error.end_column = column + 1;
  693. error.rightmost_column = column + 1;
  694. push_error(error);
  695. has_error = true;
  696. } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) {
  697. // Letter at the end of the number.
  698. push_error("Invalid numeric notation.");
  699. }
  700. // Create a string with the whole number.
  701. int len = _current - _start;
  702. String number = String(_start, len).replace("_", "");
  703. // Convert to the appropriate literal type.
  704. if (base == 16) {
  705. int64_t value = number.hex_to_int();
  706. return make_literal(value);
  707. } else if (base == 2) {
  708. int64_t value = number.bin_to_int();
  709. return make_literal(value);
  710. } else if (has_decimal || has_exponent) {
  711. double value = number.to_float();
  712. return make_literal(value);
  713. } else {
  714. int64_t value = number.to_int();
  715. return make_literal(value);
  716. }
  717. }
  718. GDScriptTokenizer::Token GDScriptTokenizer::string() {
  719. enum StringType {
  720. STRING_REGULAR,
  721. STRING_NAME,
  722. STRING_NODEPATH,
  723. };
  724. bool is_multiline = false;
  725. StringType type = STRING_REGULAR;
  726. if (_peek(-1) == '&') {
  727. type = STRING_NAME;
  728. _advance();
  729. } else if (_peek(-1) == '^') {
  730. type = STRING_NODEPATH;
  731. _advance();
  732. }
  733. char32_t quote_char = _peek(-1);
  734. if (_peek() == quote_char && _peek(1) == quote_char) {
  735. is_multiline = true;
  736. // Consume all quotes.
  737. _advance();
  738. _advance();
  739. }
  740. String result;
  741. char32_t prev = 0;
  742. int prev_pos = 0;
  743. for (;;) {
  744. // Consume actual string.
  745. if (_is_at_end()) {
  746. return make_error("Unterminated string.");
  747. }
  748. char32_t ch = _peek();
  749. if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
  750. Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
  751. error.start_column = column;
  752. error.leftmost_column = error.start_column;
  753. error.end_column = column + 1;
  754. error.rightmost_column = error.end_column;
  755. push_error(error);
  756. }
  757. if (ch == '\\') {
  758. // Escape pattern.
  759. _advance();
  760. if (_is_at_end()) {
  761. return make_error("Unterminated string.");
  762. }
  763. // Grab escape character.
  764. char32_t code = _peek();
  765. _advance();
  766. if (_is_at_end()) {
  767. return make_error("Unterminated string.");
  768. }
  769. char32_t escaped = 0;
  770. bool valid_escape = true;
  771. switch (code) {
  772. case 'a':
  773. escaped = '\a';
  774. break;
  775. case 'b':
  776. escaped = '\b';
  777. break;
  778. case 'f':
  779. escaped = '\f';
  780. break;
  781. case 'n':
  782. escaped = '\n';
  783. break;
  784. case 'r':
  785. escaped = '\r';
  786. break;
  787. case 't':
  788. escaped = '\t';
  789. break;
  790. case 'v':
  791. escaped = '\v';
  792. break;
  793. case '\'':
  794. escaped = '\'';
  795. break;
  796. case '\"':
  797. escaped = '\"';
  798. break;
  799. case '\\':
  800. escaped = '\\';
  801. break;
  802. case 'U':
  803. case 'u': {
  804. // Hexadecimal sequence.
  805. int hex_len = (code == 'U') ? 6 : 4;
  806. for (int j = 0; j < hex_len; j++) {
  807. if (_is_at_end()) {
  808. return make_error("Unterminated string.");
  809. }
  810. char32_t digit = _peek();
  811. char32_t value = 0;
  812. if (is_digit(digit)) {
  813. value = digit - '0';
  814. } else if (digit >= 'a' && digit <= 'f') {
  815. value = digit - 'a';
  816. value += 10;
  817. } else if (digit >= 'A' && digit <= 'F') {
  818. value = digit - 'A';
  819. value += 10;
  820. } else {
  821. // Make error, but keep parsing the string.
  822. Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
  823. error.start_column = column;
  824. error.leftmost_column = error.start_column;
  825. error.end_column = column + 1;
  826. error.rightmost_column = error.end_column;
  827. push_error(error);
  828. valid_escape = false;
  829. break;
  830. }
  831. escaped <<= 4;
  832. escaped |= value;
  833. _advance();
  834. }
  835. } break;
  836. case '\r':
  837. if (_peek() != '\n') {
  838. // Carriage return without newline in string. (???)
  839. // Just add it to the string and keep going.
  840. result += ch;
  841. _advance();
  842. break;
  843. }
  844. [[fallthrough]];
  845. case '\n':
  846. // Escaping newline.
  847. newline(false);
  848. valid_escape = false; // Don't add to the string.
  849. break;
  850. default:
  851. Token error = make_error("Invalid escape in string.");
  852. error.start_column = column - 2;
  853. error.leftmost_column = error.start_column;
  854. push_error(error);
  855. valid_escape = false;
  856. break;
  857. }
  858. // Parse UTF-16 pair.
  859. if (valid_escape) {
  860. if ((escaped & 0xfffffc00) == 0xd800) {
  861. if (prev == 0) {
  862. prev = escaped;
  863. prev_pos = column - 2;
  864. continue;
  865. } else {
  866. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  867. error.start_column = column - 2;
  868. error.leftmost_column = error.start_column;
  869. push_error(error);
  870. valid_escape = false;
  871. prev = 0;
  872. }
  873. } else if ((escaped & 0xfffffc00) == 0xdc00) {
  874. if (prev == 0) {
  875. Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate");
  876. error.start_column = column - 2;
  877. error.leftmost_column = error.start_column;
  878. push_error(error);
  879. valid_escape = false;
  880. } else {
  881. escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
  882. prev = 0;
  883. }
  884. }
  885. if (prev != 0) {
  886. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  887. error.start_column = prev_pos;
  888. error.leftmost_column = error.start_column;
  889. push_error(error);
  890. prev = 0;
  891. }
  892. }
  893. if (valid_escape) {
  894. result += escaped;
  895. }
  896. } else if (ch == quote_char) {
  897. if (prev != 0) {
  898. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  899. error.start_column = prev_pos;
  900. error.leftmost_column = error.start_column;
  901. push_error(error);
  902. prev = 0;
  903. }
  904. _advance();
  905. if (is_multiline) {
  906. if (_peek() == quote_char && _peek(1) == quote_char) {
  907. // Ended the multiline string. Consume all quotes.
  908. _advance();
  909. _advance();
  910. break;
  911. } else {
  912. // Not a multiline string termination, add consumed quote.
  913. result += quote_char;
  914. }
  915. } else {
  916. // Ended single-line string.
  917. break;
  918. }
  919. } else {
  920. if (prev != 0) {
  921. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  922. error.start_column = prev_pos;
  923. error.leftmost_column = error.start_column;
  924. push_error(error);
  925. prev = 0;
  926. }
  927. result += ch;
  928. _advance();
  929. if (ch == '\n') {
  930. newline(false);
  931. }
  932. }
  933. }
  934. if (prev != 0) {
  935. Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
  936. error.start_column = prev_pos;
  937. error.leftmost_column = error.start_column;
  938. push_error(error);
  939. prev = 0;
  940. }
  941. // Make the literal.
  942. Variant string;
  943. switch (type) {
  944. case STRING_NAME:
  945. string = StringName(result);
  946. break;
  947. case STRING_NODEPATH:
  948. string = NodePath(result);
  949. break;
  950. case STRING_REGULAR:
  951. string = result;
  952. break;
  953. }
  954. return make_literal(string);
  955. }
  956. void GDScriptTokenizer::check_indent() {
  957. ERR_FAIL_COND_MSG(column != 1, "Checking tokenizer indentation in the middle of a line.");
  958. if (_is_at_end()) {
  959. // Send dedents for every indent level.
  960. pending_indents -= indent_level();
  961. indent_stack.clear();
  962. return;
  963. }
  964. for (;;) {
  965. char32_t current_indent_char = _peek();
  966. int indent_count = 0;
  967. if (current_indent_char != ' ' && current_indent_char != '\t' && current_indent_char != '\r' && current_indent_char != '\n' && current_indent_char != '#') {
  968. // First character of the line is not whitespace, so we clear all indentation levels.
  969. // Unless we are in a continuation or in multiline mode (inside expression).
  970. if (line_continuation || multiline_mode) {
  971. return;
  972. }
  973. pending_indents -= indent_level();
  974. indent_stack.clear();
  975. return;
  976. }
  977. if (_peek() == '\r') {
  978. _advance();
  979. if (_peek() != '\n') {
  980. push_error("Stray carriage return character in source code.");
  981. }
  982. }
  983. if (_peek() == '\n') {
  984. // Empty line, keep going.
  985. _advance();
  986. newline(false);
  987. continue;
  988. }
  989. // Check indent level.
  990. bool mixed = false;
  991. while (!_is_at_end()) {
  992. char32_t space = _peek();
  993. if (space == '\t') {
  994. // Consider individual tab columns.
  995. column += tab_size - 1;
  996. indent_count += tab_size;
  997. } else if (space == ' ') {
  998. indent_count += 1;
  999. } else {
  1000. break;
  1001. }
  1002. mixed = mixed || space != current_indent_char;
  1003. _advance();
  1004. }
  1005. if (mixed) {
  1006. Token error = make_error("Mixed use of tabs and spaces for indentation.");
  1007. error.start_line = line;
  1008. error.start_column = 1;
  1009. error.leftmost_column = 1;
  1010. error.rightmost_column = column;
  1011. push_error(error);
  1012. }
  1013. if (_is_at_end()) {
  1014. // Reached the end with an empty line, so just dedent as much as needed.
  1015. pending_indents -= indent_level();
  1016. indent_stack.clear();
  1017. return;
  1018. }
  1019. if (_peek() == '\r') {
  1020. _advance();
  1021. if (_peek() != '\n') {
  1022. push_error("Stray carriage return character in source code.");
  1023. }
  1024. }
  1025. if (_peek() == '\n') {
  1026. // Empty line, keep going.
  1027. _advance();
  1028. newline(false);
  1029. continue;
  1030. }
  1031. if (_peek() == '#') {
  1032. // Comment. Advance to the next line.
  1033. #ifdef TOOLS_ENABLED
  1034. String comment;
  1035. while (_peek() != '\n' && !_is_at_end()) {
  1036. comment += _advance();
  1037. }
  1038. comments[line] = CommentData(comment, true);
  1039. #else
  1040. while (_peek() != '\n' && !_is_at_end()) {
  1041. _advance();
  1042. }
  1043. #endif // TOOLS_ENABLED
  1044. if (_is_at_end()) {
  1045. // Reached the end with an empty line, so just dedent as much as needed.
  1046. pending_indents -= indent_level();
  1047. indent_stack.clear();
  1048. return;
  1049. }
  1050. _advance(); // Consume '\n'.
  1051. newline(false);
  1052. continue;
  1053. }
  1054. if (line_continuation || multiline_mode) {
  1055. // We cleared up all the whitespace at the beginning of the line.
  1056. // But if this is a continuation or multiline mode and we don't want any indentation change.
  1057. return;
  1058. }
  1059. // Check if indentation character is consistent.
  1060. if (indent_char == '\0') {
  1061. // First time indenting, choose character now.
  1062. indent_char = current_indent_char;
  1063. } else if (current_indent_char != indent_char) {
  1064. Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.",
  1065. _get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char)));
  1066. error.start_line = line;
  1067. error.start_column = 1;
  1068. error.leftmost_column = 1;
  1069. error.rightmost_column = column;
  1070. push_error(error);
  1071. }
  1072. // Now we can do actual indentation changes.
  1073. // Check if indent or dedent.
  1074. int previous_indent = 0;
  1075. if (indent_level() > 0) {
  1076. previous_indent = indent_stack.back()->get();
  1077. }
  1078. if (indent_count == previous_indent) {
  1079. // No change in indentation.
  1080. return;
  1081. }
  1082. if (indent_count > previous_indent) {
  1083. // Indentation increased.
  1084. indent_stack.push_back(indent_count);
  1085. pending_indents++;
  1086. } else {
  1087. // Indentation decreased (dedent).
  1088. if (indent_level() == 0) {
  1089. push_error("Tokenizer bug: trying to dedent without previous indent.");
  1090. return;
  1091. }
  1092. while (indent_level() > 0 && indent_stack.back()->get() > indent_count) {
  1093. indent_stack.pop_back();
  1094. pending_indents--;
  1095. }
  1096. if ((indent_level() > 0 && indent_stack.back()->get() != indent_count) || (indent_level() == 0 && indent_count != 0)) {
  1097. // Mismatched indentation alignment.
  1098. Token error = make_error("Unindent doesn't match the previous indentation level.");
  1099. error.start_line = line;
  1100. error.start_column = 1;
  1101. error.leftmost_column = 1;
  1102. error.end_column = column + 1;
  1103. error.rightmost_column = column + 1;
  1104. push_error(error);
  1105. // Still, we'll be lenient and keep going, so keep this level in the stack.
  1106. indent_stack.push_back(indent_count);
  1107. }
  1108. }
  1109. break; // Get out of the loop in any case.
  1110. }
  1111. }
  1112. String GDScriptTokenizer::_get_indent_char_name(char32_t ch) {
  1113. ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape());
  1114. return ch == ' ' ? "space" : "tab";
  1115. }
  1116. void GDScriptTokenizer::_skip_whitespace() {
  1117. if (pending_indents != 0) {
  1118. // Still have some indent/dedent tokens to give.
  1119. return;
  1120. }
  1121. bool is_bol = column == 1; // Beginning of line.
  1122. if (is_bol) {
  1123. check_indent();
  1124. return;
  1125. }
  1126. for (;;) {
  1127. char32_t c = _peek();
  1128. switch (c) {
  1129. case ' ':
  1130. _advance();
  1131. break;
  1132. case '\t':
  1133. _advance();
  1134. // Consider individual tab columns.
  1135. column += tab_size - 1;
  1136. break;
  1137. case '\r':
  1138. _advance(); // Consume either way.
  1139. if (_peek() != '\n') {
  1140. push_error("Stray carriage return character in source code.");
  1141. return;
  1142. }
  1143. break;
  1144. case '\n':
  1145. _advance();
  1146. newline(!is_bol); // Don't create new line token if line is empty.
  1147. check_indent();
  1148. break;
  1149. case '#': {
  1150. // Comment.
  1151. #ifdef TOOLS_ENABLED
  1152. String comment;
  1153. while (_peek() != '\n' && !_is_at_end()) {
  1154. comment += _advance();
  1155. }
  1156. comments[line] = CommentData(comment, is_bol);
  1157. #else
  1158. while (_peek() != '\n' && !_is_at_end()) {
  1159. _advance();
  1160. }
  1161. #endif // TOOLS_ENABLED
  1162. if (_is_at_end()) {
  1163. return;
  1164. }
  1165. _advance(); // Consume '\n'
  1166. newline(!is_bol);
  1167. check_indent();
  1168. } break;
  1169. default:
  1170. return;
  1171. }
  1172. }
  1173. }
  1174. GDScriptTokenizer::Token GDScriptTokenizer::scan() {
  1175. if (has_error()) {
  1176. return pop_error();
  1177. }
  1178. _skip_whitespace();
  1179. if (pending_newline) {
  1180. pending_newline = false;
  1181. if (!multiline_mode) {
  1182. // Don't return newline tokens on multiline mode.
  1183. return last_newline;
  1184. }
  1185. }
  1186. // Check for potential errors after skipping whitespace().
  1187. if (has_error()) {
  1188. return pop_error();
  1189. }
  1190. _start = _current;
  1191. start_line = line;
  1192. start_column = column;
  1193. leftmost_column = column;
  1194. rightmost_column = column;
  1195. if (pending_indents != 0) {
  1196. // Adjust position for indent.
  1197. _start -= start_column - 1;
  1198. start_column = 1;
  1199. leftmost_column = 1;
  1200. if (pending_indents > 0) {
  1201. // Indents.
  1202. pending_indents--;
  1203. return make_token(Token::INDENT);
  1204. } else {
  1205. // Dedents.
  1206. pending_indents++;
  1207. Token dedent = make_token(Token::DEDENT);
  1208. dedent.end_column += 1;
  1209. dedent.rightmost_column += 1;
  1210. return dedent;
  1211. }
  1212. }
  1213. if (_is_at_end()) {
  1214. return make_token(Token::TK_EOF);
  1215. }
  1216. const char32_t c = _advance();
  1217. if (c == '\\') {
  1218. // Line continuation with backslash.
  1219. if (_peek() == '\r') {
  1220. if (_peek(1) != '\n') {
  1221. return make_error("Unexpected carriage return character.");
  1222. }
  1223. _advance();
  1224. }
  1225. if (_peek() != '\n') {
  1226. return make_error("Expected new line after \"\\\".");
  1227. }
  1228. _advance();
  1229. newline(false);
  1230. line_continuation = true;
  1231. return scan(); // Recurse to get next token.
  1232. }
  1233. line_continuation = false;
  1234. if (is_digit(c)) {
  1235. return number();
  1236. } else if (is_unicode_identifier_start(c)) {
  1237. return potential_identifier();
  1238. }
  1239. switch (c) {
  1240. // String literals.
  1241. case '"':
  1242. case '\'':
  1243. return string();
  1244. // Annotation.
  1245. case '@':
  1246. return annotation();
  1247. // Single characters.
  1248. case '~':
  1249. return make_token(Token::TILDE);
  1250. case ',':
  1251. return make_token(Token::COMMA);
  1252. case ':':
  1253. return make_token(Token::COLON);
  1254. case ';':
  1255. return make_token(Token::SEMICOLON);
  1256. case '$':
  1257. return make_token(Token::DOLLAR);
  1258. case '?':
  1259. return make_token(Token::QUESTION_MARK);
  1260. case '`':
  1261. return make_token(Token::BACKTICK);
  1262. // Parens.
  1263. case '(':
  1264. push_paren('(');
  1265. return make_token(Token::PARENTHESIS_OPEN);
  1266. case '[':
  1267. push_paren('[');
  1268. return make_token(Token::BRACKET_OPEN);
  1269. case '{':
  1270. push_paren('{');
  1271. return make_token(Token::BRACE_OPEN);
  1272. case ')':
  1273. if (!pop_paren('(')) {
  1274. return make_paren_error(c);
  1275. }
  1276. return make_token(Token::PARENTHESIS_CLOSE);
  1277. case ']':
  1278. if (!pop_paren('[')) {
  1279. return make_paren_error(c);
  1280. }
  1281. return make_token(Token::BRACKET_CLOSE);
  1282. case '}':
  1283. if (!pop_paren('{')) {
  1284. return make_paren_error(c);
  1285. }
  1286. return make_token(Token::BRACE_CLOSE);
  1287. // Double characters.
  1288. case '!':
  1289. if (_peek() == '=') {
  1290. _advance();
  1291. return make_token(Token::BANG_EQUAL);
  1292. } else {
  1293. return make_token(Token::BANG);
  1294. }
  1295. case '.':
  1296. if (_peek() == '.') {
  1297. _advance();
  1298. return make_token(Token::PERIOD_PERIOD);
  1299. } else if (is_digit(_peek())) {
  1300. // Number starting with '.'.
  1301. return number();
  1302. } else {
  1303. return make_token(Token::PERIOD);
  1304. }
  1305. case '+':
  1306. if (_peek() == '=') {
  1307. _advance();
  1308. return make_token(Token::PLUS_EQUAL);
  1309. } else {
  1310. return make_token(Token::PLUS);
  1311. }
  1312. case '-':
  1313. if (_peek() == '=') {
  1314. _advance();
  1315. return make_token(Token::MINUS_EQUAL);
  1316. } else if (_peek() == '>') {
  1317. _advance();
  1318. return make_token(Token::FORWARD_ARROW);
  1319. } else {
  1320. return make_token(Token::MINUS);
  1321. }
  1322. case '*':
  1323. if (_peek() == '=') {
  1324. _advance();
  1325. return make_token(Token::STAR_EQUAL);
  1326. } else if (_peek() == '*') {
  1327. if (_peek(1) == '=') {
  1328. _advance();
  1329. _advance(); // Advance both '*' and '='
  1330. return make_token(Token::STAR_STAR_EQUAL);
  1331. }
  1332. _advance();
  1333. return make_token(Token::STAR_STAR);
  1334. } else {
  1335. return make_token(Token::STAR);
  1336. }
  1337. case '/':
  1338. if (_peek() == '=') {
  1339. _advance();
  1340. return make_token(Token::SLASH_EQUAL);
  1341. } else {
  1342. return make_token(Token::SLASH);
  1343. }
  1344. case '%':
  1345. if (_peek() == '=') {
  1346. _advance();
  1347. return make_token(Token::PERCENT_EQUAL);
  1348. } else {
  1349. return make_token(Token::PERCENT);
  1350. }
  1351. case '^':
  1352. if (_peek() == '=') {
  1353. _advance();
  1354. return make_token(Token::CARET_EQUAL);
  1355. } else if (_peek() == '"' || _peek() == '\'') {
  1356. // Node path
  1357. return string();
  1358. } else {
  1359. return make_token(Token::CARET);
  1360. }
  1361. case '&':
  1362. if (_peek() == '&') {
  1363. _advance();
  1364. return make_token(Token::AMPERSAND_AMPERSAND);
  1365. } else if (_peek() == '=') {
  1366. _advance();
  1367. return make_token(Token::AMPERSAND_EQUAL);
  1368. } else if (_peek() == '"' || _peek() == '\'') {
  1369. // String Name
  1370. return string();
  1371. } else {
  1372. return make_token(Token::AMPERSAND);
  1373. }
  1374. case '|':
  1375. if (_peek() == '|') {
  1376. _advance();
  1377. return make_token(Token::PIPE_PIPE);
  1378. } else if (_peek() == '=') {
  1379. _advance();
  1380. return make_token(Token::PIPE_EQUAL);
  1381. } else {
  1382. return make_token(Token::PIPE);
  1383. }
  1384. // Potential VCS conflict markers.
  1385. case '=':
  1386. if (_peek() == '=') {
  1387. return check_vcs_marker('=', Token::EQUAL_EQUAL);
  1388. } else {
  1389. return make_token(Token::EQUAL);
  1390. }
  1391. case '<':
  1392. if (_peek() == '=') {
  1393. _advance();
  1394. return make_token(Token::LESS_EQUAL);
  1395. } else if (_peek() == '<') {
  1396. if (_peek(1) == '=') {
  1397. _advance();
  1398. _advance(); // Advance both '<' and '='
  1399. return make_token(Token::LESS_LESS_EQUAL);
  1400. } else {
  1401. return check_vcs_marker('<', Token::LESS_LESS);
  1402. }
  1403. } else {
  1404. return make_token(Token::LESS);
  1405. }
  1406. case '>':
  1407. if (_peek() == '=') {
  1408. _advance();
  1409. return make_token(Token::GREATER_EQUAL);
  1410. } else if (_peek() == '>') {
  1411. if (_peek(1) == '=') {
  1412. _advance();
  1413. _advance(); // Advance both '>' and '='
  1414. return make_token(Token::GREATER_GREATER_EQUAL);
  1415. } else {
  1416. return check_vcs_marker('>', Token::GREATER_GREATER);
  1417. }
  1418. } else {
  1419. return make_token(Token::GREATER);
  1420. }
  1421. default:
  1422. if (is_whitespace(c)) {
  1423. return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast<int32_t>(c)));
  1424. } else {
  1425. return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1)));
  1426. }
  1427. }
  1428. }
  1429. GDScriptTokenizer::GDScriptTokenizer() {
  1430. #ifdef TOOLS_ENABLED
  1431. if (EditorSettings::get_singleton()) {
  1432. tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size");
  1433. }
  1434. #endif // TOOLS_ENABLED
  1435. #ifdef DEBUG_ENABLED
  1436. make_keyword_list();
  1437. #endif // DEBUG_ENABLED
  1438. }