// Lexer.cpp
  1. // Copyright (c) 2008 Roberto Raggi <[email protected]>
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy
  4. // of this software and associated documentation files (the "Software"), to deal
  5. // in the Software without restriction, including without limitation the rights
  6. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. // copies of the Software, and to permit persons to whom the Software is
  8. // furnished to do so, subject to the following conditions:
  9. //
  10. // The above copyright notice and this permission notice shall be included in
  11. // all copies or substantial portions of the Software.
  12. //
  13. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. // THE SOFTWARE.
  20. #include "Lexer.h"
  21. #include "Control.h"
  22. #include "TranslationUnit.h"
  23. #include "Literals.h"
  24. #include "cppassert.h"
  25. #include <cctype>
  26. using namespace CPlusPlus;
  /*!
      \class Lexer
      \brief The Lexer generates tokens from a UTF-8 encoded source text.

      \sa Token
  */
  /*!
      \fn static void Lexer::yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar, unsigned &utf16charCounter)

      Processes a single Unicode code point in a UTF-8 encoded source.

      \a currentSourceChar points to the UTF-8 encoded source.
      \a yychar must be the byte pointed to by \a currentSourceChar.

      Points \a currentSourceChar to the byte of the next code point
      and modifies \a yychar to the value pointed to by the updated
      \a currentSourceChar. \a utf16charCounter will be incremented by
      the number of UTF-16 code units that were needed for that code
      point.
  */
  43. Lexer::Lexer(TranslationUnit *unit)
  44. : _translationUnit(unit),
  45. _control(unit->control()),
  46. _state(0),
  47. _flags(0),
  48. _currentLine(1)
  49. {
  50. f._scanKeywords = true;
  51. setSource(_translationUnit->firstSourceChar(),
  52. _translationUnit->lastSourceChar());
  53. }
  54. Lexer::Lexer(const char *firstChar, const char *lastChar)
  55. : _translationUnit(0),
  56. _control(0),
  57. _state(0),
  58. _flags(0),
  59. _currentLine(1)
  60. {
  61. f._scanKeywords = true;
  62. setSource(firstChar, lastChar);
  63. }
  64. Lexer::~Lexer()
  65. { }
  66. TranslationUnit *Lexer::translationUnit() const
  67. { return _translationUnit; }
  68. void Lexer::setSource(const char *firstChar, const char *lastChar)
  69. {
  70. _firstChar = firstChar;
  71. _lastChar = lastChar;
  72. _currentChar = _firstChar - 1;
  73. _currentCharUtf16 = -1;
  74. _tokenStart = _currentChar;
  75. _yychar = '\n';
  76. }
  77. void Lexer::setStartWithNewline(bool enabled)
  78. {
  79. if (enabled)
  80. _yychar = '\n';
  81. else
  82. _yychar = ' ';
  83. }
  84. int Lexer::state() const
  85. { return _state; }
  86. void Lexer::setState(int state)
  87. { _state = state; }
  88. bool Lexer::scanCommentTokens() const
  89. { return f._scanCommentTokens; }
  90. void Lexer::setScanCommentTokens(bool onoff)
  91. { f._scanCommentTokens = onoff; }
  92. bool Lexer::scanKeywords() const
  93. { return f._scanKeywords; }
  94. void Lexer::setScanKeywords(bool onoff)
  95. { f._scanKeywords = onoff; }
  96. void Lexer::setScanAngleStringLiteralTokens(bool onoff)
  97. { f._scanAngleStringLiteralTokens = onoff; }
  98. void Lexer::pushLineStartOffset()
  99. {
  100. ++_currentLine;
  101. if (_translationUnit)
  102. _translationUnit->pushLineOffset(_currentCharUtf16);
  103. }
  104. void Lexer::scan(Token *tok)
  105. {
  106. tok->reset();
  107. scan_helper(tok);
  108. tok->f.bytes = _currentChar - _tokenStart;
  109. tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
  110. }
  111. void Lexer::scan_helper(Token *tok)
  112. {
  113. again:
  114. while (_yychar && std::isspace(_yychar)) {
  115. if (_yychar == '\n') {
  116. tok->f.joined = s._newlineExpected;
  117. tok->f.newline = !s._newlineExpected;
  118. if (s._newlineExpected) {
  119. s._newlineExpected = false;
  120. } else {
  121. switch (s._tokenKind) {
  122. case T_EOF_SYMBOL:
  123. case T_COMMENT:
  124. case T_DOXY_COMMENT:
  125. break; // multiline tokens, don't break on newline
  126. default: // Strings and C++ comments
  127. _state = 0;
  128. }
  129. }
  130. } else {
  131. tok->f.whitespace = true;
  132. }
  133. yyinp();
  134. }
  135. if (! _translationUnit)
  136. tok->lineno = _currentLine;
  137. _tokenStart = _currentChar;
  138. tok->byteOffset = _currentChar - _firstChar;
  139. _tokenStartUtf16 = _currentCharUtf16;
  140. tok->utf16charOffset = _currentCharUtf16;
  141. if (_yychar) {
  142. s._newlineExpected = false;
  143. } else if (s._tokenKind) {
  144. tok->f.kind = T_EOF_SYMBOL;
  145. return;
  146. }
  147. switch (s._tokenKind) {
  148. case T_EOF_SYMBOL:
  149. break;
  150. case T_COMMENT:
  151. case T_DOXY_COMMENT: {
  152. const int originalKind = s._tokenKind;
  153. while (_yychar) {
  154. if (_yychar != '*')
  155. yyinp();
  156. else {
  157. yyinp();
  158. if (_yychar == '/') {
  159. yyinp();
  160. _state = 0;
  161. break;
  162. }
  163. }
  164. }
  165. if (! f._scanCommentTokens)
  166. goto again;
  167. tok->f.kind = originalKind;
  168. return; // done
  169. }
  170. case T_CPP_COMMENT:
  171. case T_CPP_DOXY_COMMENT: {
  172. const Kind originalKind = (Kind)s._tokenKind;
  173. tok->f.joined = true;
  174. if (f._scanCommentTokens)
  175. tok->f.kind = originalKind;
  176. _state = 0;
  177. scanCppComment(originalKind);
  178. return;
  179. }
  180. default: // Strings
  181. tok->f.joined = true;
  182. tok->f.kind = s._tokenKind;
  183. _state = 0;
  184. scanUntilQuote(tok, '"');
  185. return;
  186. }
  187. if (! _yychar) {
  188. tok->f.kind = T_EOF_SYMBOL;
  189. return;
  190. }
  191. unsigned char ch = _yychar;
  192. yyinp();
  193. switch (ch) {
  194. case '\\':
  195. s._newlineExpected = true;
  196. goto again;
  197. case '"':
  198. scanStringLiteral(tok);
  199. break;
  200. case '\'':
  201. scanCharLiteral(tok);
  202. break;
  203. case '{':
  204. tok->f.kind = T_LBRACE;
  205. break;
  206. case '}':
  207. tok->f.kind = T_RBRACE;
  208. break;
  209. case '[':
  210. tok->f.kind = T_LBRACKET;
  211. break;
  212. case ']':
  213. tok->f.kind = T_RBRACKET;
  214. break;
  215. case '#':
  216. if (_yychar == '#') {
  217. tok->f.kind = T_POUND_POUND;
  218. yyinp();
  219. } else {
  220. tok->f.kind = T_POUND;
  221. }
  222. break;
  223. case '(':
  224. tok->f.kind = T_LPAREN;
  225. break;
  226. case ')':
  227. tok->f.kind = T_RPAREN;
  228. break;
  229. case ';':
  230. tok->f.kind = T_SEMICOLON;
  231. break;
  232. case ':':
  233. if (_yychar == ':') {
  234. yyinp();
  235. tok->f.kind = T_COLON_COLON;
  236. } else if (_yychar == '>') {
  237. yyinp();
  238. tok->f.kind = T_RBRACKET;
  239. } else {
  240. tok->f.kind = T_COLON;
  241. }
  242. break;
  243. case '.':
  244. if (_yychar == '*') {
  245. yyinp();
  246. tok->f.kind = T_DOT_STAR;
  247. } else if (_yychar == '.') {
  248. yyinp();
  249. // ### CPP_CHECK(_yychar);
  250. if (_yychar == '.') {
  251. yyinp();
  252. tok->f.kind = T_DOT_DOT_DOT;
  253. } else {
  254. tok->f.kind = T_ERROR;
  255. }
  256. } else if (std::isdigit(_yychar)) {
  257. const char *yytext = _currentChar - 2;
  258. do {
  259. if (_yychar == 'e' || _yychar == 'E') {
  260. yyinp();
  261. if (_yychar == '-' || _yychar == '+') {
  262. yyinp();
  263. // ### CPP_CHECK(std::isdigit(_yychar));
  264. }
  265. } else if (std::isalnum(_yychar) || _yychar == '.') {
  266. yyinp();
  267. } else {
  268. break;
  269. }
  270. } while (_yychar);
  271. int yylen = _currentChar - yytext;
  272. tok->f.kind = T_NUMERIC_LITERAL;
  273. if (control())
  274. tok->number = control()->numericLiteral(yytext, yylen);
  275. } else {
  276. tok->f.kind = T_DOT;
  277. }
  278. break;
  279. case '?':
  280. if (_yychar == '?') {
  281. yyinp();
  282. if (_yychar == '(') {
  283. yyinp();
  284. tok->f.kind = T_LBRACKET;
  285. } else if (_yychar == ')') {
  286. yyinp();
  287. tok->f.kind = T_RBRACKET;
  288. } else if (_yychar == '<') {
  289. yyinp();
  290. tok->f.kind = T_LBRACE;
  291. } else if (_yychar == '>') {
  292. yyinp();
  293. tok->f.kind = T_RBRACE;
  294. }
  295. } else {
  296. tok->f.kind = T_QUESTION;
  297. }
  298. break;
  299. case '+':
  300. if (_yychar == '+') {
  301. yyinp();
  302. tok->f.kind = T_PLUS_PLUS;
  303. } else if (_yychar == '=') {
  304. yyinp();
  305. tok->f.kind = T_PLUS_EQUAL;
  306. } else {
  307. tok->f.kind = T_PLUS;
  308. }
  309. break;
  310. case '-':
  311. if (_yychar == '-') {
  312. yyinp();
  313. tok->f.kind = T_MINUS_MINUS;
  314. } else if (_yychar == '=') {
  315. yyinp();
  316. tok->f.kind = T_MINUS_EQUAL;
  317. } else if (_yychar == '>') {
  318. yyinp();
  319. if (_yychar == '*') {
  320. yyinp();
  321. tok->f.kind = T_ARROW_STAR;
  322. } else {
  323. tok->f.kind = T_ARROW;
  324. }
  325. } else {
  326. tok->f.kind = T_MINUS;
  327. }
  328. break;
  329. case '*':
  330. if (_yychar == '=') {
  331. yyinp();
  332. tok->f.kind = T_STAR_EQUAL;
  333. } else {
  334. tok->f.kind = T_STAR;
  335. }
  336. break;
  337. case '/':
  338. if (_yychar == '/') {
  339. yyinp();
  340. Kind commentType = T_CPP_COMMENT;
  341. if (_yychar == '/' || _yychar == '!') {
  342. yyinp();
  343. commentType = T_CPP_DOXY_COMMENT;
  344. }
  345. scanCppComment(commentType);
  346. if (! f._scanCommentTokens)
  347. goto again;
  348. tok->f.kind = commentType;
  349. } else if (_yychar == '*') {
  350. yyinp();
  351. Kind commentKind = T_COMMENT;
  352. if (_yychar == '*' || _yychar == '!') {
  353. const char ch = _yychar;
  354. yyinp();
  355. if (ch == '*' && _yychar == '/')
  356. goto done;
  357. if (_yychar == '<')
  358. yyinp();
  359. if (! _yychar || std::isspace(_yychar))
  360. commentKind = T_DOXY_COMMENT;
  361. }
  362. while (_yychar) {
  363. if (_yychar != '*') {
  364. yyinp();
  365. } else {
  366. yyinp();
  367. if (_yychar == '/')
  368. break;
  369. }
  370. }
  371. done:
  372. if (_yychar)
  373. yyinp();
  374. else
  375. s._tokenKind = commentKind;
  376. if (! f._scanCommentTokens)
  377. goto again;
  378. tok->f.kind = commentKind;
  379. } else if (_yychar == '=') {
  380. yyinp();
  381. tok->f.kind = T_SLASH_EQUAL;
  382. } else {
  383. tok->f.kind = T_SLASH;
  384. }
  385. break;
  386. case '%':
  387. if (_yychar == '=') {
  388. yyinp();
  389. tok->f.kind = T_PERCENT_EQUAL;
  390. } else if (_yychar == '>') {
  391. yyinp();
  392. tok->f.kind = T_RBRACE;
  393. } else if (_yychar == ':') {
  394. yyinp();
  395. tok->f.kind = T_POUND;
  396. } else {
  397. tok->f.kind = T_PERCENT;
  398. }
  399. break;
  400. case '^':
  401. if (_yychar == '=') {
  402. yyinp();
  403. tok->f.kind = T_CARET_EQUAL;
  404. } else {
  405. tok->f.kind = T_CARET;
  406. }
  407. break;
  408. case '&':
  409. if (_yychar == '&') {
  410. yyinp();
  411. tok->f.kind = T_AMPER_AMPER;
  412. } else if (_yychar == '=') {
  413. yyinp();
  414. tok->f.kind = T_AMPER_EQUAL;
  415. } else {
  416. tok->f.kind = T_AMPER;
  417. }
  418. break;
  419. case '|':
  420. if (_yychar == '|') {
  421. yyinp();
  422. tok->f.kind = T_PIPE_PIPE;
  423. } else if (_yychar == '=') {
  424. yyinp();
  425. tok->f.kind = T_PIPE_EQUAL;
  426. } else {
  427. tok->f.kind = T_PIPE;
  428. }
  429. break;
  430. case '~':
  431. if (_yychar == '=') {
  432. yyinp();
  433. tok->f.kind = T_TILDE_EQUAL;
  434. } else {
  435. tok->f.kind = T_TILDE;
  436. }
  437. break;
  438. case '!':
  439. if (_yychar == '=') {
  440. yyinp();
  441. tok->f.kind = T_EXCLAIM_EQUAL;
  442. } else {
  443. tok->f.kind = T_EXCLAIM;
  444. }
  445. break;
  446. case '=':
  447. if (_yychar == '=') {
  448. yyinp();
  449. tok->f.kind = T_EQUAL_EQUAL;
  450. } else {
  451. tok->f.kind = T_EQUAL;
  452. }
  453. break;
  454. case '<':
  455. if (f._scanAngleStringLiteralTokens) {
  456. const char *yytext = _currentChar;
  457. while (_yychar && _yychar != '>')
  458. yyinp();
  459. int yylen = _currentChar - yytext;
  460. // ### CPP_CHECK(_yychar == '>');
  461. if (_yychar == '>')
  462. yyinp();
  463. if (control())
  464. tok->string = control()->stringLiteral(yytext, yylen);
  465. tok->f.kind = T_ANGLE_STRING_LITERAL;
  466. } else if (_yychar == '<') {
  467. yyinp();
  468. if (_yychar == '=') {
  469. yyinp();
  470. tok->f.kind = T_LESS_LESS_EQUAL;
  471. } else
  472. tok->f.kind = T_LESS_LESS;
  473. } else if (_yychar == '=') {
  474. yyinp();
  475. tok->f.kind = T_LESS_EQUAL;
  476. } else if (_yychar == ':') {
  477. yyinp();
  478. tok->f.kind = T_LBRACKET;
  479. } else if (_yychar == '%') {
  480. yyinp();
  481. tok->f.kind = T_LBRACE;
  482. } else {
  483. tok->f.kind = T_LESS;
  484. }
  485. break;
  486. case '>':
  487. if (_yychar == '>') {
  488. yyinp();
  489. if (_yychar == '=') {
  490. yyinp();
  491. tok->f.kind = T_GREATER_GREATER_EQUAL;
  492. } else
  493. tok->f.kind = T_LESS_LESS;
  494. tok->f.kind = T_GREATER_GREATER;
  495. } else if (_yychar == '=') {
  496. yyinp();
  497. tok->f.kind = T_GREATER_EQUAL;
  498. } else {
  499. tok->f.kind = T_GREATER;
  500. }
  501. break;
  502. case ',':
  503. tok->f.kind = T_COMMA;
  504. break;
  505. default: {
  506. if (_languageFeatures.objCEnabled) {
  507. if (ch == '@' && _yychar >= 'a' && _yychar <= 'z') {
  508. const char *yytext = _currentChar;
  509. do {
  510. yyinp();
  511. if (! (isalnum(_yychar) || _yychar == '_' || _yychar == '$'))
  512. break;
  513. } while (_yychar);
  514. const int yylen = _currentChar - yytext;
  515. tok->f.kind = classifyObjCAtKeyword(yytext, yylen);
  516. break;
  517. } else if (ch == '@' && _yychar == '"') {
  518. yyinp();
  519. scanStringLiteral(tok, '"');
  520. break;
  521. }
  522. }
  523. if (ch == 'L' || ch == 'u' || ch == 'U' || ch == 'R') {
  524. // Either a literal or still an identifier.
  525. if (_yychar == '"') {
  526. yyinp();
  527. if (ch == 'R')
  528. scanRawStringLiteral(tok);
  529. else
  530. scanStringLiteral(tok, ch);
  531. } else if (_yychar == '\'') {
  532. yyinp();
  533. scanCharLiteral(tok, ch);
  534. } else if (ch != 'R' && _yychar == 'R') {
  535. yyinp();
  536. if (_yychar == '"') {
  537. yyinp();
  538. scanRawStringLiteral(tok, ch);
  539. } else {
  540. scanIdentifier(tok, 1);
  541. }
  542. } else if (ch == 'u' && _yychar == '8') {
  543. yyinp();
  544. if (_yychar == '"') {
  545. yyinp();
  546. scanStringLiteral(tok, '8');
  547. } else if (_yychar == '\'') {
  548. yyinp();
  549. scanCharLiteral(tok, '8');
  550. } else if (_yychar == 'R') {
  551. yyinp();
  552. if (_yychar == '"') {
  553. yyinp();
  554. scanRawStringLiteral(tok, '8');
  555. } else {
  556. scanIdentifier(tok, 2);
  557. }
  558. } else {
  559. scanIdentifier(tok, 1);
  560. }
  561. } else {
  562. scanIdentifier(tok);
  563. }
  564. } else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
  565. scanIdentifier(tok, _currentChar - _tokenStart - 1);
  566. } else if (std::isdigit(ch)) {
  567. scanNumericLiteral(tok);
  568. } else {
  569. tok->f.kind = T_ERROR;
  570. }
  571. break;
  572. } // default
  573. } // switch
  574. }
  575. void Lexer::scanStringLiteral(Token *tok, unsigned char hint)
  576. {
  577. if (hint == 'L')
  578. tok->f.kind = T_WIDE_STRING_LITERAL;
  579. else if (hint == 'U')
  580. tok->f.kind = T_UTF32_STRING_LITERAL;
  581. else if (hint == 'u')
  582. tok->f.kind = T_UTF16_STRING_LITERAL;
  583. else if (hint == '8')
  584. tok->f.kind = T_UTF8_STRING_LITERAL;
  585. else if (hint == '@')
  586. tok->f.kind = T_AT_STRING_LITERAL;
  587. else
  588. tok->f.kind = T_STRING_LITERAL;
  589. scanUntilQuote(tok, '"');
  590. }
  591. void Lexer::scanRawStringLiteral(Token *tok, unsigned char hint)
  592. {
  593. const char *yytext = _currentChar;
  594. int delimLength = -1;
  595. const char *closingDelimCandidate = 0;
  596. while (_yychar) {
  597. if (_yychar == '(' && delimLength == -1) {
  598. delimLength = _currentChar - yytext;
  599. yyinp();
  600. } else if (_yychar == ')') {
  601. yyinp();
  602. if (delimLength == -1)
  603. break;
  604. closingDelimCandidate = _currentChar;
  605. } else {
  606. if (delimLength == -1) {
  607. if (_yychar == '\\' || std::isspace(_yychar))
  608. break;
  609. yyinp();
  610. } else {
  611. if (!closingDelimCandidate) {
  612. yyinp();
  613. } else {
  614. if (_yychar == '"') {
  615. if (delimLength == _currentChar - closingDelimCandidate) {
  616. // Got a matching closing delimiter.
  617. break;
  618. }
  619. }
  620. // Make sure this continues to be a valid candidate.
  621. if (_yychar != *(yytext + (_currentChar - closingDelimCandidate)))
  622. closingDelimCandidate = 0;
  623. yyinp();
  624. }
  625. }
  626. }
  627. }
  628. int yylen = _currentChar - yytext;
  629. if (_yychar == '"')
  630. yyinp();
  631. if (control())
  632. tok->string = control()->stringLiteral(yytext, yylen);
  633. if (hint == 'L')
  634. tok->f.kind = T_RAW_WIDE_STRING_LITERAL;
  635. else if (hint == 'U')
  636. tok->f.kind = T_RAW_UTF32_STRING_LITERAL;
  637. else if (hint == 'u')
  638. tok->f.kind = T_RAW_UTF16_STRING_LITERAL;
  639. else if (hint == '8')
  640. tok->f.kind = T_RAW_UTF8_STRING_LITERAL;
  641. else
  642. tok->f.kind = T_RAW_STRING_LITERAL;
  643. }
  644. void Lexer::scanCharLiteral(Token *tok, unsigned char hint)
  645. {
  646. if (hint == 'L')
  647. tok->f.kind = T_WIDE_CHAR_LITERAL;
  648. else if (hint == 'U')
  649. tok->f.kind = T_UTF32_CHAR_LITERAL;
  650. else if (hint == 'u')
  651. tok->f.kind = T_UTF16_CHAR_LITERAL;
  652. else
  653. tok->f.kind = T_CHAR_LITERAL;
  654. scanUntilQuote(tok, '\'');
  655. }
  656. void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
  657. {
  658. CPP_CHECK(quote == '"' || quote == '\'');
  659. const char *yytext = _currentChar;
  660. while (_yychar
  661. && _yychar != quote
  662. && _yychar != '\n') {
  663. if (_yychar == '\\')
  664. scanBackslash((Kind)tok->f.kind);
  665. else
  666. yyinp();
  667. }
  668. int yylen = _currentChar - yytext;
  669. if (_yychar == quote)
  670. yyinp();
  671. if (control())
  672. {
  673. tok->string = control()->stringLiteral(yytext, yylen);
  674. if (quote == '"')
  675. ((StringLiteral *)tok->string)->_quotedString = true;
  676. }
  677. }
  678. void Lexer::scanNumericLiteral(Token *tok)
  679. {
  680. const char *yytext = _currentChar - 1;
  681. while (_yychar) {
  682. if (_yychar == 'e' || _yychar == 'E') {
  683. yyinp();
  684. if (_yychar == '-' || _yychar == '+') {
  685. yyinp();
  686. // ### CPP_CHECK(std::isdigit(_yychar));
  687. }
  688. } else if (std::isalnum(_yychar) || _yychar == '.') {
  689. yyinp();
  690. } else {
  691. break;
  692. }
  693. }
  694. int yylen = _currentChar - yytext;
  695. tok->f.kind = T_NUMERIC_LITERAL;
  696. if (control())
  697. tok->number = control()->numericLiteral(yytext, yylen);
  698. }
  699. void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
  700. {
  701. const char *yytext = _currentChar - 1 - extraProcessedChars;
  702. while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
  703. || isByteOfMultiByteCodePoint(_yychar)) {
  704. yyinp();
  705. }
  706. int yylen = _currentChar - yytext;
  707. if (f._scanKeywords)
  708. tok->f.kind = classify(yytext, yylen, _languageFeatures);
  709. else
  710. tok->f.kind = T_IDENTIFIER;
  711. if (tok->f.kind == T_IDENTIFIER) {
  712. tok->f.kind = classifyOperator(yytext, yylen);
  713. if (control())
  714. tok->identifier = control()->identifier(yytext, yylen);
  715. }
  716. }
  717. void Lexer::scanBackslash(Kind type)
  718. {
  719. yyinp(); // skip '\\'
  720. if (_yychar && !std::isspace(_yychar)) {
  721. yyinp();
  722. return;
  723. }
  724. while (_yychar != '\n' && std::isspace(_yychar))
  725. yyinp();
  726. if (!_yychar) {
  727. s._tokenKind = type;
  728. s._newlineExpected = true;
  729. return;
  730. }
  731. if (_yychar == '\n') {
  732. yyinp();
  733. while (_yychar != '\n' && std::isspace(_yychar))
  734. yyinp();
  735. if (!_yychar)
  736. s._tokenKind = type;
  737. }
  738. }
  739. void Lexer::scanCppComment(Kind type)
  740. {
  741. while (_yychar && _yychar != '\n') {
  742. if (_yychar == '\\')
  743. scanBackslash(type);
  744. else if (_yychar)
  745. yyinp();
  746. }
  747. }