// Lexer.h
  1. // Copyright (c) 2008 Roberto Raggi <[email protected]>
  2. //
  3. // Permission is hereby granted, free of charge, to any person obtaining a copy
  4. // of this software and associated documentation files (the "Software"), to deal
  5. // in the Software without restriction, including without limitation the rights
  6. // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. // copies of the Software, and to permit persons to whom the Software is
  8. // furnished to do so, subject to the following conditions:
  9. //
  10. // The above copyright notice and this permission notice shall be included in
  11. // all copies or substantial portions of the Software.
  12. //
  13. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. // THE SOFTWARE.
  20. #ifndef CPLUSPLUS_LEXER_H
  21. #define CPLUSPLUS_LEXER_H
  22. #include "CPlusPlusForwardDeclarations.h"
  23. #include "Token.h"
  24. namespace CPlusPlus {
  25. class CPLUSPLUS_EXPORT Lexer
  26. {
  27. Lexer(const Lexer &other);
  28. void operator =(const Lexer &other);
  29. public:
  30. Lexer(TranslationUnit *unit);
  31. Lexer(const char *firstChar, const char *lastChar);
  32. ~Lexer();
  33. Control *control() const { return _control; }
  34. TranslationUnit *translationUnit() const;
  35. void scan(Token *tok);
  36. inline void operator()(Token *tok)
  37. { scan(tok); }
  38. bool scanCommentTokens() const;
  39. void setScanCommentTokens(bool onoff);
  40. bool scanKeywords() const;
  41. void setScanKeywords(bool onoff);
  42. bool scanAngleStringLiteralTokens() const;
  43. void setScanAngleStringLiteralTokens(bool onoff);
  44. void setStartWithNewline(bool enabled);
  45. int state() const;
  46. void setState(int state);
  47. LanguageFeatures languageFeatures() const { return _languageFeatures; }
  48. void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
  49. public:
  50. static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar,
  51. unsigned &utf16charCounter)
  52. {
  53. ++utf16charCounter;
  54. // Process multi-byte UTF-8 code point (non-latin1)
  55. if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(yychar))) {
  56. unsigned trailingBytesCurrentCodePoint = 1;
  57. for (unsigned char c = yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
  58. ++trailingBytesCurrentCodePoint;
  59. // Code points >= 0x00010000 are represented by two UTF-16 code units
  60. if (trailingBytesCurrentCodePoint >= 3)
  61. ++utf16charCounter;
  62. yychar = *(currentSourceChar += trailingBytesCurrentCodePoint + 1);
  63. // Process single-byte UTF-8 code point (latin1)
  64. } else {
  65. yychar = *++currentSourceChar;
  66. }
  67. }
  68. private:
  69. void pushLineStartOffset();
  70. void scan_helper(Token *tok);
  71. void setSource(const char *firstChar, const char *lastChar);
  72. static int classify(const char *string, int length, LanguageFeatures features);
  73. static int classifyObjCAtKeyword(const char *s, int n);
  74. static int classifyOperator(const char *string, int length);
  75. void scanStringLiteral(Token *tok, unsigned char hint = 0);
  76. void scanRawStringLiteral(Token *tok, unsigned char hint = 0);
  77. void scanCharLiteral(Token *tok, unsigned char hint = 0);
  78. void scanUntilQuote(Token *tok, unsigned char quote);
  79. void scanNumericLiteral(Token *tok);
  80. void scanIdentifier(Token *tok, unsigned extraProcessedChars = 0);
  81. void scanBackslash(Kind type);
  82. void scanCppComment(Kind type);
  83. static bool isByteOfMultiByteCodePoint(unsigned char byte)
  84. { return byte & 0x80; } // Check if most significant bit is set
  85. void yyinp()
  86. {
  87. yyinp_utf8(_currentChar, _yychar, _currentCharUtf16);
  88. if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
  89. pushLineStartOffset();
  90. }
  91. private:
  92. struct Flags {
  93. unsigned _scanCommentTokens: 1;
  94. unsigned _scanKeywords: 1;
  95. unsigned _scanAngleStringLiteralTokens: 1;
  96. };
  97. struct State {
  98. unsigned char _tokenKind : 7;
  99. unsigned char _newlineExpected : 1;
  100. };
  101. TranslationUnit *_translationUnit;
  102. Control *_control;
  103. const char *_firstChar;
  104. const char *_currentChar;
  105. const char *_lastChar;
  106. const char *_tokenStart;
  107. unsigned char _yychar;
  108. unsigned _currentCharUtf16;
  109. unsigned _tokenStartUtf16;
  110. union {
  111. unsigned char _state;
  112. State s;
  113. };
  114. union {
  115. unsigned _flags;
  116. Flags f;
  117. };
  118. unsigned _currentLine;
  119. LanguageFeatures _languageFeatures;
  120. };
  121. } // namespace CPlusPlus
  122. #endif // CPLUSPLUS_LEXER_H