Lexer.cpp 129 KB


  1. //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the Lexer and Token interfaces.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "clang/Lex/Lexer.h"
  14. #include "UnicodeCharSets.h"
  15. #include "clang/Basic/CharInfo.h"
  16. #include "clang/Basic/SourceManager.h"
  17. #include "clang/Lex/CodeCompletionHandler.h"
  18. #include "clang/Lex/LexDiagnostic.h"
  19. #include "clang/Lex/LiteralSupport.h"
  20. #include "clang/Lex/Preprocessor.h"
  21. #include "llvm/ADT/STLExtras.h"
  22. #include "llvm/ADT/StringExtras.h"
  23. #include "llvm/ADT/StringSwitch.h"
  24. #include "llvm/Support/Compiler.h"
  25. #include "llvm/Support/ConvertUTF.h"
  26. #include "llvm/Support/MemoryBuffer.h"
  27. #include <cstring>
  28. using namespace clang;
  29. //===----------------------------------------------------------------------===//
  30. // Token Class Implementation
  31. //===----------------------------------------------------------------------===//
  32. /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
  33. bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  34. if (IdentifierInfo *II = getIdentifierInfo())
  35. return II->getObjCKeywordID() == objcKey;
  36. return false;
  37. }
  38. /// getObjCKeywordID - Return the ObjC keyword kind.
  39. tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  40. IdentifierInfo *specId = getIdentifierInfo();
  41. return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
  42. }
  43. //===----------------------------------------------------------------------===//
  44. // Lexer Class Implementation
  45. //===----------------------------------------------------------------------===//
  46. void Lexer::anchor() { }
  47. void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
  48. const char *BufEnd) {
  49. BufferStart = BufStart;
  50. BufferPtr = BufPtr;
  51. BufferEnd = BufEnd;
  52. assert(BufEnd[0] == 0 &&
  53. "We assume that the input buffer has a null character at the end"
  54. " to simplify lexing!");
  55. // Check whether we have a BOM in the beginning of the buffer. If yes - act
  56. // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  57. // skip the UTF-8 BOM if it's present.
  58. if (BufferStart == BufferPtr) {
  59. // Determine the size of the BOM.
  60. StringRef Buf(BufferStart, BufferEnd - BufferStart);
  61. size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
  62. .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
  63. .Default(0);
  64. // Skip the BOM.
  65. BufferPtr += BOMLength;
  66. }
  67. Is_PragmaLexer = false;
  68. CurrentConflictMarkerState = CMK_None;
  69. // Start of the file is a start of line.
  70. IsAtStartOfLine = true;
  71. IsAtPhysicalStartOfLine = true;
  72. HasLeadingSpace = false;
  73. HasLeadingEmptyMacro = false;
  74. // We are not after parsing a #.
  75. ParsingPreprocessorDirective = false;
  76. // We are not after parsing #include.
  77. ParsingFilename = false;
  78. // We are not in raw mode. Raw mode disables diagnostics and interpretation
  79. // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  80. // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  81. // or otherwise skipping over tokens.
  82. LexingRawMode = false;
  83. // Default to not keeping comments.
  84. ExtendedTokenMode = 0;
  85. }
  86. /// Lexer constructor - Create a new lexer object for the specified buffer
  87. /// with the specified preprocessor managing the lexing process. This lexer
  88. /// assumes that the associated file buffer and Preprocessor objects will
  89. /// outlive it, so it doesn't take ownership of either of them.
  90. Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
  91. : PreprocessorLexer(&PP, FID),
  92. FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
  93. LangOpts(PP.getLangOpts()) {
  94. InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
  95. InputFile->getBufferEnd());
  96. resetExtendedTokenMode();
  97. }
  98. void Lexer::resetExtendedTokenMode() {
  99. assert(PP && "Cannot reset token mode without a preprocessor");
  100. if (LangOpts.TraditionalCPP)
  101. SetKeepWhitespaceMode(true);
  102. else
  103. SetCommentRetentionState(PP->getCommentRetentionState());
  104. }
  105. /// Lexer constructor - Create a new raw lexer object. This object is only
  106. /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
  107. /// range will outlive it, so it doesn't take ownership of it.
  108. Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
  109. const char *BufStart, const char *BufPtr, const char *BufEnd)
  110. : FileLoc(fileloc), LangOpts(langOpts) {
  111. InitLexer(BufStart, BufPtr, BufEnd);
  112. // We *are* in raw mode.
  113. LexingRawMode = true;
  114. }
  115. /// Lexer constructor - Create a new raw lexer object. This object is only
  116. /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
  117. /// range will outlive it, so it doesn't take ownership of it.
  118. Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
  119. const SourceManager &SM, const LangOptions &langOpts)
  120. : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
  121. FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
  122. /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
  123. /// _Pragma expansion. This has a variety of magic semantics that this method
  124. /// sets up. It returns a new'd Lexer that must be delete'd when done.
  125. ///
  126. /// On entrance to this routine, TokStartLoc is a macro location which has a
  127. /// spelling loc that indicates the bytes to be lexed for the token and an
  128. /// expansion location that indicates where all lexed tokens should be
  129. /// "expanded from".
  130. ///
  131. /// TODO: It would really be nice to make _Pragma just be a wrapper around a
  132. /// normal lexer that remaps tokens as they fly by. This would require making
  133. /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
  134. /// interface that could handle this stuff. This would pull GetMappedTokenLoc
  135. /// out of the critical path of the lexer!
  136. ///
  137. Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
  138. SourceLocation ExpansionLocStart,
  139. SourceLocation ExpansionLocEnd,
  140. unsigned TokLen, Preprocessor &PP) {
  141. SourceManager &SM = PP.getSourceManager();
  142. // Create the lexer as if we were going to lex the file normally.
  143. FileID SpellingFID = SM.getFileID(SpellingLoc);
  144. const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
  145. Lexer *L = new Lexer(SpellingFID, InputFile, PP);
  146. // Now that the lexer is created, change the start/end locations so that we
  147. // just lex the subsection of the file that we want. This is lexing from a
  148. // scratch buffer.
  149. const char *StrData = SM.getCharacterData(SpellingLoc);
  150. L->BufferPtr = StrData;
  151. L->BufferEnd = StrData+TokLen;
  152. assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
  153. // Set the SourceLocation with the remapping information. This ensures that
  154. // GetMappedTokenLoc will remap the tokens as they are lexed.
  155. L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
  156. ExpansionLocStart,
  157. ExpansionLocEnd, TokLen);
  158. // Ensure that the lexer thinks it is inside a directive, so that end \n will
  159. // return an EOD token.
  160. L->ParsingPreprocessorDirective = true;
  161. // This lexer really is for _Pragma.
  162. L->Is_PragmaLexer = true;
  163. return L;
  164. }
  165. /// Stringify - Convert the specified string into a C string, with surrounding
  166. /// ""'s, and with escaped \ and " characters.
  167. std::string Lexer::Stringify(StringRef Str, bool Charify) {
  168. std::string Result = Str;
  169. char Quote = Charify ? '\'' : '"';
  170. for (unsigned i = 0, e = Result.size(); i != e; ++i) {
  171. if (Result[i] == '\\' || Result[i] == Quote) {
  172. Result.insert(Result.begin()+i, '\\');
  173. ++i; ++e;
  174. }
  175. }
  176. return Result;
  177. }
  178. /// Stringify - Convert the specified string into a C string by escaping '\'
  179. /// and " characters. This does not add surrounding ""'s to the string.
  180. void Lexer::Stringify(SmallVectorImpl<char> &Str) {
  181. for (unsigned i = 0, e = Str.size(); i != e; ++i) {
  182. if (Str[i] == '\\' || Str[i] == '"') {
  183. Str.insert(Str.begin()+i, '\\');
  184. ++i; ++e;
  185. }
  186. }
  187. }
  188. //===----------------------------------------------------------------------===//
  189. // Token Spelling
  190. //===----------------------------------------------------------------------===//
  191. /// \brief Slow case of getSpelling. Extract the characters comprising the
  192. /// spelling of this token from the provided input buffer.
  193. static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
  194. const LangOptions &LangOpts, _Out_cap_x_(Tok.getLength()) char *Spelling) {
  195. assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
  196. size_t Length = 0;
  197. const char *BufEnd = BufPtr + Tok.getLength();
  198. if (Tok.is(tok::string_literal)) {
  199. // Munch the encoding-prefix and opening double-quote.
  200. while (BufPtr < BufEnd) {
  201. unsigned Size;
  202. Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
  203. BufPtr += Size;
  204. if (Spelling[Length - 1] == '"')
  205. break;
  206. }
  207. // Raw string literals need special handling; trigraph expansion and line
  208. // splicing do not occur within their d-char-sequence nor within their
  209. // r-char-sequence.
  210. if (Length >= 2 &&
  211. Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
  212. // Search backwards from the end of the token to find the matching closing
  213. // quote.
  214. const char *RawEnd = BufEnd;
  215. do --RawEnd; while (*RawEnd != '"');
  216. size_t RawLength = RawEnd - BufPtr + 1;
  217. // Everything between the quotes is included verbatim in the spelling.
  218. memcpy(Spelling + Length, BufPtr, RawLength);
  219. Length += RawLength;
  220. BufPtr += RawLength;
  221. // The rest of the token is lexed normally.
  222. }
  223. }
  224. while (BufPtr < BufEnd) {
  225. unsigned Size;
  226. Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
  227. BufPtr += Size;
  228. }
  229. assert(Length < Tok.getLength() &&
  230. "NeedsCleaning flag set on token that didn't need cleaning!");
  231. return Length;
  232. }
  233. /// getSpelling() - Return the 'spelling' of this token. The spelling of a
  234. /// token are the characters used to represent the token in the source file
  235. /// after trigraph expansion and escaped-newline folding. In particular, this
  236. /// wants to get the true, uncanonicalized, spelling of things like digraphs
  237. /// UCNs, etc.
  238. StringRef Lexer::getSpelling(SourceLocation loc,
  239. SmallVectorImpl<char> &buffer,
  240. const SourceManager &SM,
  241. const LangOptions &options,
  242. bool *invalid) {
  243. // Break down the source location.
  244. std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
  245. // Try to the load the file buffer.
  246. bool invalidTemp = false;
  247. StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  248. if (invalidTemp) {
  249. if (invalid) *invalid = true;
  250. return StringRef();
  251. }
  252. const char *tokenBegin = file.data() + locInfo.second;
  253. // Lex from the start of the given location.
  254. Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
  255. file.begin(), tokenBegin, file.end());
  256. Token token;
  257. lexer.LexFromRawLexer(token);
  258. unsigned length = token.getLength();
  259. // Common case: no need for cleaning.
  260. if (!token.needsCleaning())
  261. return StringRef(tokenBegin, length);
  262. // Hard case, we need to relex the characters into the string.
  263. buffer.resize(length);
  264. buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
  265. return StringRef(buffer.data(), buffer.size());
  266. }
  267. /// getSpelling() - Return the 'spelling' of this token. The spelling of a
  268. /// token are the characters used to represent the token in the source file
  269. /// after trigraph expansion and escaped-newline folding. In particular, this
  270. /// wants to get the true, uncanonicalized, spelling of things like digraphs
  271. /// UCNs, etc.
  272. std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
  273. const LangOptions &LangOpts, bool *Invalid) {
  274. assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
  275. bool CharDataInvalid = false;
  276. const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
  277. &CharDataInvalid);
  278. if (Invalid)
  279. *Invalid = CharDataInvalid;
  280. if (CharDataInvalid)
  281. return std::string();
  282. // If this token contains nothing interesting, return it directly.
  283. if (!Tok.needsCleaning())
  284. return std::string(TokStart, TokStart + Tok.getLength());
  285. std::string Result;
  286. Result.resize(Tok.getLength());
  287. Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
  288. return Result;
  289. }
  290. /// getSpelling - This method is used to get the spelling of a token into a
  291. /// preallocated buffer, instead of as an std::string. The caller is required
  292. /// to allocate enough space for the token, which is guaranteed to be at least
  293. /// Tok.getLength() bytes long. The actual length of the token is returned.
  294. ///
  295. /// Note that this method may do two possible things: it may either fill in
  296. /// the buffer specified with characters, or it may *change the input pointer*
  297. /// to point to a constant buffer with the data already in it (avoiding a
  298. /// copy). The caller is not allowed to modify the returned buffer pointer
  299. /// if an internal buffer is returned.
  300. unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
  301. const SourceManager &SourceMgr,
  302. const LangOptions &LangOpts, bool *Invalid) {
  303. assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
  304. const char *TokStart = nullptr;
  305. // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  306. if (Tok.is(tok::raw_identifier))
  307. TokStart = Tok.getRawIdentifier().data();
  308. else if (!Tok.hasUCN()) {
  309. if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
  310. // Just return the string from the identifier table, which is very quick.
  311. Buffer = II->getNameStart();
  312. return II->getLength();
  313. }
  314. }
  315. // NOTE: this can be checked even after testing for an IdentifierInfo.
  316. if (Tok.isLiteral())
  317. TokStart = Tok.getLiteralData();
  318. if (!TokStart) {
  319. // Compute the start of the token in the input lexer buffer.
  320. bool CharDataInvalid = false;
  321. TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
  322. if (Invalid)
  323. *Invalid = CharDataInvalid;
  324. if (CharDataInvalid) {
  325. Buffer = "";
  326. return 0;
  327. }
  328. }
  329. // If this token contains nothing interesting, return it directly.
  330. if (!Tok.needsCleaning()) {
  331. Buffer = TokStart;
  332. return Tok.getLength();
  333. }
  334. // Otherwise, hard case, relex the characters into the string.
  335. return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
  336. }
  337. /// MeasureTokenLength - Relex the token at the specified location and return
  338. /// its length in bytes in the input file. If the token needs cleaning (e.g.
  339. /// includes a trigraph or an escaped newline) then this count includes bytes
  340. /// that are part of that.
  341. unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
  342. const SourceManager &SM,
  343. const LangOptions &LangOpts) {
  344. Token TheTok;
  345. if (getRawToken(Loc, TheTok, SM, LangOpts))
  346. return 0;
  347. return TheTok.getLength();
  348. }
  349. /// \brief Relex the token at the specified location.
  350. /// \returns true if there was a failure, false on success.
  351. bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
  352. const SourceManager &SM,
  353. const LangOptions &LangOpts,
  354. bool IgnoreWhiteSpace) {
  355. // TODO: this could be special cased for common tokens like identifiers, ')',
  356. // etc to make this faster, if it mattered. Just look at StrData[0] to handle
  357. // all obviously single-char tokens. This could use
  358. // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  359. // something.
  360. // If this comes from a macro expansion, we really do want the macro name, not
  361. // the token this macro expanded to.
  362. Loc = SM.getExpansionLoc(Loc);
  363. std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  364. bool Invalid = false;
  365. StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  366. if (Invalid)
  367. return true;
  368. const char *StrData = Buffer.data()+LocInfo.second;
  369. if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
  370. return true;
  371. // Create a lexer starting at the beginning of this token.
  372. Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
  373. Buffer.begin(), StrData, Buffer.end());
  374. TheLexer.SetCommentRetentionState(true);
  375. TheLexer.LexFromRawLexer(Result);
  376. return false;
  377. }
  378. static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
  379. const SourceManager &SM,
  380. const LangOptions &LangOpts) {
  381. assert(Loc.isFileID());
  382. std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  383. if (LocInfo.first.isInvalid())
  384. return Loc;
  385. bool Invalid = false;
  386. StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  387. if (Invalid)
  388. return Loc;
  389. // Back up from the current location until we hit the beginning of a line
  390. // (or the buffer). We'll relex from that point.
  391. const char *BufStart = Buffer.data();
  392. if (LocInfo.second >= Buffer.size())
  393. return Loc;
  394. const char *StrData = BufStart+LocInfo.second;
  395. if (StrData[0] == '\n' || StrData[0] == '\r')
  396. return Loc;
  397. const char *LexStart = StrData;
  398. while (LexStart != BufStart) {
  399. if (LexStart[0] == '\n' || LexStart[0] == '\r') {
  400. ++LexStart;
  401. break;
  402. }
  403. --LexStart;
  404. }
  405. // Create a lexer starting at the beginning of this token.
  406. SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  407. Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
  408. TheLexer.SetCommentRetentionState(true);
  409. // Lex tokens until we find the token that contains the source location.
  410. Token TheTok;
  411. do {
  412. TheLexer.LexFromRawLexer(TheTok);
  413. if (TheLexer.getBufferLocation() > StrData) {
  414. // Lexing this token has taken the lexer past the source location we're
  415. // looking for. If the current token encompasses our source location,
  416. // return the beginning of that token.
  417. if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
  418. return TheTok.getLocation();
  419. // We ended up skipping over the source location entirely, which means
  420. // that it points into whitespace. We're done here.
  421. break;
  422. }
  423. } while (TheTok.getKind() != tok::eof);
  424. // We've passed our source location; just return the original source location.
  425. return Loc;
  426. }
  427. SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
  428. const SourceManager &SM,
  429. const LangOptions &LangOpts) {
  430. if (Loc.isFileID())
  431. return getBeginningOfFileToken(Loc, SM, LangOpts);
  432. if (!SM.isMacroArgExpansion(Loc))
  433. return Loc;
  434. SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  435. SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  436. std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  437. std::pair<FileID, unsigned> BeginFileLocInfo
  438. = SM.getDecomposedLoc(BeginFileLoc);
  439. assert(FileLocInfo.first == BeginFileLocInfo.first &&
  440. FileLocInfo.second >= BeginFileLocInfo.second);
  441. return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
  442. }
  namespace {

  /// How Lexer::ComputePreamble classifies a preprocessor directive keyword.
  enum PreambleDirectiveKind {
    /// A recognized directive that is simply stepped over.
    PDK_Skipped,
    /// Opens a conditional block (#if, #ifdef, #ifndef).
    PDK_StartIf,
    /// Closes a conditional block (#endif).
    PDK_EndIf,
    /// Any keyword that is not recognized.
    PDK_Unknown
  };

  } // end anonymous namespace
  /// Compute the extent of the "preamble" of \p Buffer: the leading run of
  /// comments and recognized preprocessor directives, scanned with a raw
  /// lexer that keeps comments as tokens.
  ///
  /// \param Buffer   the text to scan.
  /// \param LangOpts language options handed to the raw lexer.
  /// \param MaxLines when non-zero, the scan stops at the first line-starting
  ///                 token whose offset is at or past the byte following the
  ///                 MaxLines-th '\n' (no limit applies if that byte is the
  ///                 end of the buffer).
  /// \returns a pair: the distance in bytes from the lexer's start location
  ///          (which is after any skipped BOM) to the chosen end location,
  ///          and whether the token at which the scan stopped (or the '#' of
  ///          the outermost unterminated conditional) is at the start of a
  ///          line.
  451. std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
  452. const LangOptions &LangOpts,
  453. unsigned MaxLines) {
  454. // Create a lexer starting at the beginning of the file. Note that we use a
  455. // "fake" file source location at offset 1 so that the lexer will track our
  456. // position within the file.
  457. const unsigned StartOffset = 1;
  458. SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  459. Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
  460. Buffer.end());
  461. TheLexer.SetCommentRetentionState(true);
  462. // StartLoc will differ from FileLoc if there is a BOM that was skipped.
  463. SourceLocation StartLoc = TheLexer.getSourceLocation();
  464. bool InPreprocessorDirective = false;
  465. Token TheTok;
  // IfStartTok remembers the '#' of the outermost conditional that is still
  // open; IfCount is the current conditional nesting depth.
  466. Token IfStartTok;
  467. unsigned IfCount = 0;
  // Location of the first comment seen since the last directive, if any.
  468. SourceLocation ActiveCommentLoc;
  // Translate MaxLines into a byte offset: the position just past the
  // MaxLines-th newline. It stays 0 (meaning "no limit") when MaxLines is 0 or
  // that position is the end of the buffer.
  469. unsigned MaxLineOffset = 0;
  470. if (MaxLines) {
  471. const char *CurPtr = Buffer.begin();
  472. unsigned CurLine = 0;
  473. while (CurPtr != Buffer.end()) {
  474. char ch = *CurPtr++;
  475. if (ch == '\n') {
  476. ++CurLine;
  477. if (CurLine == MaxLines)
  478. break;
  479. }
  480. }
  481. if (CurPtr != Buffer.end())
  482. MaxLineOffset = CurPtr - Buffer.begin();
  483. }
  // Main scan. Every 'continue' below moves on to the next token; reaching
  // the bottom of the loop body, or any 'break' that is not inside the
  // switch, ends the preamble.
  484. do {
  485. TheLexer.LexFromRawLexer(TheTok);
  486. if (InPreprocessorDirective) {
  487. // If we've hit the end of the file, we're done.
  488. if (TheTok.getKind() == tok::eof) {
  489. break;
  490. }
  491. // If we haven't hit the end of the preprocessor directive, skip this
  492. // token.
  493. if (!TheTok.isAtStartOfLine())
  494. continue;
  495. // We've passed the end of the preprocessor directive, and will look
  496. // at this token again below.
  497. InPreprocessorDirective = false;
  498. }
  499. // Keep track of the # of lines in the preamble.
  500. if (TheTok.isAtStartOfLine()) {
  // Token offsets are relative to the fake start offset chosen above.
  501. unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
  502. // If we were asked to limit the number of lines in the preamble,
  503. // and we're about to exceed that limit, we're done.
  504. if (MaxLineOffset && TokOffset >= MaxLineOffset)
  505. break;
  506. }
  507. // Comments are okay; skip over them.
  508. if (TheTok.getKind() == tok::comment) {
  509. if (ActiveCommentLoc.isInvalid())
  510. ActiveCommentLoc = TheTok.getLocation();
  511. continue;
  512. }
  513. if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
  514. // This is the start of a preprocessor directive.
  515. Token HashTok = TheTok;
  516. InPreprocessorDirective = true;
  // Comments seen before a directive no longer count as "active".
  517. ActiveCommentLoc = SourceLocation();
  518. // Figure out which directive this is. Since we're lexing raw tokens,
  519. // we don't have an identifier table available. Instead, just look at
  520. // the raw identifier to recognize and categorize preprocessor directives.
  521. TheLexer.LexFromRawLexer(TheTok);
  522. if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
  523. StringRef Keyword = TheTok.getRawIdentifier();
  524. PreambleDirectiveKind PDK
  525. = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
  526. .Case("include", PDK_Skipped)
  527. .Case("__include_macros", PDK_Skipped)
  528. .Case("define", PDK_Skipped)
  529. .Case("undef", PDK_Skipped)
  530. .Case("line", PDK_Skipped)
  531. .Case("error", PDK_Skipped)
  532. .Case("pragma", PDK_Skipped)
  533. .Case("import", PDK_Skipped)
  534. .Case("include_next", PDK_Skipped)
  535. .Case("warning", PDK_Skipped)
  536. .Case("ident", PDK_Skipped)
  537. .Case("sccs", PDK_Skipped)
  538. .Case("assert", PDK_Skipped)
  539. .Case("unassert", PDK_Skipped)
  540. .Case("if", PDK_StartIf)
  541. .Case("ifdef", PDK_StartIf)
  542. .Case("ifndef", PDK_StartIf)
  543. .Case("elif", PDK_Skipped)
  544. .Case("else", PDK_Skipped)
  545. .Case("endif", PDK_EndIf)
  546. .Default(PDK_Unknown);
  // Within this switch, 'continue' resumes the enclosing loop while 'break'
  // only leaves the switch and falls through to the rollback code below.
  547. switch (PDK) {
  548. case PDK_Skipped:
  549. continue;
  550. case PDK_StartIf:
  551. if (IfCount == 0)
  552. IfStartTok = HashTok;
  553. ++IfCount;
  554. continue;
  555. case PDK_EndIf:
  556. // Mismatched #endif. The preamble ends here.
  557. if (IfCount == 0)
  558. break;
  559. --IfCount;
  560. continue;
  561. case PDK_Unknown:
  562. // We don't know what this directive is; stop at the '#'.
  563. break;
  564. }
  565. }
  566. // We only end up here if we didn't recognize the preprocessor
  567. // directive or it was one that can't occur in the preamble at this
  568. // point. Roll back the current token to the location of the '#'.
  569. InPreprocessorDirective = false;
  570. TheTok = HashTok;
  571. }
  572. // We hit a token that we don't recognize as being in the
  573. // "preprocessing only" part of the file, so we're no longer in
  574. // the preamble.
  575. break;
  576. } while (true);
  // Pick the end of the preamble: the '#' of the outermost unterminated
  // conditional if one is open, else the first active comment, else the token
  // at which scanning stopped.
  577. SourceLocation End;
  578. if (IfCount)
  579. End = IfStartTok.getLocation();
  580. else if (ActiveCommentLoc.isValid())
  581. End = ActiveCommentLoc; // don't truncate a decl comment.
  582. else
  583. End = TheTok.getLocation();
  584. return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
  585. IfCount? IfStartTok.isAtStartOfLine()
  586. : TheTok.isAtStartOfLine());
  587. }
  588. /// AdvanceToTokenCharacter - Given a location that specifies the start of a
  589. /// token, return a new location that specifies a character within the token.
  590. SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
  591. unsigned CharNo,
  592. const SourceManager &SM,
  593. const LangOptions &LangOpts) {
  594. // Figure out how many physical characters away the specified expansion
  595. // character is. This needs to take into consideration newlines and
  596. // trigraphs.
  597. bool Invalid = false;
  598. const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
  599. // If they request the first char of the token, we're trivially done.
  600. if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
  601. return TokStart;
  602. unsigned PhysOffset = 0;
  603. // The usual case is that tokens don't contain anything interesting. Skip
  604. // over the uninteresting characters. If a token only consists of simple
  605. // chars, this method is extremely fast.
  606. while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
  607. if (CharNo == 0)
  608. return TokStart.getLocWithOffset(PhysOffset);
  609. ++TokPtr, --CharNo, ++PhysOffset;
  610. }
  611. // If we have a character that may be a trigraph or escaped newline, use a
  612. // lexer to parse it correctly.
  613. for (; CharNo; --CharNo) {
  614. unsigned Size;
  615. Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
  616. TokPtr += Size;
  617. PhysOffset += Size;
  618. }
  619. // Final detail: if we end up on an escaped newline, we want to return the
  620. // location of the actual byte of the token. For example foo\<newline>bar
  621. // advanced by 3 should return the location of b, not of \\. One compounding
  622. // detail of this is that the escape may be made by a trigraph.
  623. if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
  624. PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
  625. return TokStart.getLocWithOffset(PhysOffset);
  626. }
  627. /// \brief Computes the source location just past the end of the
  628. /// token at this source location.
  629. ///
  630. /// This routine can be used to produce a source location that
  631. /// points just past the end of the token referenced by \p Loc, and
  632. /// is generally used when a diagnostic needs to point just after a
  633. /// token where it expected something different that it received. If
  634. /// the returned source location would not be meaningful (e.g., if
  635. /// it points into a macro), this routine returns an invalid
  636. /// source location.
  637. ///
  638. /// \param Offset an offset from the end of the token, where the source
  639. /// location should refer to. The default offset (0) produces a source
  640. /// location pointing just past the end of the token; an offset of 1 produces
  641. /// a source location pointing to the last character in the token, etc.
  642. SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
  643. const SourceManager &SM,
  644. const LangOptions &LangOpts) {
  645. if (Loc.isInvalid())
  646. return SourceLocation();
  647. if (Loc.isMacroID()) {
  648. if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
  649. return SourceLocation(); // Points inside the macro expansion.
  650. }
  651. unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  652. if (Len > Offset)
  653. Len = Len - Offset;
  654. else
  655. return Loc;
  656. return Loc.getLocWithOffset(Len);
  657. }
  658. /// \brief Returns true if the given MacroID location points at the first
  659. /// token of the macro expansion.
  660. bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
  661. const SourceManager &SM,
  662. const LangOptions &LangOpts,
  663. SourceLocation *MacroBegin) {
  664. assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
  665. SourceLocation expansionLoc;
  666. if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
  667. return false;
  668. if (expansionLoc.isFileID()) {
  669. // No other macro expansions, this is the first.
  670. if (MacroBegin)
  671. *MacroBegin = expansionLoc;
  672. return true;
  673. }
  674. return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
  675. }
  676. /// \brief Returns true if the given MacroID location points at the last
  677. /// token of the macro expansion.
  678. bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
  679. const SourceManager &SM,
  680. const LangOptions &LangOpts,
  681. SourceLocation *MacroEnd) {
  682. assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
  683. SourceLocation spellLoc = SM.getSpellingLoc(loc);
  684. unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  685. if (tokLen == 0)
  686. return false;
  687. SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  688. SourceLocation expansionLoc;
  689. if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
  690. return false;
  691. if (expansionLoc.isFileID()) {
  692. // No other macro expansions.
  693. if (MacroEnd)
  694. *MacroEnd = expansionLoc;
  695. return true;
  696. }
  697. return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
  698. }
  699. static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
  700. const SourceManager &SM,
  701. const LangOptions &LangOpts) {
  702. SourceLocation Begin = Range.getBegin();
  703. SourceLocation End = Range.getEnd();
  704. assert(Begin.isFileID() && End.isFileID());
  705. if (Range.isTokenRange()) {
  706. End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
  707. if (End.isInvalid())
  708. return CharSourceRange();
  709. }
  710. // Break down the source locations.
  711. FileID FID;
  712. unsigned BeginOffs;
  713. std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  714. if (FID.isInvalid())
  715. return CharSourceRange();
  716. unsigned EndOffs;
  717. if (!SM.isInFileID(End, FID, &EndOffs) ||
  718. BeginOffs > EndOffs)
  719. return CharSourceRange();
  720. return CharSourceRange::getCharRange(Begin, End);
  721. }
  722. CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
  723. const SourceManager &SM,
  724. const LangOptions &LangOpts) {
  725. SourceLocation Begin = Range.getBegin();
  726. SourceLocation End = Range.getEnd();
  727. if (Begin.isInvalid() || End.isInvalid())
  728. return CharSourceRange();
  729. if (Begin.isFileID() && End.isFileID())
  730. return makeRangeFromFileLocs(Range, SM, LangOpts);
  731. if (Begin.isMacroID() && End.isFileID()) {
  732. if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
  733. return CharSourceRange();
  734. Range.setBegin(Begin);
  735. return makeRangeFromFileLocs(Range, SM, LangOpts);
  736. }
  737. if (Begin.isFileID() && End.isMacroID()) {
  738. if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
  739. &End)) ||
  740. (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
  741. &End)))
  742. return CharSourceRange();
  743. Range.setEnd(End);
  744. return makeRangeFromFileLocs(Range, SM, LangOpts);
  745. }
  746. assert(Begin.isMacroID() && End.isMacroID());
  747. SourceLocation MacroBegin, MacroEnd;
  748. if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
  749. ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
  750. &MacroEnd)) ||
  751. (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
  752. &MacroEnd)))) {
  753. Range.setBegin(MacroBegin);
  754. Range.setEnd(MacroEnd);
  755. return makeRangeFromFileLocs(Range, SM, LangOpts);
  756. }
  757. bool Invalid = false;
  758. const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
  759. &Invalid);
  760. if (Invalid)
  761. return CharSourceRange();
  762. if (BeginEntry.getExpansion().isMacroArgExpansion()) {
  763. const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
  764. &Invalid);
  765. if (Invalid)
  766. return CharSourceRange();
  767. if (EndEntry.getExpansion().isMacroArgExpansion() &&
  768. BeginEntry.getExpansion().getExpansionLocStart() ==
  769. EndEntry.getExpansion().getExpansionLocStart()) {
  770. Range.setBegin(SM.getImmediateSpellingLoc(Begin));
  771. Range.setEnd(SM.getImmediateSpellingLoc(End));
  772. return makeFileCharRange(Range, SM, LangOpts);
  773. }
  774. }
  775. return CharSourceRange();
  776. }
  777. StringRef Lexer::getSourceText(CharSourceRange Range,
  778. const SourceManager &SM,
  779. const LangOptions &LangOpts,
  780. bool *Invalid) {
  781. Range = makeFileCharRange(Range, SM, LangOpts);
  782. if (Range.isInvalid()) {
  783. if (Invalid) *Invalid = true;
  784. return StringRef();
  785. }
  786. // Break down the source location.
  787. std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  788. if (beginInfo.first.isInvalid()) {
  789. if (Invalid) *Invalid = true;
  790. return StringRef();
  791. }
  792. unsigned EndOffs;
  793. if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
  794. beginInfo.second > EndOffs) {
  795. if (Invalid) *Invalid = true;
  796. return StringRef();
  797. }
  798. // Try to the load the file buffer.
  799. bool invalidTemp = false;
  800. StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  801. if (invalidTemp) {
  802. if (Invalid) *Invalid = true;
  803. return StringRef();
  804. }
  805. if (Invalid) *Invalid = false;
  806. return file.substr(beginInfo.second, EndOffs - beginInfo.second);
  807. }
  808. StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
  809. const SourceManager &SM,
  810. const LangOptions &LangOpts) {
  811. assert(Loc.isMacroID() && "Only reasonble to call this on macros");
  812. // Find the location of the immediate macro expansion.
  813. while (1) {
  814. FileID FID = SM.getFileID(Loc);
  815. const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
  816. const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
  817. Loc = Expansion.getExpansionLocStart();
  818. if (!Expansion.isMacroArgExpansion())
  819. break;
  820. // For macro arguments we need to check that the argument did not come
  821. // from an inner macro, e.g: "MAC1( MAC2(foo) )"
  822. // Loc points to the argument id of the macro definition, move to the
  823. // macro expansion.
  824. Loc = SM.getImmediateExpansionRange(Loc).first;
  825. SourceLocation SpellLoc = Expansion.getSpellingLoc();
  826. if (SpellLoc.isFileID())
  827. break; // No inner macro.
  828. // If spelling location resides in the same FileID as macro expansion
  829. // location, it means there is no inner macro.
  830. FileID MacroFID = SM.getFileID(Loc);
  831. if (SM.isInFileID(SpellLoc, MacroFID))
  832. break;
  833. // Argument came from inner macro.
  834. Loc = SpellLoc;
  835. }
  836. // Find the spelling location of the start of the non-argument expansion
  837. // range. This is where the macro name was spelled in order to begin
  838. // expanding this macro.
  839. Loc = SM.getSpellingLoc(Loc);
  840. // Dig out the buffer where the macro name was spelled and the extents of the
  841. // name so that we can render it into the expansion note.
  842. std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  843. unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  844. StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  845. return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
  846. }
  847. bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  848. return isIdentifierBody(c, LangOpts.DollarIdents);
  849. }
  850. //===----------------------------------------------------------------------===//
  851. // Diagnostics forwarding code.
  852. //===----------------------------------------------------------------------===//
  853. /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
  854. /// lexer buffer was all expanded at a single point, perform the mapping.
  855. /// This is currently only used for _Pragma implementation, so it is the slow
  856. /// path of the hot getSourceLocation method. Do not allow it to be inlined.
  857. static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
  858. Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
  859. static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
  860. SourceLocation FileLoc,
  861. unsigned CharNo, unsigned TokLen) {
  862. assert(FileLoc.isMacroID() && "Must be a macro expansion");
  863. // Otherwise, we're lexing "mapped tokens". This is used for things like
  864. // _Pragma handling. Combine the expansion location of FileLoc with the
  865. // spelling location.
  866. SourceManager &SM = PP.getSourceManager();
  867. // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  868. // characters come from spelling(FileLoc)+Offset.
  869. SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  870. SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
  871. // Figure out the expansion loc range, which is the range covered by the
  872. // original _Pragma(...) sequence.
  873. std::pair<SourceLocation,SourceLocation> II =
  874. SM.getImmediateExpansionRange(FileLoc);
  875. return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
  876. }
  877. /// getSourceLocation - Return a source location identifier for the specified
  878. /// offset in the current file.
  879. SourceLocation Lexer::getSourceLocation(const char *Loc,
  880. unsigned TokLen) const {
  881. assert(Loc >= BufferStart && Loc <= BufferEnd &&
  882. "Location out of range for this buffer!");
  883. // In the normal case, we're just lexing from a simple file buffer, return
  884. // the file id from FileLoc with the offset specified.
  885. unsigned CharNo = Loc-BufferStart;
  886. if (FileLoc.isFileID())
  887. return FileLoc.getLocWithOffset(CharNo);
  888. // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  889. // tokens are lexed from where the _Pragma was defined.
  890. assert(PP && "This doesn't work on raw lexers");
  891. return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
  892. }
  893. /// Diag - Forwarding function for diagnostics. This translate a source
  894. /// position in the current buffer into a SourceLocation object for rendering.
  895. DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  896. return PP->Diag(getSourceLocation(Loc), DiagID);
  897. }
  898. //===----------------------------------------------------------------------===//
  899. // Trigraph and Escaped Newline Handling Code.
  900. //===----------------------------------------------------------------------===//
  /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
  /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
  static char GetTrigraphCharForLetter(char Letter) {
    // Parallel tables: Suffixes[i] is the third trigraph character and
    // Decoded[i] is the character the trigraph stands for.
    static const char Suffixes[] = "=)(!'>/<-";
    static const char Decoded[] = "#][|^}\\{~";

    for (unsigned Index = 0; Suffixes[Index] != '\0'; ++Index)
      if (Suffixes[Index] == Letter)
        return Decoded[Index];
    return 0;
  }
  917. /// DecodeTrigraphChar - If the specified character is a legal trigraph when
  918. /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
  919. /// return the result character. Finally, emit a warning about trigraph use
  920. /// whether trigraphs are enabled or not.
  921. static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  922. char Res = GetTrigraphCharForLetter(*CP);
  923. if (!Res || !L) return Res;
  924. if (!L->getLangOpts().Trigraphs) {
  925. if (!L->isLexingRawMode())
  926. L->Diag(CP-2, diag::trigraph_ignored);
  927. return 0;
  928. }
  929. if (!L->isLexingRawMode())
  930. L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  931. return Res;
  932. }
  933. /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  934. /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
  935. /// trigraph equivalent on entry to this function.
  936. unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  937. unsigned Size = 0;
  938. while (isWhitespace(Ptr[Size])) {
  939. ++Size;
  940. if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
  941. continue;
  942. // If this is a \r\n or \n\r, skip the other half.
  943. if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
  944. Ptr[Size-1] != Ptr[Size])
  945. ++Size;
  946. return Size;
  947. }
  948. // Not an escaped newline, must be a \t or something else.
  949. return 0;
  950. }
  951. /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  952. /// them), skip over them and return the first non-escaped-newline found,
  953. /// otherwise return P.
  954. const char *Lexer::SkipEscapedNewLines(const char *P) {
  955. while (1) {
  956. const char *AfterEscape;
  957. if (*P == '\\') {
  958. AfterEscape = P+1;
  959. } else if (*P == '?') {
  960. // If not a trigraph for escape, bail out.
  961. if (P[1] != '?' || P[2] != '/')
  962. return P;
  963. AfterEscape = P+3;
  964. } else {
  965. return P;
  966. }
  967. unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
  968. if (NewLineSize == 0) return P;
  969. P = AfterEscape+NewLineSize;
  970. }
  971. }
  972. /// \brief Checks that the given token is the first token that occurs after the
  973. /// given location (this excludes comments and whitespace). Returns the location
  974. /// immediately after the specified token. If the token is not found or the
  975. /// location is inside a macro, the returned source location will be invalid.
  976. SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
  977. tok::TokenKind TKind,
  978. const SourceManager &SM,
  979. const LangOptions &LangOpts,
  980. bool SkipTrailingWhitespaceAndNewLine) {
  981. if (Loc.isMacroID()) {
  982. if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
  983. return SourceLocation();
  984. }
  985. Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
  986. // Break down the source location.
  987. std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  988. // Try to load the file buffer.
  989. bool InvalidTemp = false;
  990. StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  991. if (InvalidTemp)
  992. return SourceLocation();
  993. const char *TokenBegin = File.data() + LocInfo.second;
  994. // Lex from the start of the given location.
  995. Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
  996. TokenBegin, File.end());
  997. // Find the token.
  998. Token Tok;
  999. lexer.LexFromRawLexer(Tok);
  1000. if (Tok.isNot(TKind))
  1001. return SourceLocation();
  1002. SourceLocation TokenLoc = Tok.getLocation();
  1003. // Calculate how much whitespace needs to be skipped if any.
  1004. unsigned NumWhitespaceChars = 0;
  1005. if (SkipTrailingWhitespaceAndNewLine) {
  1006. const char *TokenEnd = SM.getCharacterData(TokenLoc) +
  1007. Tok.getLength();
  1008. unsigned char C = *TokenEnd;
  1009. while (isHorizontalWhitespace(C)) {
  1010. C = *(++TokenEnd);
  1011. NumWhitespaceChars++;
  1012. }
  1013. // Skip \r, \n, \r\n, or \n\r
  1014. if (C == '\n' || C == '\r') {
  1015. char PrevC = C;
  1016. C = *(++TokenEnd);
  1017. NumWhitespaceChars++;
  1018. if ((C == '\n' || C == '\r') && C != PrevC)
  1019. NumWhitespaceChars++;
  1020. }
  1021. }
  1022. return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
  1023. }
  1024. /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
  1025. /// get its size, and return it. This is tricky in several cases:
  1026. /// 1. If currently at the start of a trigraph, we warn about the trigraph,
  1027. /// then either return the trigraph (skipping 3 chars) or the '?',
  1028. /// depending on whether trigraphs are enabled or not.
  1029. /// 2. If this is an escaped newline (potentially with whitespace between
  1030. /// the backslash and newline), implicitly skip the newline and return
  1031. /// the char after it.
  1032. ///
  1033. /// This handles the slow/uncommon case of the getCharAndSize method. Here we
  1034. /// know that we can accumulate into Size, and that we have already incremented
  1035. /// Ptr by Size bytes.
  1036. ///
  1037. /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
  1038. /// be updated to match.
  1039. ///
  1040. char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
  1041. Token *Tok) {
  1042. // If we have a slash, look for an escaped newline.
  1043. if (Ptr[0] == '\\') {
  1044. ++Size;
  1045. ++Ptr;
  1046. Slash:
  1047. // Common case, backslash-char where the char is not whitespace.
  1048. if (!isWhitespace(Ptr[0])) return '\\';
  1049. // See if we have optional whitespace characters between the slash and
  1050. // newline.
  1051. if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
  1052. // Remember that this token needs to be cleaned.
  1053. if (Tok) Tok->setFlag(Token::NeedsCleaning);
  1054. // Warn if there was whitespace between the backslash and newline.
  1055. if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
  1056. Diag(Ptr, diag::backslash_newline_space);
  1057. // Found backslash<whitespace><newline>. Parse the char after it.
  1058. Size += EscapedNewLineSize;
  1059. Ptr += EscapedNewLineSize;
  1060. // If the char that we finally got was a \n, then we must have had
  1061. // something like \<newline><newline>. We don't want to consume the
  1062. // second newline.
  1063. if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
  1064. return ' ';
  1065. // Use slow version to accumulate a correct size field.
  1066. return getCharAndSizeSlow(Ptr, Size, Tok);
  1067. }
  1068. // Otherwise, this is not an escaped newline, just return the slash.
  1069. return '\\';
  1070. }
  1071. // If this is a trigraph, process it.
  1072. if (Ptr[0] == '?' && Ptr[1] == '?') {
  1073. // If this is actually a legal trigraph (not something like "??x"), emit
  1074. // a trigraph warning. If so, and if trigraphs are enabled, return it.
  1075. if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
  1076. // Remember that this token needs to be cleaned.
  1077. if (Tok) Tok->setFlag(Token::NeedsCleaning);
  1078. Ptr += 3;
  1079. Size += 3;
  1080. if (C == '\\') goto Slash;
  1081. return C;
  1082. }
  1083. }
  1084. // If this is neither, return a single character.
  1085. ++Size;
  1086. return *Ptr;
  1087. }
  1088. /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
  1089. /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
  1090. /// and that we have already incremented Ptr by Size bytes.
  1091. ///
  1092. /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
  1093. /// be updated to match.
  1094. char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
  1095. const LangOptions &LangOpts) {
  1096. // If we have a slash, look for an escaped newline.
  1097. if (Ptr[0] == '\\') {
  1098. ++Size;
  1099. ++Ptr;
  1100. Slash:
  1101. // Common case, backslash-char where the char is not whitespace.
  1102. if (!isWhitespace(Ptr[0])) return '\\';
  1103. // See if we have optional whitespace characters followed by a newline.
  1104. if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
  1105. // Found backslash<whitespace><newline>. Parse the char after it.
  1106. Size += EscapedNewLineSize;
  1107. Ptr += EscapedNewLineSize;
  1108. // If the char that we finally got was a \n, then we must have had
  1109. // something like \<newline><newline>. We don't want to consume the
  1110. // second newline.
  1111. if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
  1112. return ' ';
  1113. // Use slow version to accumulate a correct size field.
  1114. return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
  1115. }
  1116. // Otherwise, this is not an escaped newline, just return the slash.
  1117. return '\\';
  1118. }
  1119. // If this is a trigraph, process it.
  1120. if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
  1121. // If this is actually a legal trigraph (not something like "??x"), return
  1122. // it.
  1123. if (char C = GetTrigraphCharForLetter(Ptr[2])) {
  1124. Ptr += 3;
  1125. Size += 3;
  1126. if (C == '\\') goto Slash;
  1127. return C;
  1128. }
  1129. }
  1130. // If this is neither, return a single character.
  1131. ++Size;
  1132. return *Ptr;
  1133. }
  1134. //===----------------------------------------------------------------------===//
  1135. // Helper methods for lexing.
  1136. //===----------------------------------------------------------------------===//
  1137. /// \brief Routine that indiscriminately skips bytes in the source file.
  1138. void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
  1139. BufferPtr += Bytes;
  1140. if (BufferPtr > BufferEnd)
  1141. BufferPtr = BufferEnd;
  1142. // FIXME: What exactly does the StartOfLine bit mean? There are two
  1143. // possible meanings for the "start" of the line: the first token on the
  1144. // unexpanded line, or the first token on the expanded line.
  1145. IsAtStartOfLine = StartOfLine;
  1146. IsAtPhysicalStartOfLine = StartOfLine;
  1147. }
  1148. static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
  1149. if (LangOpts.CPlusPlus11 || LangOpts.C11) {
  1150. static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
  1151. C11AllowedIDCharRanges);
  1152. return C11AllowedIDChars.contains(C);
  1153. } else if (LangOpts.CPlusPlus) {
  1154. static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
  1155. CXX03AllowedIDCharRanges);
  1156. return CXX03AllowedIDChars.contains(C);
  1157. } else {
  1158. static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
  1159. C99AllowedIDCharRanges);
  1160. return C99AllowedIDChars.contains(C);
  1161. }
  1162. }
  1163. static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
  1164. assert(isAllowedIDChar(C, LangOpts));
  1165. if (LangOpts.CPlusPlus11 || LangOpts.C11) {
  1166. static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
  1167. C11DisallowedInitialIDCharRanges);
  1168. return !C11DisallowedInitialIDChars.contains(C);
  1169. } else if (LangOpts.CPlusPlus) {
  1170. return true;
  1171. } else {
  1172. static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
  1173. C99DisallowedInitialIDCharRanges);
  1174. return !C99DisallowedInitialIDChars.contains(C);
  1175. }
  1176. }
  1177. static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
  1178. const char *End) {
  1179. return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
  1180. L.getSourceLocation(End));
  1181. }
  1182. static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
  1183. CharSourceRange Range, bool IsFirst) {
  1184. // Check C99 compatibility.
  1185. if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
  1186. enum {
  1187. CannotAppearInIdentifier = 0,
  1188. CannotStartIdentifier
  1189. };
  1190. static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
  1191. C99AllowedIDCharRanges);
  1192. static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
  1193. C99DisallowedInitialIDCharRanges);
  1194. if (!C99AllowedIDChars.contains(C)) {
  1195. Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
  1196. << Range
  1197. << CannotAppearInIdentifier;
  1198. } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
  1199. Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
  1200. << Range
  1201. << CannotStartIdentifier;
  1202. }
  1203. }
  1204. // Check C++98 compatibility.
  1205. if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
  1206. static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
  1207. CXX03AllowedIDCharRanges);
  1208. if (!CXX03AllowedIDChars.contains(C)) {
  1209. Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
  1210. << Range;
  1211. }
  1212. }
  1213. }
  1214. bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
  1215. Token &Result) {
  1216. const char *UCNPtr = CurPtr + Size;
  1217. uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  1218. if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
  1219. return false;
  1220. if (!isLexingRawMode())
  1221. maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
  1222. makeCharRange(*this, CurPtr, UCNPtr),
  1223. /*IsFirst=*/false);
  1224. Result.setFlag(Token::HasUCN);
  1225. if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
  1226. (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
  1227. CurPtr = UCNPtr;
  1228. else
  1229. while (CurPtr != UCNPtr)
  1230. (void)getAndAdvanceChar(CurPtr, Result);
  1231. return true;
  1232. }
  1233. bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
  1234. const char *UnicodePtr = CurPtr;
  1235. UTF32 CodePoint;
  1236. ConversionResult Result =
  1237. llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
  1238. (const UTF8 *)BufferEnd,
  1239. &CodePoint,
  1240. strictConversion);
  1241. if (Result != conversionOK ||
  1242. !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
  1243. return false;
  1244. if (!isLexingRawMode())
  1245. maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
  1246. makeCharRange(*this, CurPtr, UnicodePtr),
  1247. /*IsFirst=*/false);
  1248. CurPtr = UnicodePtr;
  1249. return true;
  1250. }
  1251. bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  1252. // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  1253. unsigned Size;
  1254. unsigned char C = *CurPtr++;
  1255. while (isIdentifierBody(C))
  1256. C = *CurPtr++;
  1257. --CurPtr; // Back up over the skipped character.
  1258. // Fast path, no $,\,? in identifier found. '\' might be an escaped newline
  1259. // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  1260. //
  1261. // TODO: Could merge these checks into an InfoTable flag to make the
  1262. // comparison cheaper
  1263. if (isASCII(C) && C != '\\' && C != '?' &&
  1264. (C != '$' || !LangOpts.DollarIdents)) {
  1265. FinishIdentifier:
  1266. const char *IdStart = BufferPtr;
  1267. FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  1268. Result.setRawIdentifierData(IdStart);
  1269. // If we are in raw mode, return this identifier raw. There is no need to
  1270. // look up identifier information or attempt to macro expand it.
  1271. if (LexingRawMode)
  1272. return true;
  1273. // Fill in Result.IdentifierInfo and update the token kind,
  1274. // looking up the identifier in the identifier table.
  1275. IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  1276. // Finally, now that we know we have an identifier, pass this off to the
  1277. // preprocessor, which may macro expand it or something.
  1278. if (II->isHandleIdentifierCase())
  1279. return PP->HandleIdentifier(Result);
  1280. return true;
  1281. }
  1282. // Otherwise, $,\,? in identifier found. Enter slower path.
  1283. C = getCharAndSize(CurPtr, Size);
  1284. while (1) {
  1285. if (C == '$') {
  1286. // If we hit a $ and they are not supported in identifiers, we are done.
  1287. if (!LangOpts.DollarIdents) goto FinishIdentifier;
  1288. // Otherwise, emit a diagnostic and continue.
  1289. if (!isLexingRawMode())
  1290. Diag(CurPtr, diag::ext_dollar_in_identifier);
  1291. CurPtr = ConsumeChar(CurPtr, Size, Result);
  1292. C = getCharAndSize(CurPtr, Size);
  1293. continue;
  1294. } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
  1295. C = getCharAndSize(CurPtr, Size);
  1296. continue;
  1297. } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
  1298. C = getCharAndSize(CurPtr, Size);
  1299. continue;
  1300. } else if (!isIdentifierBody(C)) {
  1301. goto FinishIdentifier;
  1302. }
  1303. // Otherwise, this character is good, consume it.
  1304. CurPtr = ConsumeChar(CurPtr, Size, Result);
  1305. C = getCharAndSize(CurPtr, Size);
  1306. while (isIdentifierBody(C)) {
  1307. CurPtr = ConsumeChar(CurPtr, Size, Result);
  1308. C = getCharAndSize(CurPtr, Size);
  1309. }
  1310. }
  1311. }
  1312. /// isHexaLiteral - Return true if Start points to a hex constant.
  1313. /// in microsoft mode (where this is supposed to be several different tokens).
  1314. bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  1315. unsigned Size;
  1316. char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
  1317. if (C1 != '0')
  1318. return false;
  1319. char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
  1320. return (C2 == 'x' || C2 == 'X');
  1321. }
  1322. /// LexNumericConstant - Lex the remainder of a integer or floating point
  1323. /// constant. From[-1] is the first character lexed. Return the end of the
  1324. /// constant.
  1325. bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  1326. return LexNumericConstant(Result, CurPtr, 0);
  1327. }
  1328. bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr, unsigned Periods) {
  1329. unsigned Size;
  1330. char C = getCharAndSize(CurPtr, Size);
  1331. char PrevCh = 0;
  1332. while ((C == '#' || isPreprocessingNumberBody(C)) && !(C == '.' && Periods)) { // HLSL Change - support '1.0.xxx' floating point swizzle, and '#' for '#INF'
  1333. CurPtr = ConsumeChar(CurPtr, Size, Result);
  1334. PrevCh = C;
  1335. // HLSL Change Begin.
  1336. // Support '1.0.xxx' floating point swizzle
  1337. if (C == '.') {
  1338. Periods++;
  1339. if (*CurPtr == 'x' || *CurPtr == 'r') {
  1340. CurPtr--;
  1341. break;
  1342. }
  1343. }
  1344. // HLSL Change End.
  1345. C = getCharAndSize(CurPtr, Size);
  1346. }
  1347. // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
  1348. if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
  1349. // If we are in Microsoft mode, don't continue if the constant is hex.
  1350. // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
  1351. if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
  1352. return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result), Periods);
  1353. }
  1354. // If we have a hex FP constant, continue.
  1355. if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
  1356. // Outside C99, we accept hexadecimal floating point numbers as a
  1357. // not-quite-conforming extension. Only do so if this looks like it's
  1358. // actually meant to be a hexfloat, and not if it has a ud-suffix.
  1359. bool IsHexFloat = true;
  1360. if (!LangOpts.C99) {
  1361. if (!isHexaLiteral(BufferPtr, LangOpts))
  1362. IsHexFloat = false;
  1363. else if (std::find(BufferPtr, CurPtr, '_') != CurPtr)
  1364. IsHexFloat = false;
  1365. }
  1366. if (IsHexFloat)
  1367. return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result), Periods);
  1368. }
  1369. // If we have a digit separator, continue.
  1370. if (C == '\'' && getLangOpts().CPlusPlus14) {
  1371. unsigned NextSize;
  1372. char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
  1373. if (isIdentifierBody(Next)) {
  1374. if (!isLexingRawMode())
  1375. Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
  1376. CurPtr = ConsumeChar(CurPtr, Size, Result);
  1377. CurPtr = ConsumeChar(CurPtr, NextSize, Result);
  1378. return LexNumericConstant(Result, CurPtr, Periods);
  1379. }
  1380. }
  1381. // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  1382. if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
  1383. return LexNumericConstant(Result, CurPtr, Periods);
  1384. if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
  1385. return LexNumericConstant(Result, CurPtr, Periods);
  1386. // Update the location of token as well as BufferPtr.
  1387. const char *TokStart = BufferPtr;
  1388. FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  1389. Result.setLiteralData(TokStart);
  1390. return true;
  1391. }
  1392. /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
  1393. /// in C++11, or warn on a ud-suffix in C++98.
  1394. const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
  1395. bool IsStringLiteral) {
  1396. assert(getLangOpts().CPlusPlus);
  1397. // Maximally munch an identifier.
  1398. unsigned Size;
  1399. char C = getCharAndSize(CurPtr, Size);
  1400. bool Consumed = false;
  1401. if (!isIdentifierHead(C)) {
  1402. if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
  1403. Consumed = true;
  1404. else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
  1405. Consumed = true;
  1406. else
  1407. return CurPtr;
  1408. }
  1409. if (!getLangOpts().CPlusPlus11) {
  1410. if (!isLexingRawMode())
  1411. Diag(CurPtr,
  1412. C == '_' ? diag::warn_cxx11_compat_user_defined_literal
  1413. : diag::warn_cxx11_compat_reserved_user_defined_literal)
  1414. << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
  1415. return CurPtr;
  1416. }
  1417. // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  1418. // that does not start with an underscore is ill-formed. As a conforming
  1419. // extension, we treat all such suffixes as if they had whitespace before
  1420. // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  1421. // likely to be a ud-suffix than a macro, however, and accept that.
  1422. if (!Consumed) {
  1423. bool IsUDSuffix = false;
  1424. if (C == '_')
  1425. IsUDSuffix = true;
  1426. else if (IsStringLiteral && getLangOpts().CPlusPlus14) {
  1427. // In C++1y, we need to look ahead a few characters to see if this is a
  1428. // valid suffix for a string literal or a numeric literal (this could be
  1429. // the 'operator""if' defining a numeric literal operator).
  1430. const unsigned MaxStandardSuffixLength = 3;
  1431. char Buffer[MaxStandardSuffixLength] = { C };
  1432. unsigned Consumed = Size;
  1433. unsigned Chars = 1;
  1434. while (true) {
  1435. unsigned NextSize;
  1436. char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
  1437. getLangOpts());
  1438. if (!isIdentifierBody(Next)) {
  1439. // End of suffix. Check whether this is on the whitelist.
  1440. IsUDSuffix = (Chars == 1 && Buffer[0] == 's') ||
  1441. NumericLiteralParser::isValidUDSuffix(
  1442. getLangOpts(), StringRef(Buffer, Chars));
  1443. break;
  1444. }
  1445. if (Chars == MaxStandardSuffixLength)
  1446. // Too long: can't be a standard suffix.
  1447. break;
  1448. Buffer[Chars++] = Next;
  1449. Consumed += NextSize;
  1450. }
  1451. }
  1452. if (!IsUDSuffix) {
  1453. if (!isLexingRawMode())
  1454. Diag(CurPtr, getLangOpts().MSVCCompat
  1455. ? diag::ext_ms_reserved_user_defined_literal
  1456. : diag::ext_reserved_user_defined_literal)
  1457. << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
  1458. return CurPtr;
  1459. }
  1460. CurPtr = ConsumeChar(CurPtr, Size, Result);
  1461. }
  1462. Result.setFlag(Token::HasUDSuffix);
  1463. while (true) {
  1464. C = getCharAndSize(CurPtr, Size);
  1465. if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); }
  1466. else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {}
  1467. else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {}
  1468. else break;
  1469. }
  1470. return CurPtr;
  1471. }
  1472. /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
  1473. /// either " or L" or u8" or u" or U".
  1474. bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
  1475. tok::TokenKind Kind) {
  1476. // Does this string contain the \0 character?
  1477. const char *NulCharacter = nullptr;
  1478. if (!isLexingRawMode() &&
  1479. (Kind == tok::utf8_string_literal ||
  1480. Kind == tok::utf16_string_literal ||
  1481. Kind == tok::utf32_string_literal))
  1482. Diag(BufferPtr, getLangOpts().CPlusPlus
  1483. ? diag::warn_cxx98_compat_unicode_literal
  1484. : diag::warn_c99_compat_unicode_literal);
  1485. char C = getAndAdvanceChar(CurPtr, Result);
  1486. while (C != '"') {
  1487. // Skip escaped characters. Escaped newlines will already be processed by
  1488. // getAndAdvanceChar.
  1489. if (C == '\\')
  1490. C = getAndAdvanceChar(CurPtr, Result);
  1491. if (C == '\n' || C == '\r' || // Newline.
  1492. (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
  1493. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
  1494. Diag(BufferPtr, diag::ext_unterminated_string);
  1495. FormTokenWithChars(Result, CurPtr-1, tok::unknown);
  1496. return true;
  1497. }
  1498. if (C == 0) {
  1499. if (isCodeCompletionPoint(CurPtr-1)) {
  1500. PP->CodeCompleteNaturalLanguage();
  1501. FormTokenWithChars(Result, CurPtr-1, tok::unknown);
  1502. cutOffLexing();
  1503. return true;
  1504. }
  1505. NulCharacter = CurPtr-1;
  1506. }
  1507. C = getAndAdvanceChar(CurPtr, Result);
  1508. }
  1509. // If we are in C++11, lex the optional ud-suffix.
  1510. if (getLangOpts().CPlusPlus)
  1511. CurPtr = LexUDSuffix(Result, CurPtr, true);
  1512. // If a nul character existed in the string, warn about it.
  1513. if (NulCharacter && !isLexingRawMode())
  1514. Diag(NulCharacter, diag::null_in_string);
  1515. // Update the location of the token as well as the BufferPtr instance var.
  1516. const char *TokStart = BufferPtr;
  1517. FormTokenWithChars(Result, CurPtr, Kind);
  1518. Result.setLiteralData(TokStart);
  1519. return true;
  1520. }
  1521. /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
  1522. /// having lexed R", LR", u8R", uR", or UR".
  1523. bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
  1524. tok::TokenKind Kind) {
  1525. // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  1526. // Between the initial and final double quote characters of the raw string,
  1527. // any transformations performed in phases 1 and 2 (trigraphs,
  1528. // universal-character-names, and line splicing) are reverted.
  1529. if (!isLexingRawMode())
  1530. Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
  1531. unsigned PrefixLen = 0;
  1532. while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
  1533. ++PrefixLen;
  1534. // If the last character was not a '(', then we didn't lex a valid delimiter.
  1535. if (CurPtr[PrefixLen] != '(') {
  1536. if (!isLexingRawMode()) {
  1537. const char *PrefixEnd = &CurPtr[PrefixLen];
  1538. if (PrefixLen == 16) {
  1539. Diag(PrefixEnd, diag::err_raw_delim_too_long);
  1540. } else {
  1541. Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
  1542. << StringRef(PrefixEnd, 1);
  1543. }
  1544. }
  1545. // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
  1546. // it's possible the '"' was intended to be part of the raw string, but
  1547. // there's not much we can do about that.
  1548. while (1) {
  1549. char C = *CurPtr++;
  1550. if (C == '"')
  1551. break;
  1552. if (C == 0 && CurPtr-1 == BufferEnd) {
  1553. --CurPtr;
  1554. break;
  1555. }
  1556. }
  1557. FormTokenWithChars(Result, CurPtr, tok::unknown);
  1558. return true;
  1559. }
  1560. // Save prefix and move CurPtr past it
  1561. const char *Prefix = CurPtr;
  1562. CurPtr += PrefixLen + 1; // skip over prefix and '('
  1563. while (1) {
  1564. char C = *CurPtr++;
  1565. if (C == ')') {
  1566. // Check for prefix match and closing quote.
  1567. if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
  1568. CurPtr += PrefixLen + 1; // skip over prefix and '"'
  1569. break;
  1570. }
  1571. } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
  1572. if (!isLexingRawMode())
  1573. Diag(BufferPtr, diag::err_unterminated_raw_string)
  1574. << StringRef(Prefix, PrefixLen);
  1575. FormTokenWithChars(Result, CurPtr-1, tok::unknown);
  1576. return true;
  1577. }
  1578. }
  1579. // If we are in C++11, lex the optional ud-suffix.
  1580. if (getLangOpts().CPlusPlus)
  1581. CurPtr = LexUDSuffix(Result, CurPtr, true);
  1582. // Update the location of token as well as BufferPtr.
  1583. const char *TokStart = BufferPtr;
  1584. FormTokenWithChars(Result, CurPtr, Kind);
  1585. Result.setLiteralData(TokStart);
  1586. return true;
  1587. }
  1588. /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
  1589. /// after having lexed the '<' character. This is used for #include filenames.
  1590. bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  1591. // Does this string contain the \0 character?
  1592. const char *NulCharacter = nullptr;
  1593. const char *AfterLessPos = CurPtr;
  1594. char C = getAndAdvanceChar(CurPtr, Result);
  1595. while (C != '>') {
  1596. // Skip escaped characters.
  1597. if (C == '\\' && CurPtr < BufferEnd) {
  1598. // Skip the escaped character.
  1599. getAndAdvanceChar(CurPtr, Result);
  1600. } else if (C == '\n' || C == '\r' || // Newline.
  1601. (C == 0 && (CurPtr-1 == BufferEnd || // End of file.
  1602. isCodeCompletionPoint(CurPtr-1)))) {
  1603. // If the filename is unterminated, then it must just be a lone <
  1604. // character. Return this as such.
  1605. FormTokenWithChars(Result, AfterLessPos, tok::less);
  1606. return true;
  1607. } else if (C == 0) {
  1608. NulCharacter = CurPtr-1;
  1609. }
  1610. C = getAndAdvanceChar(CurPtr, Result);
  1611. }
  1612. // If a nul character existed in the string, warn about it.
  1613. if (NulCharacter && !isLexingRawMode())
  1614. Diag(NulCharacter, diag::null_in_string);
  1615. // Update the location of token as well as BufferPtr.
  1616. const char *TokStart = BufferPtr;
  1617. FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  1618. Result.setLiteralData(TokStart);
  1619. return true;
  1620. }
  1621. /// LexCharConstant - Lex the remainder of a character constant, after having
  1622. /// lexed either ' or L' or u8' or u' or U'.
  1623. bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
  1624. tok::TokenKind Kind) {
  1625. // Does this character contain the \0 character?
  1626. const char *NulCharacter = nullptr;
  1627. if (!isLexingRawMode()) {
  1628. if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
  1629. Diag(BufferPtr, getLangOpts().CPlusPlus
  1630. ? diag::warn_cxx98_compat_unicode_literal
  1631. : diag::warn_c99_compat_unicode_literal);
  1632. else if (Kind == tok::utf8_char_constant)
  1633. Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  1634. }
  1635. char C = getAndAdvanceChar(CurPtr, Result);
  1636. if (C == '\'') {
  1637. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
  1638. Diag(BufferPtr, diag::ext_empty_character);
  1639. FormTokenWithChars(Result, CurPtr, tok::unknown);
  1640. return true;
  1641. }
  1642. while (C != '\'') {
  1643. // Skip escaped characters.
  1644. if (C == '\\')
  1645. C = getAndAdvanceChar(CurPtr, Result);
  1646. if (C == '\n' || C == '\r' || // Newline.
  1647. (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
  1648. if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
  1649. Diag(BufferPtr, diag::ext_unterminated_char);
  1650. FormTokenWithChars(Result, CurPtr-1, tok::unknown);
  1651. return true;
  1652. }
  1653. if (C == 0) {
  1654. if (isCodeCompletionPoint(CurPtr-1)) {
  1655. PP->CodeCompleteNaturalLanguage();
  1656. FormTokenWithChars(Result, CurPtr-1, tok::unknown);
  1657. cutOffLexing();
  1658. return true;
  1659. }
  1660. NulCharacter = CurPtr-1;
  1661. }
  1662. C = getAndAdvanceChar(CurPtr, Result);
  1663. }
  1664. // If we are in C++11, lex the optional ud-suffix.
  1665. if (getLangOpts().CPlusPlus)
  1666. CurPtr = LexUDSuffix(Result, CurPtr, false);
  1667. // If a nul character existed in the character, warn about it.
  1668. if (NulCharacter && !isLexingRawMode())
  1669. Diag(NulCharacter, diag::null_in_char);
  1670. // Update the location of token as well as BufferPtr.
  1671. const char *TokStart = BufferPtr;
  1672. FormTokenWithChars(Result, CurPtr, Kind);
  1673. Result.setLiteralData(TokStart);
  1674. return true;
  1675. }
  1676. /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
  1677. /// Update BufferPtr to point to the next non-whitespace character and return.
  1678. ///
  1679. /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
  1680. ///
  1681. bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
  1682. bool &TokAtPhysicalStartOfLine) {
  1683. // Whitespace - Skip it, then return the token after the whitespace.
  1684. bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
  1685. unsigned char Char = *CurPtr;
  1686. // Skip consecutive spaces efficiently.
  1687. while (1) {
  1688. // Skip horizontal whitespace very aggressively.
  1689. while (isHorizontalWhitespace(Char))
  1690. Char = *++CurPtr;
  1691. // Otherwise if we have something other than whitespace, we're done.
  1692. if (!isVerticalWhitespace(Char))
  1693. break;
  1694. if (ParsingPreprocessorDirective) {
  1695. // End of preprocessor directive line, let LexTokenInternal handle this.
  1696. BufferPtr = CurPtr;
  1697. return false;
  1698. }
  1699. // OK, but handle newline.
  1700. SawNewline = true;
  1701. Char = *++CurPtr;
  1702. }
  1703. // If the client wants us to return whitespace, return it now.
  1704. if (isKeepWhitespaceMode()) {
  1705. FormTokenWithChars(Result, CurPtr, tok::unknown);
  1706. if (SawNewline) {
  1707. IsAtStartOfLine = true;
  1708. IsAtPhysicalStartOfLine = true;
  1709. }
  1710. // FIXME: The next token will not have LeadingSpace set.
  1711. return true;
  1712. }
  1713. // If this isn't immediately after a newline, there is leading space.
  1714. char PrevChar = CurPtr[-1];
  1715. bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
  1716. Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  1717. if (SawNewline) {
  1718. Result.setFlag(Token::StartOfLine);
  1719. TokAtPhysicalStartOfLine = true;
  1720. }
  1721. BufferPtr = CurPtr;
  1722. return false;
  1723. }
  1724. /// We have just read the // characters from input. Skip until we find the
  1725. /// newline character thats terminate the comment. Then update BufferPtr and
  1726. /// return.
  1727. ///
  1728. /// If we're in KeepCommentMode or any CommentHandler has inserted
  1729. /// some tokens, this will store the first token and return true.
  1730. bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
  1731. bool &TokAtPhysicalStartOfLine) {
  1732. // If Line comments aren't explicitly enabled for this language, emit an
  1733. // extension warning.
  1734. #ifdef MS_SUPPORT_VARIABLE_LANGOPTS
  1735. if (!LangOpts.LineComment && !isLexingRawMode()) {
  1736. Diag(BufferPtr, diag::ext_line_comment);
  1737. // Mark them enabled so we only emit one warning for this translation
  1738. // unit.
  1739. LangOpts.LineComment = true;
  1740. }
  1741. #else
  1742. assert(LangOpts.LineComment);
  1743. #endif
  1744. // Scan over the body of the comment. The common case, when scanning, is that
  1745. // the comment contains normal ascii characters with nothing interesting in
  1746. // them. As such, optimize for this case with the inner loop.
  1747. char C;
  1748. do {
  1749. C = *CurPtr;
  1750. // Skip over characters in the fast loop.
  1751. while (C != 0 && // Potentially EOF.
  1752. C != '\n' && C != '\r') // Newline or DOS-style newline.
  1753. C = *++CurPtr;
  1754. const char *NextLine = CurPtr;
  1755. if (C != 0) {
  1756. // We found a newline, see if it's escaped.
  1757. const char *EscapePtr = CurPtr-1;
  1758. bool HasSpace = false;
  1759. while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
  1760. --EscapePtr;
  1761. HasSpace = true;
  1762. }
  1763. if (*EscapePtr == '\\') // Escaped newline.
  1764. CurPtr = EscapePtr;
  1765. else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
  1766. EscapePtr[-2] == '?') // Trigraph-escaped newline.
  1767. CurPtr = EscapePtr-2;
  1768. else
  1769. break; // This is a newline, we're done.
  1770. // If there was space between the backslash and newline, warn about it.
  1771. if (HasSpace && !isLexingRawMode())
  1772. Diag(EscapePtr, diag::backslash_newline_space);
  1773. }
  1774. // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
  1775. // properly decode the character. Read it in raw mode to avoid emitting
  1776. // diagnostics about things like trigraphs. If we see an escaped newline,
  1777. // we'll handle it below.
  1778. const char *OldPtr = CurPtr;
  1779. bool OldRawMode = isLexingRawMode();
  1780. LexingRawMode = true;
  1781. C = getAndAdvanceChar(CurPtr, Result);
  1782. LexingRawMode = OldRawMode;
  1783. // If we only read only one character, then no special handling is needed.
  1784. // We're done and can skip forward to the newline.
  1785. if (C != 0 && CurPtr == OldPtr+1) {
  1786. CurPtr = NextLine;
  1787. break;
  1788. }
  1789. // If we read multiple characters, and one of those characters was a \r or
  1790. // \n, then we had an escaped newline within the comment. Emit diagnostic
  1791. // unless the next line is also a // comment.
  1792. if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
  1793. for (; OldPtr != CurPtr; ++OldPtr)
  1794. if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
  1795. // Okay, we found a // comment that ends in a newline, if the next
  1796. // line is also a // comment, but has spaces, don't emit a diagnostic.
  1797. if (isWhitespace(C)) {
  1798. const char *ForwardPtr = CurPtr;
  1799. while (isWhitespace(*ForwardPtr)) // Skip whitespace.
  1800. ++ForwardPtr;
  1801. if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
  1802. break;
  1803. }
  1804. if (!isLexingRawMode())
  1805. Diag(OldPtr-1, diag::ext_multi_line_line_comment);
  1806. break;
  1807. }
  1808. }
  1809. if (CurPtr == BufferEnd+1) {
  1810. --CurPtr;
  1811. break;
  1812. }
  1813. if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
  1814. PP->CodeCompleteNaturalLanguage();
  1815. cutOffLexing();
  1816. return false;
  1817. }
  1818. } while (C != '\n' && C != '\r');
  1819. // Found but did not consume the newline. Notify comment handlers about the
  1820. // comment unless we're in a #if 0 block.
  1821. if (PP && !isLexingRawMode() &&
  1822. PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
  1823. getSourceLocation(CurPtr)))) {
  1824. BufferPtr = CurPtr;
  1825. return true; // A token has to be returned.
  1826. }
  1827. // If we are returning comments as tokens, return this comment as a token.
  1828. if (inKeepCommentMode())
  1829. return SaveLineComment(Result, CurPtr);
  1830. // If we are inside a preprocessor directive and we see the end of line,
  1831. // return immediately, so that the lexer can return this as an EOD token.
  1832. if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
  1833. BufferPtr = CurPtr;
  1834. return false;
  1835. }
  1836. // Otherwise, eat the \n character. We don't care if this is a \n\r or
  1837. // \r\n sequence. This is an efficiency hack (because we know the \n can't
  1838. // contribute to another token), it isn't needed for correctness. Note that
  1839. // this is ok even in KeepWhitespaceMode, because we would have returned the
  1840. /// comment above in that mode.
  1841. ++CurPtr;
  1842. // The next returned token is at the start of the line.
  1843. Result.setFlag(Token::StartOfLine);
  1844. TokAtPhysicalStartOfLine = true;
  1845. // No leading whitespace seen so far.
  1846. Result.clearFlag(Token::LeadingSpace);
  1847. BufferPtr = CurPtr;
  1848. return false;
  1849. }
  1850. /// If in save-comment mode, package up this Line comment in an appropriate
  1851. /// way and return it.
  1852. bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  1853. // If we're not in a preprocessor directive, just return the // comment
  1854. // directly.
  1855. FormTokenWithChars(Result, CurPtr, tok::comment);
  1856. if (!ParsingPreprocessorDirective || LexingRawMode)
  1857. return true;
  1858. // If this Line-style comment is in a macro definition, transmogrify it into
  1859. // a C-style block comment.
  1860. bool Invalid = false;
  1861. std::string Spelling = PP->getSpelling(Result, &Invalid);
  1862. if (Invalid)
  1863. return true;
  1864. assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  1865. Spelling[1] = '*'; // Change prefix to "/*".
  1866. Spelling += "*/"; // add suffix.
  1867. Result.setKind(tok::comment);
  1868. PP->CreateString(Spelling, Result,
  1869. Result.getLocation(), Result.getLocation());
  1870. return true;
  1871. }
  1872. /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
  1873. /// character (either \\n or \\r) is part of an escaped newline sequence. Issue
  1874. /// a diagnostic if so. We know that the newline is inside of a block comment.
  1875. static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
  1876. Lexer *L) {
  1877. assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
  1878. // Back up off the newline.
  1879. --CurPtr;
  1880. // If this is a two-character newline sequence, skip the other character.
  1881. if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
  1882. // \n\n or \r\r -> not escaped newline.
  1883. if (CurPtr[0] == CurPtr[1])
  1884. return false;
  1885. // \n\r or \r\n -> skip the newline.
  1886. --CurPtr;
  1887. }
  1888. // If we have horizontal whitespace, skip over it. We allow whitespace
  1889. // between the slash and newline.
  1890. bool HasSpace = false;
  1891. while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
  1892. --CurPtr;
  1893. HasSpace = true;
  1894. }
  1895. // If we have a slash, we know this is an escaped newline.
  1896. if (*CurPtr == '\\') {
  1897. if (CurPtr[-1] != '*') return false;
  1898. } else {
  1899. // It isn't a slash, is it the ?? / trigraph?
  1900. if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
  1901. CurPtr[-3] != '*')
  1902. return false;
  1903. // This is the trigraph ending the comment. Emit a stern warning!
  1904. CurPtr -= 2;
  1905. // If no trigraphs are enabled, warn that we ignored this trigraph and
  1906. // ignore this * character.
  1907. if (!L->getLangOpts().Trigraphs) {
  1908. if (!L->isLexingRawMode())
  1909. L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
  1910. return false;
  1911. }
  1912. if (!L->isLexingRawMode())
  1913. L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  1914. }
  1915. // Warn about having an escaped newline between the */ characters.
  1916. if (!L->isLexingRawMode())
  1917. L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
  1918. // If there was space between the backslash and newline, warn about it.
  1919. if (HasSpace && !L->isLexingRawMode())
  1920. L->Diag(CurPtr, diag::backslash_newline_space);
  1921. return true;
  1922. }
  1923. #ifdef __SSE2__
  1924. #include <emmintrin.h>
  1925. #elif __ALTIVEC__
  1926. #include <altivec.h>
  1927. #undef bool
  1928. #endif
  1929. /// We have just read from input the / and * characters that started a comment.
  1930. /// Read until we find the * and / characters that terminate the comment.
  1931. /// Note that we don't bother decoding trigraphs or escaped newlines in block
  1932. /// comments, because they cannot cause the comment to end. The only thing
  1933. /// that can happen is the comment could end with an escaped newline between
  1934. /// the terminating * and /.
  1935. ///
  1936. /// If we're in KeepCommentMode or any CommentHandler has inserted
  1937. /// some tokens, this will store the first token and return true.
  1938. bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
  1939. bool &TokAtPhysicalStartOfLine) {
  1940. // Scan one character past where we should, looking for a '/' character. Once
  1941. // we find it, check to see if it was preceded by a *. This common
  1942. // optimization helps people who like to put a lot of * characters in their
  1943. // comments.
  1944. // The first character we get with newlines and trigraphs skipped to handle
  1945. // the degenerate /*/ case below correctly if the * has an escaped newline
  1946. // after it.
  1947. unsigned CharSize;
  1948. unsigned char C = getCharAndSize(CurPtr, CharSize);
  1949. CurPtr += CharSize;
  1950. if (C == 0 && CurPtr == BufferEnd+1) {
  1951. if (!isLexingRawMode())
  1952. Diag(BufferPtr, diag::err_unterminated_block_comment);
  1953. --CurPtr;
  1954. // KeepWhitespaceMode should return this broken comment as a token. Since
  1955. // it isn't a well formed comment, just return it as an 'unknown' token.
  1956. if (isKeepWhitespaceMode()) {
  1957. FormTokenWithChars(Result, CurPtr, tok::unknown);
  1958. return true;
  1959. }
  1960. BufferPtr = CurPtr;
  1961. return false;
  1962. }
  1963. // Check to see if the first character after the '/*' is another /. If so,
  1964. // then this slash does not end the block comment, it is part of it.
  1965. if (C == '/')
  1966. C = *CurPtr++;
  1967. while (1) {
  1968. // Skip over all non-interesting characters until we find end of buffer or a
  1969. // (probably ending) '/' character.
  1970. if (CurPtr + 24 < BufferEnd &&
  1971. // If there is a code-completion point avoid the fast scan because it
  1972. // doesn't check for '\0'.
  1973. !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
  1974. // While not aligned to a 16-byte boundary.
  1975. while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
  1976. C = *CurPtr++;
  1977. if (C == '/') goto FoundSlash;
  1978. #ifdef __SSE2__
  1979. __m128i Slashes = _mm_set1_epi8('/');
  1980. while (CurPtr+16 <= BufferEnd) {
  1981. int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
  1982. Slashes));
  1983. if (cmp != 0) {
  1984. // Adjust the pointer to point directly after the first slash. It's
  1985. // not necessary to set C here, it will be overwritten at the end of
  1986. // the outer loop.
  1987. CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
  1988. goto FoundSlash;
  1989. }
  1990. CurPtr += 16;
  1991. }
  1992. #elif __ALTIVEC__
  1993. __vector unsigned char Slashes = {
  1994. '/', '/', '/', '/', '/', '/', '/', '/',
  1995. '/', '/', '/', '/', '/', '/', '/', '/'
  1996. };
  1997. while (CurPtr+16 <= BufferEnd &&
  1998. !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes))
  1999. CurPtr += 16;
  2000. #else
  2001. // Scan for '/' quickly. Many block comments are very large.
  2002. while (CurPtr[0] != '/' &&
  2003. CurPtr[1] != '/' &&
  2004. CurPtr[2] != '/' &&
  2005. CurPtr[3] != '/' &&
  2006. CurPtr+4 < BufferEnd) {
  2007. CurPtr += 4;
  2008. }
  2009. #endif
  2010. // It has to be one of the bytes scanned, increment to it and read one.
  2011. C = *CurPtr++;
  2012. }
  2013. // Loop to scan the remainder.
  2014. while (C != '/' && C != '\0')
  2015. C = *CurPtr++;
  2016. if (C == '/') {
  2017. FoundSlash:
  2018. if (CurPtr[-2] == '*') // We found the final */. We're done!
  2019. break;
  2020. if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
  2021. if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
  2022. // We found the final */, though it had an escaped newline between the
  2023. // * and /. We're done!
  2024. break;
  2025. }
  2026. }
  2027. if (CurPtr[0] == '*' && CurPtr[1] != '/') {
  2028. // If this is a /* inside of the comment, emit a warning. Don't do this
  2029. // if this is a /*/, which will end the comment. This misses cases with
  2030. // embedded escaped newlines, but oh well.
  2031. if (!isLexingRawMode())
  2032. Diag(CurPtr-1, diag::warn_nested_block_comment);
  2033. }
  2034. } else if (C == 0 && CurPtr == BufferEnd+1) {
  2035. if (!isLexingRawMode())
  2036. Diag(BufferPtr, diag::err_unterminated_block_comment);
  2037. // Note: the user probably forgot a */. We could continue immediately
  2038. // after the /*, but this would involve lexing a lot of what really is the
  2039. // comment, which surely would confuse the parser.
  2040. --CurPtr;
  2041. // KeepWhitespaceMode should return this broken comment as a token. Since
  2042. // it isn't a well formed comment, just return it as an 'unknown' token.
  2043. if (isKeepWhitespaceMode()) {
  2044. FormTokenWithChars(Result, CurPtr, tok::unknown);
  2045. return true;
  2046. }
  2047. BufferPtr = CurPtr;
  2048. return false;
  2049. } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
  2050. PP->CodeCompleteNaturalLanguage();
  2051. cutOffLexing();
  2052. return false;
  2053. }
  2054. C = *CurPtr++;
  2055. }
  2056. // Notify comment handlers about the comment unless we're in a #if 0 block.
  2057. if (PP && !isLexingRawMode() &&
  2058. PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
  2059. getSourceLocation(CurPtr)))) {
  2060. BufferPtr = CurPtr;
  2061. return true; // A token has to be returned.
  2062. }
  2063. // If we are returning comments as tokens, return this comment as a token.
  2064. if (inKeepCommentMode()) {
  2065. FormTokenWithChars(Result, CurPtr, tok::comment);
  2066. return true;
  2067. }
  2068. // It is common for the tokens immediately after a /**/ comment to be
  2069. // whitespace. Instead of going through the big switch, handle it
  2070. // efficiently now. This is safe even in KeepWhitespaceMode because we would
  2071. // have already returned above with the comment as a token.
  2072. if (isHorizontalWhitespace(*CurPtr)) {
  2073. SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
  2074. return false;
  2075. }
  2076. // Otherwise, just return so that the next character will be lexed as a token.
  2077. BufferPtr = CurPtr;
  2078. Result.setFlag(Token::LeadingSpace);
  2079. return false;
  2080. }
  2081. //===----------------------------------------------------------------------===//
  2082. // Primary Lexing Entry Points
  2083. //===----------------------------------------------------------------------===//
  2084. /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
  2085. /// uninterpreted string. This switches the lexer out of directive mode.
  2086. void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  2087. assert(ParsingPreprocessorDirective && ParsingFilename == false &&
  2088. "Must be in a preprocessing directive!");
  2089. Token Tmp;
  2090. // CurPtr - Cache BufferPtr in an automatic variable.
  2091. const char *CurPtr = BufferPtr;
  2092. while (1) {
  2093. char Char = getAndAdvanceChar(CurPtr, Tmp);
  2094. switch (Char) {
  2095. default:
  2096. if (Result)
  2097. Result->push_back(Char);
  2098. break;
  2099. case 0: // Null.
  2100. // Found end of file?
  2101. if (CurPtr-1 != BufferEnd) {
  2102. if (isCodeCompletionPoint(CurPtr-1)) {
  2103. PP->CodeCompleteNaturalLanguage();
  2104. cutOffLexing();
  2105. return;
  2106. }
  2107. // Nope, normal character, continue.
  2108. if (Result)
  2109. Result->push_back(Char);
  2110. break;
  2111. }
  2112. // FALL THROUGH.
  2113. case '\r':
  2114. case '\n':
  2115. // Okay, we found the end of the line. First, back up past the \0, \r, \n.
  2116. assert(CurPtr[-1] == Char && "Trigraphs for newline?");
  2117. BufferPtr = CurPtr-1;
  2118. // Next, lex the character, which should handle the EOD transition.
  2119. Lex(Tmp);
  2120. if (Tmp.is(tok::code_completion)) {
  2121. if (PP)
  2122. PP->CodeCompleteNaturalLanguage();
  2123. Lex(Tmp);
  2124. }
  2125. assert(Tmp.is(tok::eod) && "Unexpected token!");
  2126. // Finally, we're done;
  2127. return;
  2128. }
  2129. }
  2130. }
  2131. /// LexEndOfFile - CurPtr points to the end of this file. Handle this
  2132. /// condition, reporting diagnostics and handling other edge cases as required.
  2133. /// This returns true if Result contains a token, false if PP.Lex should be
  2134. /// called again.
  2135. bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  2136. // If we hit the end of the file while parsing a preprocessor directive,
  2137. // end the preprocessor directive first. The next token returned will
  2138. // then be the end of file.
  2139. if (ParsingPreprocessorDirective) {
  2140. // Done parsing the "line".
  2141. ParsingPreprocessorDirective = false;
  2142. // Update the location of token as well as BufferPtr.
  2143. FormTokenWithChars(Result, CurPtr, tok::eod);
  2144. // Restore comment saving mode, in case it was disabled for directive.
  2145. if (PP)
  2146. resetExtendedTokenMode();
  2147. return true; // Have a token.
  2148. }
  2149. // If we are in raw mode, return this event as an EOF token. Let the caller
  2150. // that put us in raw mode handle the event.
  2151. if (isLexingRawMode()) {
  2152. Result.startToken();
  2153. BufferPtr = BufferEnd;
  2154. FormTokenWithChars(Result, BufferEnd, tok::eof);
  2155. return true;
  2156. }
  2157. // Issue diagnostics for unterminated #if and missing newline.
  2158. // If we are in a #if directive, emit an error.
  2159. while (!ConditionalStack.empty()) {
  2160. if (PP->getCodeCompletionFileLoc() != FileLoc)
  2161. PP->Diag(ConditionalStack.back().IfLoc,
  2162. diag::err_pp_unterminated_conditional);
  2163. ConditionalStack.pop_back();
  2164. }
  2165. // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  2166. // a pedwarn.
  2167. if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
  2168. DiagnosticsEngine &Diags = PP->getDiagnostics();
  2169. SourceLocation EndLoc = getSourceLocation(BufferEnd);
  2170. unsigned DiagID;
  2171. if (LangOpts.CPlusPlus11) {
  2172. // C++11 [lex.phases] 2.2 p2
  2173. // Prefer the C++98 pedantic compatibility warning over the generic,
  2174. // non-extension, user-requested "missing newline at EOF" warning.
  2175. if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
  2176. DiagID = diag::warn_cxx98_compat_no_newline_eof;
  2177. } else {
  2178. DiagID = diag::warn_no_newline_eof;
  2179. }
  2180. } else {
  2181. DiagID = diag::ext_no_newline_eof;
  2182. }
  2183. Diag(BufferEnd, DiagID)
  2184. << FixItHint::CreateInsertion(EndLoc, "\n");
  2185. }
  2186. BufferPtr = CurPtr;
  2187. // Finally, let the preprocessor handle this.
  2188. return PP->HandleEndOfFile(Result, isPragmaLexer());
  2189. }
  2190. /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
  2191. /// the specified lexer will return a tok::l_paren token, 0 if it is something
  2192. /// else and 2 if there are no more tokens in the buffer controlled by the
  2193. /// lexer.
  2194. unsigned Lexer::isNextPPTokenLParen() {
  2195. assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
  2196. // Switch to 'skipping' mode. This will ensure that we can lex a token
  2197. // without emitting diagnostics, disables macro expansion, and will cause EOF
  2198. // to return an EOF token instead of popping the include stack.
  2199. LexingRawMode = true;
  2200. // Save state that can be changed while lexing so that we can restore it.
  2201. const char *TmpBufferPtr = BufferPtr;
  2202. bool inPPDirectiveMode = ParsingPreprocessorDirective;
  2203. bool atStartOfLine = IsAtStartOfLine;
  2204. bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  2205. bool leadingSpace = HasLeadingSpace;
  2206. Token Tok;
  2207. Lex(Tok);
  2208. // Restore state that may have changed.
  2209. BufferPtr = TmpBufferPtr;
  2210. ParsingPreprocessorDirective = inPPDirectiveMode;
  2211. HasLeadingSpace = leadingSpace;
  2212. IsAtStartOfLine = atStartOfLine;
  2213. IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
  2214. // Restore the lexer back to non-skipping mode.
  2215. LexingRawMode = false;
  2216. if (Tok.is(tok::eof))
  2217. return 2;
  2218. return Tok.is(tok::l_paren);
  2219. }
  2220. /// \brief Find the end of a version control conflict marker.
  2221. static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
  2222. ConflictMarkerKind CMK) {
  2223. const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  2224. size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  2225. StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen);
  2226. size_t Pos = RestOfBuffer.find(Terminator);
  2227. while (Pos != StringRef::npos) {
  2228. // Must occur at start of line.
  2229. if (Pos == 0 ||
  2230. (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
  2231. RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
  2232. Pos = RestOfBuffer.find(Terminator);
  2233. continue;
  2234. }
  2235. return RestOfBuffer.data()+Pos;
  2236. }
  2237. return nullptr;
  2238. }
  2239. /// IsStartOfConflictMarker - If the specified pointer is the start of a version
  2240. /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
  2241. /// and recover nicely. This returns true if it is a conflict marker and false
  2242. /// if not.
  2243. bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  2244. // Only a conflict marker if it starts at the beginning of a line.
  2245. if (CurPtr != BufferStart &&
  2246. CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
  2247. return false;
  2248. // Check to see if we have <<<<<<< or >>>>.
  2249. if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") &&
  2250. (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> "))
  2251. return false;
  2252. // If we have a situation where we don't care about conflict markers, ignore
  2253. // it.
  2254. if (CurrentConflictMarkerState || isLexingRawMode())
  2255. return false;
  2256. ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
  2257. // Check to see if there is an ending marker somewhere in the buffer at the
  2258. // start of a line to terminate this conflict marker.
  2259. if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
  2260. // We found a match. We are really in a conflict marker.
  2261. // Diagnose this, and ignore to the end of line.
  2262. Diag(CurPtr, diag::err_conflict_marker);
  2263. CurrentConflictMarkerState = Kind;
  2264. // Skip ahead to the end of line. We know this exists because the
  2265. // end-of-conflict marker starts with \r or \n.
  2266. while (*CurPtr != '\r' && *CurPtr != '\n') {
  2267. assert(CurPtr != BufferEnd && "Didn't find end of line");
  2268. ++CurPtr;
  2269. }
  2270. BufferPtr = CurPtr;
  2271. return true;
  2272. }
  2273. // No end of conflict marker found.
  2274. return false;
  2275. }
  2276. /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
  2277. /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
  2278. /// is the end of a conflict marker. Handle it by ignoring up until the end of
  2279. /// the line. This returns true if it is a conflict marker and false if not.
  2280. bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  2281. // Only a conflict marker if it starts at the beginning of a line.
  2282. if (CurPtr != BufferStart &&
  2283. CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
  2284. return false;
  2285. // If we have a situation where we don't care about conflict markers, ignore
  2286. // it.
  2287. if (!CurrentConflictMarkerState || isLexingRawMode())
  2288. return false;
  2289. // Check to see if we have the marker (4 characters in a row).
  2290. for (unsigned i = 1; i != 4; ++i)
  2291. if (CurPtr[i] != CurPtr[0])
  2292. return false;
  2293. // If we do have it, search for the end of the conflict marker. This could
  2294. // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
  2295. // be the end of conflict marker.
  2296. if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
  2297. CurrentConflictMarkerState)) {
  2298. CurPtr = End;
  2299. // Skip ahead to the end of line.
  2300. while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
  2301. ++CurPtr;
  2302. BufferPtr = CurPtr;
  2303. // No longer in the conflict marker.
  2304. CurrentConflictMarkerState = CMK_None;
  2305. return true;
  2306. }
  2307. return false;
  2308. }
  2309. bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  2310. if (PP && PP->isCodeCompletionEnabled()) {
  2311. SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
  2312. return Loc == PP->getCodeCompletionLoc();
  2313. }
  2314. return false;
  2315. }
  2316. uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
  2317. Token *Result) {
  2318. unsigned CharSize;
  2319. char Kind = getCharAndSize(StartPtr, CharSize);
  2320. unsigned NumHexDigits;
  2321. if (Kind == 'u')
  2322. NumHexDigits = 4;
  2323. else if (Kind == 'U')
  2324. NumHexDigits = 8;
  2325. else
  2326. return 0;
  2327. if (!LangOpts.CPlusPlus && !LangOpts.C99) {
  2328. if (Result && !isLexingRawMode())
  2329. Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
  2330. return 0;
  2331. }
  2332. const char *CurPtr = StartPtr + CharSize;
  2333. const char *KindLoc = &CurPtr[-1];
  2334. uint32_t CodePoint = 0;
  2335. for (unsigned i = 0; i < NumHexDigits; ++i) {
  2336. char C = getCharAndSize(CurPtr, CharSize);
  2337. unsigned Value = llvm::hexDigitValue(C);
  2338. if (Value == -1U) {
  2339. if (Result && !isLexingRawMode()) {
  2340. if (i == 0) {
  2341. Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
  2342. << StringRef(KindLoc, 1);
  2343. } else {
  2344. Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
  2345. // If the user wrote \U1234, suggest a fixit to \u.
  2346. if (i == 4 && NumHexDigits == 8) {
  2347. CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
  2348. Diag(KindLoc, diag::note_ucn_four_not_eight)
  2349. << FixItHint::CreateReplacement(URange, "u");
  2350. }
  2351. }
  2352. }
  2353. return 0;
  2354. }
  2355. CodePoint <<= 4;
  2356. CodePoint += Value;
  2357. CurPtr += CharSize;
  2358. }
  2359. if (Result) {
  2360. Result->setFlag(Token::HasUCN);
  2361. if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
  2362. StartPtr = CurPtr;
  2363. else
  2364. while (StartPtr != CurPtr)
  2365. (void)getAndAdvanceChar(StartPtr, *Result);
  2366. } else {
  2367. StartPtr = CurPtr;
  2368. }
  2369. // Don't apply C family restrictions to UCNs in assembly mode
  2370. if (LangOpts.AsmPreprocessor)
  2371. return CodePoint;
  2372. // C99 6.4.3p2: A universal character name shall not specify a character whose
  2373. // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
  2374. // 0060 (`), nor one in the range D800 through DFFF inclusive.)
  2375. // C++11 [lex.charset]p2: If the hexadecimal value for a
  2376. // universal-character-name corresponds to a surrogate code point (in the
  2377. // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
  2378. // if the hexadecimal value for a universal-character-name outside the
  2379. // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  2380. // string literal corresponds to a control character (in either of the
  2381. // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  2382. // basic source character set, the program is ill-formed.
  2383. if (CodePoint < 0xA0) {
  2384. if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
  2385. return CodePoint;
  2386. // We don't use isLexingRawMode() here because we need to warn about bad
  2387. // UCNs even when skipping preprocessing tokens in a #if block.
  2388. if (Result && PP) {
  2389. if (CodePoint < 0x20 || CodePoint >= 0x7F)
  2390. Diag(BufferPtr, diag::err_ucn_control_character);
  2391. else {
  2392. char C = static_cast<char>(CodePoint);
  2393. Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
  2394. }
  2395. }
  2396. return 0;
  2397. } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
  2398. // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
  2399. // We don't use isLexingRawMode() here because we need to diagnose bad
  2400. // UCNs even when skipping preprocessing tokens in a #if block.
  2401. if (Result && PP) {
  2402. if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
  2403. Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
  2404. else
  2405. Diag(BufferPtr, diag::err_ucn_escape_invalid);
  2406. }
  2407. return 0;
  2408. }
  2409. return CodePoint;
  2410. }
  2411. bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
  2412. const char *CurPtr) {
  2413. static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
  2414. UnicodeWhitespaceCharRanges);
  2415. if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
  2416. UnicodeWhitespaceChars.contains(C)) {
  2417. Diag(BufferPtr, diag::ext_unicode_whitespace)
  2418. << makeCharRange(*this, BufferPtr, CurPtr);
  2419. Result.setFlag(Token::LeadingSpace);
  2420. return true;
  2421. }
  2422. return false;
  2423. }
  2424. bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
  2425. if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
  2426. if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
  2427. !PP->isPreprocessedOutput()) {
  2428. maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
  2429. makeCharRange(*this, BufferPtr, CurPtr),
  2430. /*IsFirst=*/true);
  2431. }
  2432. MIOpt.ReadToken();
  2433. return LexIdentifier(Result, CurPtr);
  2434. }
  2435. if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
  2436. !PP->isPreprocessedOutput() &&
  2437. !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
  2438. // Non-ASCII characters tend to creep into source code unintentionally.
  2439. // Instead of letting the parser complain about the unknown token,
  2440. // just drop the character.
  2441. // Note that we can /only/ do this when the non-ASCII character is actually
  2442. // spelled as Unicode, not written as a UCN. The standard requires that
  2443. // we not throw away any possible preprocessor tokens, but there's a
  2444. // loophole in the mapping of Unicode characters to basic character set
  2445. // characters that allows us to map these particular characters to, say,
  2446. // whitespace.
  2447. Diag(BufferPtr, diag::err_non_ascii)
  2448. << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
  2449. BufferPtr = CurPtr;
  2450. return false;
  2451. }
  2452. // Otherwise, we have an explicit UCN or a character that's unlikely to show
  2453. // up by accident.
  2454. MIOpt.ReadToken();
  2455. FormTokenWithChars(Result, CurPtr, tok::unknown);
  2456. return true;
  2457. }
  2458. void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  2459. IsAtStartOfLine = Result.isAtStartOfLine();
  2460. HasLeadingSpace = Result.hasLeadingSpace();
  2461. HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  2462. // Note that this doesn't affect IsAtPhysicalStartOfLine.
  2463. }
  2464. bool Lexer::Lex(Token &Result) {
  2465. // Start a new token.
  2466. Result.startToken();
  2467. // Set up misc whitespace flags for LexTokenInternal.
  2468. if (IsAtStartOfLine) {
  2469. Result.setFlag(Token::StartOfLine);
  2470. IsAtStartOfLine = false;
  2471. }
  2472. if (HasLeadingSpace) {
  2473. Result.setFlag(Token::LeadingSpace);
  2474. HasLeadingSpace = false;
  2475. }
  2476. if (HasLeadingEmptyMacro) {
  2477. Result.setFlag(Token::LeadingEmptyMacro);
  2478. HasLeadingEmptyMacro = false;
  2479. }
  2480. bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  2481. IsAtPhysicalStartOfLine = false;
  2482. bool isRawLex = isLexingRawMode();
  2483. (void) isRawLex;
  2484. bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  2485. // (After the LexTokenInternal call, the lexer might be destroyed.)
  2486. assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  2487. return returnedToken;
  2488. }
  2489. /// LexTokenInternal - This implements a simple C family lexer. It is an
  2490. /// extremely performance critical piece of code. This assumes that the buffer
  2491. /// has a null character at the end of the file. This returns a preprocessing
  2492. /// token, not a normal token, as such, it is an internal interface. It assumes
  2493. /// that the Flags of result have been cleared before calling this.
  2494. bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
  2495. LexNextToken:
  2496. // New token, can't need cleaning yet.
  2497. Result.clearFlag(Token::NeedsCleaning);
  2498. Result.setIdentifierInfo(nullptr);
  2499. // CurPtr - Cache BufferPtr in an automatic variable.
  2500. const char *CurPtr = BufferPtr;
  2501. // Small amounts of horizontal whitespace is very common between tokens.
  2502. if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
  2503. ++CurPtr;
  2504. while ((*CurPtr == ' ') || (*CurPtr == '\t'))
  2505. ++CurPtr;
  2506. // If we are keeping whitespace and other tokens, just return what we just
  2507. // skipped. The next lexer invocation will return the token after the
  2508. // whitespace.
  2509. if (isKeepWhitespaceMode()) {
  2510. FormTokenWithChars(Result, CurPtr, tok::unknown);
  2511. // FIXME: The next token will not have LeadingSpace set.
  2512. return true;
  2513. }
  2514. BufferPtr = CurPtr;
  2515. Result.setFlag(Token::LeadingSpace);
  2516. }
  2517. unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
  2518. // Read a character, advancing over it.
  2519. char Char = getAndAdvanceChar(CurPtr, Result);
  2520. tok::TokenKind Kind;
  2521. switch (Char) {
  2522. case 0: // Null.
  2523. // Found end of file?
  2524. if (CurPtr-1 == BufferEnd)
  2525. return LexEndOfFile(Result, CurPtr-1);
  2526. // Check if we are performing code completion.
  2527. if (isCodeCompletionPoint(CurPtr-1)) {
  2528. // Return the code-completion token.
  2529. Result.startToken();
  2530. FormTokenWithChars(Result, CurPtr, tok::code_completion);
  2531. return true;
  2532. }
  2533. if (!isLexingRawMode())
  2534. Diag(CurPtr-1, diag::null_in_file);
  2535. Result.setFlag(Token::LeadingSpace);
  2536. if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
  2537. return true; // KeepWhitespaceMode
  2538. // We know the lexer hasn't changed, so just try again with this lexer.
  2539. // (We manually eliminate the tail call to avoid recursion.)
  2540. goto LexNextToken;
  2541. case 26: // DOS & CP/M EOF: "^Z".
  2542. // If we're in Microsoft extensions mode, treat this as end of file.
  2543. if (LangOpts.MicrosoftExt)
  2544. return LexEndOfFile(Result, CurPtr-1);
  2545. // If Microsoft extensions are disabled, this is just random garbage.
  2546. Kind = tok::unknown;
  2547. break;
  2548. case '\n':
  2549. case '\r':
  2550. // If we are inside a preprocessor directive and we see the end of line,
  2551. // we know we are done with the directive, so return an EOD token.
  2552. if (ParsingPreprocessorDirective) {
  2553. // Done parsing the "line".
  2554. ParsingPreprocessorDirective = false;
  2555. // Restore comment saving mode, in case it was disabled for directive.
  2556. if (PP)
  2557. resetExtendedTokenMode();
  2558. // Since we consumed a newline, we are back at the start of a line.
  2559. IsAtStartOfLine = true;
  2560. IsAtPhysicalStartOfLine = true;
  2561. Kind = tok::eod;
  2562. break;
  2563. }
  2564. // No leading whitespace seen so far.
  2565. Result.clearFlag(Token::LeadingSpace);
  2566. if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
  2567. return true; // KeepWhitespaceMode
  2568. // We only saw whitespace, so just try again with this lexer.
  2569. // (We manually eliminate the tail call to avoid recursion.)
  2570. goto LexNextToken;
  2571. case ' ':
  2572. case '\t':
  2573. case '\f':
  2574. case '\v':
  2575. SkipHorizontalWhitespace:
  2576. Result.setFlag(Token::LeadingSpace);
  2577. if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
  2578. return true; // KeepWhitespaceMode
  2579. SkipIgnoredUnits:
  2580. CurPtr = BufferPtr;
  2581. // If the next token is obviously a // or /* */ comment, skip it efficiently
  2582. // too (without going through the big switch stmt).
  2583. if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
  2584. LangOpts.LineComment &&
  2585. (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
  2586. if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
  2587. return true; // There is a token to return.
  2588. goto SkipIgnoredUnits;
  2589. } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
  2590. if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
  2591. return true; // There is a token to return.
  2592. goto SkipIgnoredUnits;
  2593. } else if (isHorizontalWhitespace(*CurPtr)) {
  2594. goto SkipHorizontalWhitespace;
  2595. }
  2596. // We only saw whitespace, so just try again with this lexer.
  2597. // (We manually eliminate the tail call to avoid recursion.)
  2598. goto LexNextToken;
  2599. // C99 6.4.4.1: Integer Constants.
  2600. // C99 6.4.4.2: Floating Constants.
  2601. case '0': case '1': case '2': case '3': case '4':
  2602. case '5': case '6': case '7': case '8': case '9':
  2603. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2604. MIOpt.ReadToken();
  2605. return LexNumericConstant(Result, CurPtr);
  2606. case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
  2607. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2608. MIOpt.ReadToken();
  2609. if (LangOpts.CPlusPlus11 || LangOpts.C11) {
  2610. Char = getCharAndSize(CurPtr, SizeTmp);
  2611. // UTF-16 string literal
  2612. if (Char == '"')
  2613. return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2614. tok::utf16_string_literal);
  2615. // UTF-16 character constant
  2616. if (Char == '\'')
  2617. return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2618. tok::utf16_char_constant);
  2619. // UTF-16 raw string literal
  2620. if (Char == 'R' && LangOpts.CPlusPlus11 &&
  2621. getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
  2622. return LexRawStringLiteral(Result,
  2623. ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2624. SizeTmp2, Result),
  2625. tok::utf16_string_literal);
  2626. if (Char == '8') {
  2627. char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
  2628. // UTF-8 string literal
  2629. if (Char2 == '"')
  2630. return LexStringLiteral(Result,
  2631. ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2632. SizeTmp2, Result),
  2633. tok::utf8_string_literal);
  2634. if (Char2 == '\'' && LangOpts.CPlusPlus1z)
  2635. return LexCharConstant(
  2636. Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2637. SizeTmp2, Result),
  2638. tok::utf8_char_constant);
  2639. if (Char2 == 'R' && LangOpts.CPlusPlus11) {
  2640. unsigned SizeTmp3;
  2641. char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
  2642. // UTF-8 raw string literal
  2643. if (Char3 == '"') {
  2644. return LexRawStringLiteral(Result,
  2645. ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2646. SizeTmp2, Result),
  2647. SizeTmp3, Result),
  2648. tok::utf8_string_literal);
  2649. }
  2650. }
  2651. }
  2652. }
  2653. // treat u like the start of an identifier.
  2654. return LexIdentifier(Result, CurPtr);
  2655. case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal
  2656. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2657. MIOpt.ReadToken();
  2658. if (LangOpts.CPlusPlus11 || LangOpts.C11) {
  2659. Char = getCharAndSize(CurPtr, SizeTmp);
  2660. // UTF-32 string literal
  2661. if (Char == '"')
  2662. return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2663. tok::utf32_string_literal);
  2664. // UTF-32 character constant
  2665. if (Char == '\'')
  2666. return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2667. tok::utf32_char_constant);
  2668. // UTF-32 raw string literal
  2669. if (Char == 'R' && LangOpts.CPlusPlus11 &&
  2670. getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
  2671. return LexRawStringLiteral(Result,
  2672. ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2673. SizeTmp2, Result),
  2674. tok::utf32_string_literal);
  2675. }
  2676. // treat U like the start of an identifier.
  2677. return LexIdentifier(Result, CurPtr);
  2678. case 'R': // Identifier or C++0x raw string literal
  2679. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2680. MIOpt.ReadToken();
  2681. if (LangOpts.CPlusPlus11) {
  2682. Char = getCharAndSize(CurPtr, SizeTmp);
  2683. if (Char == '"')
  2684. return LexRawStringLiteral(Result,
  2685. ConsumeChar(CurPtr, SizeTmp, Result),
  2686. tok::string_literal);
  2687. }
  2688. // treat R like the start of an identifier.
  2689. return LexIdentifier(Result, CurPtr);
  2690. case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
  2691. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2692. MIOpt.ReadToken();
  2693. Char = getCharAndSize(CurPtr, SizeTmp);
  2694. // Wide string literal.
  2695. if (Char == '"')
  2696. return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2697. tok::wide_string_literal);
  2698. // Wide raw string literal.
  2699. if (LangOpts.CPlusPlus11 && Char == 'R' &&
  2700. getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
  2701. return LexRawStringLiteral(Result,
  2702. ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2703. SizeTmp2, Result),
  2704. tok::wide_string_literal);
  2705. // Wide character constant.
  2706. if (Char == '\'')
  2707. return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2708. tok::wide_char_constant);
  2709. // FALL THROUGH, treating L like the start of an identifier.
  2710. // C99 6.4.2: Identifiers.
  2711. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  2712. case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
  2713. case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
  2714. case 'V': case 'W': case 'X': case 'Y': case 'Z':
  2715. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  2716. case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  2717. case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
  2718. case 'v': case 'w': case 'x': case 'y': case 'z':
  2719. case '_':
  2720. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2721. MIOpt.ReadToken();
  2722. return LexIdentifier(Result, CurPtr);
  2723. case '$': // $ in identifiers.
  2724. if (LangOpts.DollarIdents) {
  2725. if (!isLexingRawMode())
  2726. Diag(CurPtr-1, diag::ext_dollar_in_identifier);
  2727. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2728. MIOpt.ReadToken();
  2729. return LexIdentifier(Result, CurPtr);
  2730. }
  2731. Kind = tok::unknown;
  2732. break;
  2733. // C99 6.4.4: Character Constants.
  2734. case '\'':
  2735. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2736. MIOpt.ReadToken();
  2737. return LexCharConstant(Result, CurPtr, tok::char_constant);
  2738. // C99 6.4.5: String Literals.
  2739. case '"':
  2740. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2741. MIOpt.ReadToken();
  2742. return LexStringLiteral(Result, CurPtr, tok::string_literal);
  2743. // C99 6.4.6: Punctuators.
  2744. case '?':
  2745. Kind = tok::question;
  2746. break;
  2747. case '[':
  2748. Kind = tok::l_square;
  2749. break;
  2750. case ']':
  2751. Kind = tok::r_square;
  2752. break;
  2753. case '(':
  2754. Kind = tok::l_paren;
  2755. break;
  2756. case ')':
  2757. Kind = tok::r_paren;
  2758. break;
  2759. case '{':
  2760. Kind = tok::l_brace;
  2761. break;
  2762. case '}':
  2763. Kind = tok::r_brace;
  2764. break;
  2765. case '.':
  2766. Char = getCharAndSize(CurPtr, SizeTmp);
  2767. if (Char >= '0' && Char <= '9') {
  2768. // Notify MIOpt that we read a non-whitespace/non-comment token.
  2769. MIOpt.ReadToken();
  2770. return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
  2771. } else if (LangOpts.CPlusPlus && Char == '*') {
  2772. Kind = tok::periodstar;
  2773. CurPtr += SizeTmp;
  2774. } else if (Char == '.' &&
  2775. getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
  2776. Kind = tok::ellipsis;
  2777. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2778. SizeTmp2, Result);
  2779. } else {
  2780. Kind = tok::period;
  2781. }
  2782. break;
  2783. case '&':
  2784. Char = getCharAndSize(CurPtr, SizeTmp);
  2785. if (Char == '&') {
  2786. Kind = tok::ampamp;
  2787. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2788. } else if (Char == '=') {
  2789. Kind = tok::ampequal;
  2790. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2791. } else {
  2792. Kind = tok::amp;
  2793. }
  2794. break;
  2795. case '*':
  2796. if (getCharAndSize(CurPtr, SizeTmp) == '=') {
  2797. Kind = tok::starequal;
  2798. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2799. } else {
  2800. Kind = tok::star;
  2801. }
  2802. break;
  2803. case '+':
  2804. Char = getCharAndSize(CurPtr, SizeTmp);
  2805. if (Char == '+') {
  2806. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2807. Kind = tok::plusplus;
  2808. } else if (Char == '=') {
  2809. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2810. Kind = tok::plusequal;
  2811. } else {
  2812. Kind = tok::plus;
  2813. }
  2814. break;
  2815. case '-':
  2816. Char = getCharAndSize(CurPtr, SizeTmp);
  2817. if (Char == '-') { // --
  2818. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2819. Kind = tok::minusminus;
  2820. } else if (Char == '>' && LangOpts.CPlusPlus &&
  2821. getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
  2822. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2823. SizeTmp2, Result);
  2824. Kind = tok::arrowstar;
  2825. } else if (Char == '>') { // ->
  2826. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2827. Kind = tok::arrow;
  2828. } else if (Char == '=') { // -=
  2829. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2830. Kind = tok::minusequal;
  2831. } else {
  2832. Kind = tok::minus;
  2833. }
  2834. break;
  2835. case '~':
  2836. Kind = tok::tilde;
  2837. break;
  2838. case '!':
  2839. if (getCharAndSize(CurPtr, SizeTmp) == '=') {
  2840. Kind = tok::exclaimequal;
  2841. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2842. } else {
  2843. Kind = tok::exclaim;
  2844. }
  2845. break;
  2846. case '/':
  2847. // 6.4.9: Comments
  2848. Char = getCharAndSize(CurPtr, SizeTmp);
  2849. if (Char == '/') { // Line comment.
  2850. // Even if Line comments are disabled (e.g. in C89 mode), we generally
  2851. // want to lex this as a comment. There is one problem with this though,
  2852. // that in one particular corner case, this can change the behavior of the
  2853. // resultant program. For example, In "foo //**/ bar", C89 would lex
  2854. // this as "foo / bar" and languages with Line comments would lex it as
  2855. // "foo". Check to see if the character after the second slash is a '*'.
  2856. // If so, we will lex that as a "/" instead of the start of a comment.
  2857. // However, we never do this if we are just preprocessing.
  2858. bool TreatAsComment = LangOpts.LineComment &&
  2859. (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
  2860. if (!TreatAsComment)
  2861. if (!(PP && PP->isPreprocessedOutput()))
  2862. TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
  2863. if (TreatAsComment) {
  2864. if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2865. TokAtPhysicalStartOfLine))
  2866. return true; // There is a token to return.
  2867. // It is common for the tokens immediately after a // comment to be
  2868. // whitespace (indentation for the next line). Instead of going through
  2869. // the big switch, handle it efficiently now.
  2870. goto SkipIgnoredUnits;
  2871. }
  2872. }
  2873. if (Char == '*') { // /**/ comment.
  2874. if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
  2875. TokAtPhysicalStartOfLine))
  2876. return true; // There is a token to return.
  2877. // We only saw whitespace, so just try again with this lexer.
  2878. // (We manually eliminate the tail call to avoid recursion.)
  2879. goto LexNextToken;
  2880. }
  2881. if (Char == '=') {
  2882. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2883. Kind = tok::slashequal;
  2884. } else {
  2885. Kind = tok::slash;
  2886. }
  2887. break;
  2888. case '%':
  2889. Char = getCharAndSize(CurPtr, SizeTmp);
  2890. if (Char == '=') {
  2891. Kind = tok::percentequal;
  2892. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2893. } else if (LangOpts.Digraphs && Char == '>') {
  2894. Kind = tok::r_brace; // '%>' -> '}'
  2895. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2896. } else if (LangOpts.Digraphs && Char == ':') {
  2897. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2898. Char = getCharAndSize(CurPtr, SizeTmp);
  2899. if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
  2900. Kind = tok::hashhash; // '%:%:' -> '##'
  2901. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2902. SizeTmp2, Result);
  2903. } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
  2904. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2905. if (!isLexingRawMode())
  2906. Diag(BufferPtr, diag::ext_charize_microsoft);
  2907. Kind = tok::hashat;
  2908. } else { // '%:' -> '#'
  2909. // We parsed a # character. If this occurs at the start of the line,
  2910. // it's actually the start of a preprocessing directive. Callback to
  2911. // the preprocessor to handle it.
  2912. // TODO: -fpreprocessed mode??
  2913. if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
  2914. goto HandleDirective;
  2915. Kind = tok::hash;
  2916. }
  2917. } else {
  2918. Kind = tok::percent;
  2919. }
  2920. break;
  2921. case '<':
  2922. Char = getCharAndSize(CurPtr, SizeTmp);
  2923. if (ParsingFilename) {
  2924. return LexAngledStringLiteral(Result, CurPtr);
  2925. } else if (Char == '<') {
  2926. char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
  2927. if (After == '=') {
  2928. Kind = tok::lesslessequal;
  2929. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2930. SizeTmp2, Result);
  2931. } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
  2932. // If this is actually a '<<<<<<<' version control conflict marker,
  2933. // recognize it as such and recover nicely.
  2934. goto LexNextToken;
  2935. } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
  2936. // If this is '<<<<' and we're in a Perforce-style conflict marker,
  2937. // ignore it.
  2938. goto LexNextToken;
  2939. } else if (LangOpts.CUDA && After == '<') {
  2940. Kind = tok::lesslessless;
  2941. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2942. SizeTmp2, Result);
  2943. } else {
  2944. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2945. Kind = tok::lessless;
  2946. }
  2947. } else if (Char == '=') {
  2948. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2949. Kind = tok::lessequal;
  2950. } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
  2951. if (LangOpts.CPlusPlus11 &&
  2952. getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
  2953. // C++0x [lex.pptoken]p3:
  2954. // Otherwise, if the next three characters are <:: and the subsequent
  2955. // character is neither : nor >, the < is treated as a preprocessor
  2956. // token by itself and not as the first character of the alternative
  2957. // token <:.
  2958. unsigned SizeTmp3;
  2959. char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
  2960. if (After != ':' && After != '>') {
  2961. Kind = tok::less;
  2962. if (!isLexingRawMode())
  2963. Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
  2964. break;
  2965. }
  2966. }
  2967. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2968. Kind = tok::l_square;
  2969. } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
  2970. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2971. Kind = tok::l_brace;
  2972. } else {
  2973. Kind = tok::less;
  2974. }
  2975. break;
  2976. case '>':
  2977. Char = getCharAndSize(CurPtr, SizeTmp);
  2978. if (Char == '=') {
  2979. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  2980. Kind = tok::greaterequal;
  2981. } else if (Char == '>') {
  2982. char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
  2983. if (After == '=') {
  2984. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2985. SizeTmp2, Result);
  2986. Kind = tok::greatergreaterequal;
  2987. } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
  2988. // If this is actually a '>>>>' conflict marker, recognize it as such
  2989. // and recover nicely.
  2990. goto LexNextToken;
  2991. } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
  2992. // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
  2993. goto LexNextToken;
  2994. } else if (LangOpts.CUDA && After == '>') {
  2995. Kind = tok::greatergreatergreater;
  2996. CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
  2997. SizeTmp2, Result);
  2998. } else {
  2999. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3000. Kind = tok::greatergreater;
  3001. }
  3002. } else {
  3003. Kind = tok::greater;
  3004. }
  3005. break;
  3006. case '^':
  3007. Char = getCharAndSize(CurPtr, SizeTmp);
  3008. if (Char == '=') {
  3009. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3010. Kind = tok::caretequal;
  3011. } else {
  3012. Kind = tok::caret;
  3013. }
  3014. break;
  3015. case '|':
  3016. Char = getCharAndSize(CurPtr, SizeTmp);
  3017. if (Char == '=') {
  3018. Kind = tok::pipeequal;
  3019. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3020. } else if (Char == '|') {
  3021. // If this is '|||||||' and we're in a conflict marker, ignore it.
  3022. if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
  3023. goto LexNextToken;
  3024. Kind = tok::pipepipe;
  3025. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3026. } else {
  3027. Kind = tok::pipe;
  3028. }
  3029. break;
  3030. case ':':
  3031. Char = getCharAndSize(CurPtr, SizeTmp);
  3032. if (LangOpts.Digraphs && Char == '>') {
  3033. Kind = tok::r_square; // ':>' -> ']'
  3034. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3035. } else if (LangOpts.CPlusPlus && Char == ':') {
  3036. Kind = tok::coloncolon;
  3037. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3038. } else {
  3039. Kind = tok::colon;
  3040. }
  3041. break;
  3042. case ';':
  3043. Kind = tok::semi;
  3044. break;
  3045. case '=':
  3046. Char = getCharAndSize(CurPtr, SizeTmp);
  3047. if (Char == '=') {
  3048. // If this is '====' and we're in a conflict marker, ignore it.
  3049. if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
  3050. goto LexNextToken;
  3051. Kind = tok::equalequal;
  3052. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3053. } else {
  3054. Kind = tok::equal;
  3055. }
  3056. break;
  3057. case ',':
  3058. Kind = tok::comma;
  3059. break;
  3060. case '#':
  3061. Char = getCharAndSize(CurPtr, SizeTmp);
  3062. if (Char == '#') {
  3063. Kind = tok::hashhash;
  3064. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3065. } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
  3066. Kind = tok::hashat;
  3067. if (!isLexingRawMode())
  3068. Diag(BufferPtr, diag::ext_charize_microsoft);
  3069. CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
  3070. } else {
  3071. // We parsed a # character. If this occurs at the start of the line,
  3072. // it's actually the start of a preprocessing directive. Callback to
  3073. // the preprocessor to handle it.
  3074. // TODO: -fpreprocessed mode??
  3075. if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
  3076. goto HandleDirective;
  3077. Kind = tok::hash;
  3078. }
  3079. break;
  3080. case '@':
  3081. // Objective C support.
  3082. if (CurPtr[-1] == '@' && LangOpts.ObjC1)
  3083. Kind = tok::at;
  3084. else
  3085. Kind = tok::unknown;
  3086. break;
  3087. // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  3088. case '\\':
  3089. if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
  3090. if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
  3091. if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
  3092. return true; // KeepWhitespaceMode
  3093. // We only saw whitespace, so just try again with this lexer.
  3094. // (We manually eliminate the tail call to avoid recursion.)
  3095. goto LexNextToken;
  3096. }
  3097. return LexUnicode(Result, CodePoint, CurPtr);
  3098. }
  3099. Kind = tok::unknown;
  3100. break;
  3101. default: {
  3102. if (isASCII(Char)) {
  3103. Kind = tok::unknown;
  3104. break;
  3105. }
  3106. UTF32 CodePoint;
  3107. // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
  3108. // an escaped newline.
  3109. --CurPtr;
  3110. ConversionResult Status =
  3111. llvm::convertUTF8Sequence((const UTF8 **)&CurPtr,
  3112. (const UTF8 *)BufferEnd,
  3113. &CodePoint,
  3114. strictConversion);
  3115. if (Status == conversionOK) {
  3116. if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
  3117. if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
  3118. return true; // KeepWhitespaceMode
  3119. // We only saw whitespace, so just try again with this lexer.
  3120. // (We manually eliminate the tail call to avoid recursion.)
  3121. goto LexNextToken;
  3122. }
  3123. return LexUnicode(Result, CodePoint, CurPtr);
  3124. }
  3125. if (isLexingRawMode() || ParsingPreprocessorDirective ||
  3126. PP->isPreprocessedOutput()) {
  3127. ++CurPtr;
  3128. Kind = tok::unknown;
  3129. break;
  3130. }
  3131. // Non-ASCII characters tend to creep into source code unintentionally.
  3132. // Instead of letting the parser complain about the unknown token,
  3133. // just diagnose the invalid UTF-8, then drop the character.
  3134. Diag(CurPtr, diag::err_invalid_utf8);
  3135. BufferPtr = CurPtr+1;
  3136. // We're pretending the character didn't exist, so just try again with
  3137. // this lexer.
  3138. // (We manually eliminate the tail call to avoid recursion.)
  3139. goto LexNextToken;
  3140. }
  3141. }
  3142. // Notify MIOpt that we read a non-whitespace/non-comment token.
  3143. MIOpt.ReadToken();
  3144. // Update the location of token as well as BufferPtr.
  3145. FormTokenWithChars(Result, CurPtr, Kind);
  3146. return true;
  3147. HandleDirective:
  3148. // We parsed a # character and it's the start of a preprocessing directive.
  3149. FormTokenWithChars(Result, CurPtr, tok::hash);
  3150. PP->HandleDirective(Result);
  3151. if (PP->hadModuleLoaderFatalFailure()) {
  3152. // With a fatal failure in the module loader, we abort parsing.
  3153. assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
  3154. return true;
  3155. }
  3156. // We parsed the directive; lex a token with the new state.
  3157. return false;
  3158. }