CommentLexer.cpp 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853
  1. #include "clang/AST/CommentLexer.h"
  2. #include "clang/AST/CommentCommandTraits.h"
  3. #include "clang/AST/CommentDiagnostic.h"
  4. #include "clang/Basic/CharInfo.h"
  5. #include "llvm/ADT/StringExtras.h"
  6. #include "llvm/ADT/StringSwitch.h"
  7. #include "llvm/Support/ConvertUTF.h"
  8. #include "llvm/Support/ErrorHandling.h"
  9. namespace clang {
  10. namespace comments {
  11. void Token::dump(const Lexer &L, const SourceManager &SM) const {
  12. llvm::errs() << "comments::Token Kind=" << Kind << " ";
  13. Loc.dump(SM);
  14. llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
  15. }
  16. static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  17. return isLetter(C);
  18. }
  19. static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  20. return isDigit(C);
  21. }
  22. static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  23. return isHexDigit(C);
  24. }
  25. static inline StringRef convertCodePointToUTF8(
  26. llvm::BumpPtrAllocator &Allocator,
  27. unsigned CodePoint) {
  28. char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  29. char *ResolvedPtr = Resolved;
  30. if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
  31. return StringRef(Resolved, ResolvedPtr - Resolved);
  32. else
  33. return StringRef();
  34. }
  35. namespace {
  36. #include "clang/AST/CommentHTMLTags.inc"
  37. #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
  38. } // unnamed namespace
  39. StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  40. // Fast path, first check a few most widely used named character references.
  41. return llvm::StringSwitch<StringRef>(Name)
  42. .Case("amp", "&")
  43. .Case("lt", "<")
  44. .Case("gt", ">")
  45. .Case("quot", "\"")
  46. .Case("apos", "\'")
  47. // Slow path.
  48. .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
  49. }
  50. StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  51. unsigned CodePoint = 0;
  52. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  53. assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
  54. CodePoint *= 10;
  55. CodePoint += Name[i] - '0';
  56. }
  57. return convertCodePointToUTF8(Allocator, CodePoint);
  58. }
  59. StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  60. unsigned CodePoint = 0;
  61. for (unsigned i = 0, e = Name.size(); i != e; ++i) {
  62. CodePoint *= 16;
  63. const char C = Name[i];
  64. assert(isHTMLHexCharacterReferenceCharacter(C));
  65. CodePoint += llvm::hexDigitValue(C);
  66. }
  67. return convertCodePointToUTF8(Allocator, CodePoint);
  68. }
  69. void Lexer::skipLineStartingDecorations() {
  70. // This function should be called only for C comments
  71. assert(CommentState == LCS_InsideCComment);
  72. if (BufferPtr == CommentEnd)
  73. return;
  74. switch (*BufferPtr) {
  75. case ' ':
  76. case '\t':
  77. case '\f':
  78. case '\v': {
  79. const char *NewBufferPtr = BufferPtr;
  80. NewBufferPtr++;
  81. if (NewBufferPtr == CommentEnd)
  82. return;
  83. char C = *NewBufferPtr;
  84. while (isHorizontalWhitespace(C)) {
  85. NewBufferPtr++;
  86. if (NewBufferPtr == CommentEnd)
  87. return;
  88. C = *NewBufferPtr;
  89. }
  90. if (C == '*')
  91. BufferPtr = NewBufferPtr + 1;
  92. break;
  93. }
  94. case '*':
  95. BufferPtr++;
  96. break;
  97. }
  98. }
  99. namespace {
  100. /// Returns pointer to the first newline character in the string.
  101. const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  102. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  103. if (isVerticalWhitespace(*BufferPtr))
  104. return BufferPtr;
  105. }
  106. return BufferEnd;
  107. }
  108. const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  109. if (BufferPtr == BufferEnd)
  110. return BufferPtr;
  111. if (*BufferPtr == '\n')
  112. BufferPtr++;
  113. else {
  114. assert(*BufferPtr == '\r');
  115. BufferPtr++;
  116. if (BufferPtr != BufferEnd && *BufferPtr == '\n')
  117. BufferPtr++;
  118. }
  119. return BufferPtr;
  120. }
  121. const char *skipNamedCharacterReference(const char *BufferPtr,
  122. const char *BufferEnd) {
  123. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  124. if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
  125. return BufferPtr;
  126. }
  127. return BufferEnd;
  128. }
  129. const char *skipDecimalCharacterReference(const char *BufferPtr,
  130. const char *BufferEnd) {
  131. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  132. if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
  133. return BufferPtr;
  134. }
  135. return BufferEnd;
  136. }
  137. const char *skipHexCharacterReference(const char *BufferPtr,
  138. const char *BufferEnd) {
  139. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  140. if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
  141. return BufferPtr;
  142. }
  143. return BufferEnd;
  144. }
  145. bool isHTMLIdentifierStartingCharacter(char C) {
  146. return isLetter(C);
  147. }
  148. bool isHTMLIdentifierCharacter(char C) {
  149. return isAlphanumeric(C);
  150. }
  151. const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
  152. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  153. if (!isHTMLIdentifierCharacter(*BufferPtr))
  154. return BufferPtr;
  155. }
  156. return BufferEnd;
  157. }
  158. /// Skip HTML string quoted in single or double quotes. Escaping quotes inside
  159. /// string allowed.
  160. ///
  161. /// Returns pointer to closing quote.
  162. const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
  163. {
  164. const char Quote = *BufferPtr;
  165. assert(Quote == '\"' || Quote == '\'');
  166. BufferPtr++;
  167. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  168. const char C = *BufferPtr;
  169. if (C == Quote && BufferPtr[-1] != '\\')
  170. return BufferPtr;
  171. }
  172. return BufferEnd;
  173. }
  174. const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
  175. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  176. if (!isWhitespace(*BufferPtr))
  177. return BufferPtr;
  178. }
  179. return BufferEnd;
  180. }
  181. bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
  182. return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
  183. }
  184. bool isCommandNameStartCharacter(char C) {
  185. return isLetter(C);
  186. }
  187. bool isCommandNameCharacter(char C) {
  188. return isAlphanumeric(C);
  189. }
  190. const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
  191. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  192. if (!isCommandNameCharacter(*BufferPtr))
  193. return BufferPtr;
  194. }
  195. return BufferEnd;
  196. }
  197. /// Return the one past end pointer for BCPL comments.
  198. /// Handles newlines escaped with backslash or trigraph for backslahs.
  199. const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  200. const char *CurPtr = BufferPtr;
  201. while (CurPtr != BufferEnd) {
  202. while (!isVerticalWhitespace(*CurPtr)) {
  203. CurPtr++;
  204. if (CurPtr == BufferEnd)
  205. return BufferEnd;
  206. }
  207. // We found a newline, check if it is escaped.
  208. const char *EscapePtr = CurPtr - 1;
  209. while(isHorizontalWhitespace(*EscapePtr))
  210. EscapePtr--;
  211. if (*EscapePtr == '\\' ||
  212. (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
  213. EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
  214. // We found an escaped newline.
  215. CurPtr = skipNewline(CurPtr, BufferEnd);
  216. } else
  217. return CurPtr; // Not an escaped newline.
  218. }
  219. return BufferEnd;
  220. }
  221. /// Return the one past end pointer for C comments.
  222. /// Very dumb, does not handle escaped newlines or trigraphs.
  223. const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  224. for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
  225. if (*BufferPtr == '*') {
  226. assert(BufferPtr + 1 != BufferEnd);
  227. if (*(BufferPtr + 1) == '/')
  228. return BufferPtr;
  229. }
  230. }
  231. llvm_unreachable("buffer end hit before '*/' was seen");
  232. }
  233. } // unnamed namespace
  234. void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
  235. tok::TokenKind Kind) {
  236. const unsigned TokLen = TokEnd - BufferPtr;
  237. Result.setLocation(getSourceLocation(BufferPtr));
  238. Result.setKind(Kind);
  239. Result.setLength(TokLen);
  240. #ifndef NDEBUG
  241. Result.TextPtr = "<UNSET>";
  242. Result.IntVal = 7;
  243. #endif
  244. BufferPtr = TokEnd;
  245. }
  246. void Lexer::lexCommentText(Token &T) {
  247. assert(CommentState == LCS_InsideBCPLComment ||
  248. CommentState == LCS_InsideCComment);
  249. switch (State) {
  250. case LS_Normal:
  251. break;
  252. case LS_VerbatimBlockFirstLine:
  253. lexVerbatimBlockFirstLine(T);
  254. return;
  255. case LS_VerbatimBlockBody:
  256. lexVerbatimBlockBody(T);
  257. return;
  258. case LS_VerbatimLineText:
  259. lexVerbatimLineText(T);
  260. return;
  261. case LS_HTMLStartTag:
  262. lexHTMLStartTag(T);
  263. return;
  264. case LS_HTMLEndTag:
  265. lexHTMLEndTag(T);
  266. return;
  267. }
  268. assert(State == LS_Normal);
  269. const char *TokenPtr = BufferPtr;
  270. assert(TokenPtr < CommentEnd);
  271. while (TokenPtr != CommentEnd) {
  272. switch(*TokenPtr) {
  273. case '\\':
  274. case '@': {
  275. // Commands that start with a backslash and commands that start with
  276. // 'at' have equivalent semantics. But we keep information about the
  277. // exact syntax in AST for comments.
  278. tok::TokenKind CommandKind =
  279. (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
  280. TokenPtr++;
  281. if (TokenPtr == CommentEnd) {
  282. formTextToken(T, TokenPtr);
  283. return;
  284. }
  285. char C = *TokenPtr;
  286. switch (C) {
  287. default:
  288. break;
  289. case '\\': case '@': case '&': case '$':
  290. case '#': case '<': case '>': case '%':
  291. case '\"': case '.': case ':':
  292. // This is one of \\ \@ \& \$ etc escape sequences.
  293. TokenPtr++;
  294. if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
  295. // This is the \:: escape sequence.
  296. TokenPtr++;
  297. }
  298. StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
  299. formTokenWithChars(T, TokenPtr, tok::text);
  300. T.setText(UnescapedText);
  301. return;
  302. }
  303. // Don't make zero-length commands.
  304. if (!isCommandNameStartCharacter(*TokenPtr)) {
  305. formTextToken(T, TokenPtr);
  306. return;
  307. }
  308. TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  309. unsigned Length = TokenPtr - (BufferPtr + 1);
  310. // Hardcoded support for lexing LaTeX formula commands
  311. // \f$ \f[ \f] \f{ \f} as a single command.
  312. if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
  313. C = *TokenPtr;
  314. if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
  315. TokenPtr++;
  316. Length++;
  317. }
  318. }
  319. StringRef CommandName(BufferPtr + 1, Length);
  320. const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  321. if (!Info) {
  322. if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
  323. StringRef CorrectedName = Info->Name;
  324. SourceLocation Loc = getSourceLocation(BufferPtr);
  325. SourceRange CommandRange(Loc.getLocWithOffset(1),
  326. getSourceLocation(TokenPtr));
  327. Diag(Loc, diag::warn_correct_comment_command_name)
  328. << CommandName << CorrectedName
  329. << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  330. } else {
  331. formTokenWithChars(T, TokenPtr, tok::unknown_command);
  332. T.setUnknownCommandName(CommandName);
  333. Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
  334. return;
  335. }
  336. }
  337. if (Info->IsVerbatimBlockCommand) {
  338. setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
  339. return;
  340. }
  341. if (Info->IsVerbatimLineCommand) {
  342. setupAndLexVerbatimLine(T, TokenPtr, Info);
  343. return;
  344. }
  345. formTokenWithChars(T, TokenPtr, CommandKind);
  346. T.setCommandID(Info->getID());
  347. return;
  348. }
  349. case '&':
  350. lexHTMLCharacterReference(T);
  351. return;
  352. case '<': {
  353. TokenPtr++;
  354. if (TokenPtr == CommentEnd) {
  355. formTextToken(T, TokenPtr);
  356. return;
  357. }
  358. const char C = *TokenPtr;
  359. if (isHTMLIdentifierStartingCharacter(C))
  360. setupAndLexHTMLStartTag(T);
  361. else if (C == '/')
  362. setupAndLexHTMLEndTag(T);
  363. else
  364. formTextToken(T, TokenPtr);
  365. return;
  366. }
  367. case '\n':
  368. case '\r':
  369. TokenPtr = skipNewline(TokenPtr, CommentEnd);
  370. formTokenWithChars(T, TokenPtr, tok::newline);
  371. if (CommentState == LCS_InsideCComment)
  372. skipLineStartingDecorations();
  373. return;
  374. default: {
  375. size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
  376. find_first_of("\n\r\\@&<");
  377. if (End != StringRef::npos)
  378. TokenPtr += End;
  379. else
  380. TokenPtr = CommentEnd;
  381. formTextToken(T, TokenPtr);
  382. return;
  383. }
  384. }
  385. }
  386. }
  387. void Lexer::setupAndLexVerbatimBlock(Token &T,
  388. const char *TextBegin,
  389. char Marker, const CommandInfo *Info) {
  390. assert(Info->IsVerbatimBlockCommand);
  391. VerbatimBlockEndCommandName.clear();
  392. VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
  393. VerbatimBlockEndCommandName.append(Info->EndCommandName);
  394. formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
  395. T.setVerbatimBlockID(Info->getID());
  396. // If there is a newline following the verbatim opening command, skip the
  397. // newline so that we don't create an tok::verbatim_block_line with empty
  398. // text content.
  399. if (BufferPtr != CommentEnd &&
  400. isVerticalWhitespace(*BufferPtr)) {
  401. BufferPtr = skipNewline(BufferPtr, CommentEnd);
  402. State = LS_VerbatimBlockBody;
  403. return;
  404. }
  405. State = LS_VerbatimBlockFirstLine;
  406. }
  407. void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  408. again:
  409. assert(BufferPtr < CommentEnd);
  410. // FIXME: It would be better to scan the text once, finding either the block
  411. // end command or newline.
  412. //
  413. // Extract current line.
  414. const char *Newline = findNewline(BufferPtr, CommentEnd);
  415. StringRef Line(BufferPtr, Newline - BufferPtr);
  416. // Look for end command in current line.
  417. size_t Pos = Line.find(VerbatimBlockEndCommandName);
  418. const char *TextEnd;
  419. const char *NextLine;
  420. if (Pos == StringRef::npos) {
  421. // Current line is completely verbatim.
  422. TextEnd = Newline;
  423. NextLine = skipNewline(Newline, CommentEnd);
  424. } else if (Pos == 0) {
  425. // Current line contains just an end command.
  426. const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
  427. StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
  428. formTokenWithChars(T, End, tok::verbatim_block_end);
  429. T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
  430. State = LS_Normal;
  431. return;
  432. } else {
  433. // There is some text, followed by end command. Extract text first.
  434. TextEnd = BufferPtr + Pos;
  435. NextLine = TextEnd;
  436. // If there is only whitespace before end command, skip whitespace.
  437. if (isWhitespace(BufferPtr, TextEnd)) {
  438. BufferPtr = TextEnd;
  439. goto again;
  440. }
  441. }
  442. StringRef Text(BufferPtr, TextEnd - BufferPtr);
  443. formTokenWithChars(T, NextLine, tok::verbatim_block_line);
  444. T.setVerbatimBlockText(Text);
  445. State = LS_VerbatimBlockBody;
  446. }
  447. void Lexer::lexVerbatimBlockBody(Token &T) {
  448. assert(State == LS_VerbatimBlockBody);
  449. if (CommentState == LCS_InsideCComment)
  450. skipLineStartingDecorations();
  451. if (BufferPtr == CommentEnd) {
  452. formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  453. T.setVerbatimBlockText("");
  454. return;
  455. }
  456. lexVerbatimBlockFirstLine(T);
  457. }
  458. void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
  459. const CommandInfo *Info) {
  460. assert(Info->IsVerbatimLineCommand);
  461. formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  462. T.setVerbatimLineID(Info->getID());
  463. State = LS_VerbatimLineText;
  464. }
  465. void Lexer::lexVerbatimLineText(Token &T) {
  466. assert(State == LS_VerbatimLineText);
  467. // Extract current line.
  468. const char *Newline = findNewline(BufferPtr, CommentEnd);
  469. StringRef Text(BufferPtr, Newline - BufferPtr);
  470. formTokenWithChars(T, Newline, tok::verbatim_line_text);
  471. T.setVerbatimLineText(Text);
  472. State = LS_Normal;
  473. }
  474. void Lexer::lexHTMLCharacterReference(Token &T) {
  475. const char *TokenPtr = BufferPtr;
  476. assert(*TokenPtr == '&');
  477. TokenPtr++;
  478. if (TokenPtr == CommentEnd) {
  479. formTextToken(T, TokenPtr);
  480. return;
  481. }
  482. const char *NamePtr;
  483. bool isNamed = false;
  484. bool isDecimal = false;
  485. char C = *TokenPtr;
  486. if (isHTMLNamedCharacterReferenceCharacter(C)) {
  487. NamePtr = TokenPtr;
  488. TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
  489. isNamed = true;
  490. } else if (C == '#') {
  491. TokenPtr++;
  492. if (TokenPtr == CommentEnd) {
  493. formTextToken(T, TokenPtr);
  494. return;
  495. }
  496. C = *TokenPtr;
  497. if (isHTMLDecimalCharacterReferenceCharacter(C)) {
  498. NamePtr = TokenPtr;
  499. TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
  500. isDecimal = true;
  501. } else if (C == 'x' || C == 'X') {
  502. TokenPtr++;
  503. NamePtr = TokenPtr;
  504. TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
  505. } else {
  506. formTextToken(T, TokenPtr);
  507. return;
  508. }
  509. } else {
  510. formTextToken(T, TokenPtr);
  511. return;
  512. }
  513. if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
  514. *TokenPtr != ';') {
  515. formTextToken(T, TokenPtr);
  516. return;
  517. }
  518. StringRef Name(NamePtr, TokenPtr - NamePtr);
  519. TokenPtr++; // Skip semicolon.
  520. StringRef Resolved;
  521. if (isNamed)
  522. Resolved = resolveHTMLNamedCharacterReference(Name);
  523. else if (isDecimal)
  524. Resolved = resolveHTMLDecimalCharacterReference(Name);
  525. else
  526. Resolved = resolveHTMLHexCharacterReference(Name);
  527. if (Resolved.empty()) {
  528. formTextToken(T, TokenPtr);
  529. return;
  530. }
  531. formTokenWithChars(T, TokenPtr, tok::text);
  532. T.setText(Resolved);
  533. return;
  534. }
  535. void Lexer::setupAndLexHTMLStartTag(Token &T) {
  536. assert(BufferPtr[0] == '<' &&
  537. isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  538. const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  539. StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  540. if (!isHTMLTagName(Name)) {
  541. formTextToken(T, TagNameEnd);
  542. return;
  543. }
  544. formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  545. T.setHTMLTagStartName(Name);
  546. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  547. const char C = *BufferPtr;
  548. if (BufferPtr != CommentEnd &&
  549. (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
  550. State = LS_HTMLStartTag;
  551. }
  552. void Lexer::lexHTMLStartTag(Token &T) {
  553. assert(State == LS_HTMLStartTag);
  554. const char *TokenPtr = BufferPtr;
  555. char C = *TokenPtr;
  556. if (isHTMLIdentifierCharacter(C)) {
  557. TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
  558. StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
  559. formTokenWithChars(T, TokenPtr, tok::html_ident);
  560. T.setHTMLIdent(Ident);
  561. } else {
  562. switch (C) {
  563. case '=':
  564. TokenPtr++;
  565. formTokenWithChars(T, TokenPtr, tok::html_equals);
  566. break;
  567. case '\"':
  568. case '\'': {
  569. const char *OpenQuote = TokenPtr;
  570. TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
  571. const char *ClosingQuote = TokenPtr;
  572. if (TokenPtr != CommentEnd) // Skip closing quote.
  573. TokenPtr++;
  574. formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
  575. T.setHTMLQuotedString(StringRef(OpenQuote + 1,
  576. ClosingQuote - (OpenQuote + 1)));
  577. break;
  578. }
  579. case '>':
  580. TokenPtr++;
  581. formTokenWithChars(T, TokenPtr, tok::html_greater);
  582. State = LS_Normal;
  583. return;
  584. case '/':
  585. TokenPtr++;
  586. if (TokenPtr != CommentEnd && *TokenPtr == '>') {
  587. TokenPtr++;
  588. formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
  589. } else
  590. formTextToken(T, TokenPtr);
  591. State = LS_Normal;
  592. return;
  593. }
  594. }
  595. // Now look ahead and return to normal state if we don't see any HTML tokens
  596. // ahead.
  597. BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  598. if (BufferPtr == CommentEnd) {
  599. State = LS_Normal;
  600. return;
  601. }
  602. C = *BufferPtr;
  603. if (!isHTMLIdentifierStartingCharacter(C) &&
  604. C != '=' && C != '\"' && C != '\'' && C != '>') {
  605. State = LS_Normal;
  606. return;
  607. }
  608. }
  609. void Lexer::setupAndLexHTMLEndTag(Token &T) {
  610. assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
  611. const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  612. const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
  613. StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
  614. if (!isHTMLTagName(Name)) {
  615. formTextToken(T, TagNameEnd);
  616. return;
  617. }
  618. const char *End = skipWhitespace(TagNameEnd, CommentEnd);
  619. formTokenWithChars(T, End, tok::html_end_tag);
  620. T.setHTMLTagEndName(Name);
  621. if (BufferPtr != CommentEnd && *BufferPtr == '>')
  622. State = LS_HTMLEndTag;
  623. }
  624. void Lexer::lexHTMLEndTag(Token &T) {
  625. assert(BufferPtr != CommentEnd && *BufferPtr == '>');
  626. formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  627. State = LS_Normal;
  628. }
  629. Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
  630. const CommandTraits &Traits,
  631. SourceLocation FileLoc,
  632. const char *BufferStart, const char *BufferEnd):
  633. Allocator(Allocator), Diags(Diags), Traits(Traits),
  634. BufferStart(BufferStart), BufferEnd(BufferEnd),
  635. FileLoc(FileLoc), BufferPtr(BufferStart),
  636. CommentState(LCS_BeforeComment), State(LS_Normal) {
  637. }
  638. void Lexer::lex(Token &T) {
  639. again:
  640. switch (CommentState) {
  641. case LCS_BeforeComment:
  642. if (BufferPtr == BufferEnd) {
  643. formTokenWithChars(T, BufferPtr, tok::eof);
  644. return;
  645. }
  646. assert(*BufferPtr == '/');
  647. BufferPtr++; // Skip first slash.
  648. switch(*BufferPtr) {
  649. case '/': { // BCPL comment.
  650. BufferPtr++; // Skip second slash.
  651. if (BufferPtr != BufferEnd) {
  652. // Skip Doxygen magic marker, if it is present.
  653. // It might be missing because of a typo //< or /*<, or because we
  654. // merged this non-Doxygen comment into a bunch of Doxygen comments
  655. // around it: /** ... */ /* ... */ /** ... */
  656. const char C = *BufferPtr;
  657. if (C == '/' || C == '!')
  658. BufferPtr++;
  659. }
  660. // Skip less-than symbol that marks trailing comments.
  661. // Skip it even if the comment is not a Doxygen one, because //< and /*<
  662. // are frequent typos.
  663. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  664. BufferPtr++;
  665. CommentState = LCS_InsideBCPLComment;
  666. if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
  667. State = LS_Normal;
  668. CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
  669. goto again;
  670. }
  671. case '*': { // C comment.
  672. BufferPtr++; // Skip star.
  673. // Skip Doxygen magic marker.
  674. const char C = *BufferPtr;
  675. if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
  676. BufferPtr++;
  677. // Skip less-than symbol that marks trailing comments.
  678. if (BufferPtr != BufferEnd && *BufferPtr == '<')
  679. BufferPtr++;
  680. CommentState = LCS_InsideCComment;
  681. State = LS_Normal;
  682. CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
  683. goto again;
  684. }
  685. default:
  686. llvm_unreachable("second character of comment should be '/' or '*'");
  687. }
  688. case LCS_BetweenComments: {
  689. // Consecutive comments are extracted only if there is only whitespace
  690. // between them. So we can search for the start of the next comment.
  691. const char *EndWhitespace = BufferPtr;
  692. while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
  693. EndWhitespace++;
  694. // Turn any whitespace between comments (and there is only whitespace
  695. // between them -- guaranteed by comment extraction) into a newline. We
  696. // have two newlines between C comments in total (first one was synthesized
  697. // after a comment).
  698. formTokenWithChars(T, EndWhitespace, tok::newline);
  699. CommentState = LCS_BeforeComment;
  700. break;
  701. }
  702. case LCS_InsideBCPLComment:
  703. case LCS_InsideCComment:
  704. if (BufferPtr != CommentEnd) {
  705. lexCommentText(T);
  706. break;
  707. } else {
  708. // Skip C comment closing sequence.
  709. if (CommentState == LCS_InsideCComment) {
  710. assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
  711. BufferPtr += 2;
  712. assert(BufferPtr <= BufferEnd);
  713. // Synthenize newline just after the C comment, regardless if there is
  714. // actually a newline.
  715. formTokenWithChars(T, BufferPtr, tok::newline);
  716. CommentState = LCS_BetweenComments;
  717. break;
  718. } else {
  719. // Don't synthesized a newline after BCPL comment.
  720. CommentState = LCS_BetweenComments;
  721. goto again;
  722. }
  723. }
  724. }
  725. }
  726. StringRef Lexer::getSpelling(const Token &Tok,
  727. const SourceManager &SourceMgr,
  728. bool *Invalid) const {
  729. SourceLocation Loc = Tok.getLocation();
  730. std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
  731. bool InvalidTemp = false;
  732. StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
  733. if (InvalidTemp) {
  734. *Invalid = true;
  735. return StringRef();
  736. }
  737. const char *Begin = File.data() + LocInfo.second;
  738. return StringRef(Begin, Tok.getLength());
  739. }
  740. } // end namespace comments
  741. } // end namespace clang