LiteralSupport.cpp 58 KB


  1. //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the NumericLiteralParser, CharLiteralParser, and
  11. // StringLiteralParser interfaces.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #include "clang/Lex/LiteralSupport.h"
  15. #include "clang/Basic/CharInfo.h"
  16. #include "clang/Basic/TargetInfo.h"
  17. #include "clang/Lex/LexDiagnostic.h"
  18. #include "clang/Lex/Preprocessor.h"
  19. #include "llvm/ADT/StringExtras.h"
  20. #include "llvm/Support/ConvertUTF.h"
  21. #include "llvm/Support/ErrorHandling.h"
  22. using namespace clang;
  23. static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  24. switch (kind) {
  25. default: llvm_unreachable("Unknown token type!");
  26. case tok::char_constant:
  27. case tok::string_literal:
  28. case tok::utf8_char_constant:
  29. case tok::utf8_string_literal:
  30. return Target.getCharWidth();
  31. case tok::wide_char_constant:
  32. case tok::wide_string_literal:
  33. return Target.getWCharWidth();
  34. case tok::utf16_char_constant:
  35. case tok::utf16_string_literal:
  36. return Target.getChar16Width();
  37. case tok::utf32_char_constant:
  38. case tok::utf32_string_literal:
  39. return Target.getChar32Width();
  40. }
  41. }
  42. static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
  43. FullSourceLoc TokLoc,
  44. const char *TokBegin,
  45. const char *TokRangeBegin,
  46. const char *TokRangeEnd) {
  47. SourceLocation Begin =
  48. Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  49. TokLoc.getManager(), Features);
  50. SourceLocation End =
  51. Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
  52. TokLoc.getManager(), Features);
  53. return CharSourceRange::getCharRange(Begin, End);
  54. }
  55. /// \brief Produce a diagnostic highlighting some portion of a literal.
  56. ///
  57. /// Emits the diagnostic \p DiagID, highlighting the range of characters from
  58. /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
  59. /// a substring of a spelling buffer for the token beginning at \p TokBegin.
  60. static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
  61. const LangOptions &Features, FullSourceLoc TokLoc,
  62. const char *TokBegin, const char *TokRangeBegin,
  63. const char *TokRangeEnd, unsigned DiagID) {
  64. SourceLocation Begin =
  65. Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
  66. TokLoc.getManager(), Features);
  67. return Diags->Report(Begin, DiagID) <<
  68. MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
  69. }
  70. /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  71. /// either a character or a string literal.
  72. static unsigned ProcessCharEscape(const char *ThisTokBegin,
  73. const char *&ThisTokBuf,
  74. const char *ThisTokEnd, bool &HadError,
  75. FullSourceLoc Loc, unsigned CharWidth,
  76. DiagnosticsEngine *Diags,
  77. const LangOptions &Features) {
  78. const char *EscapeBegin = ThisTokBuf;
  79. // Skip the '\' char.
  80. ++ThisTokBuf;
  81. // We know that this character can't be off the end of the buffer, because
  82. // that would have been \", which would not have been the end of string.
  83. unsigned ResultChar = *ThisTokBuf++;
  84. switch (ResultChar) {
  85. // These map to themselves.
  86. case '\\': case '\'': case '"': case '?': break;
  87. // These have fixed mappings.
  88. case 'a':
  89. // TODO: K&R: the meaning of '\\a' is different in traditional C
  90. ResultChar = 7;
  91. break;
  92. case 'b':
  93. ResultChar = 8;
  94. break;
  95. case 'e':
  96. if (Diags)
  97. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  98. diag::ext_nonstandard_escape) << "e";
  99. ResultChar = 27;
  100. break;
  101. case 'E':
  102. if (Diags)
  103. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  104. diag::ext_nonstandard_escape) << "E";
  105. ResultChar = 27;
  106. break;
  107. case 'f':
  108. ResultChar = 12;
  109. break;
  110. case 'n':
  111. ResultChar = 10;
  112. break;
  113. case 'r':
  114. ResultChar = 13;
  115. break;
  116. case 't':
  117. ResultChar = 9;
  118. break;
  119. case 'v':
  120. ResultChar = 11;
  121. break;
  122. case 'x': { // Hex escape.
  123. ResultChar = 0;
  124. if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
  125. if (Diags)
  126. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  127. diag::err_hex_escape_no_digits) << "x";
  128. HadError = 1;
  129. break;
  130. }
  131. // Hex escapes are a maximal series of hex digits.
  132. bool Overflow = false;
  133. for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
  134. int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
  135. if (CharVal == -1) break;
  136. // About to shift out a digit?
  137. if (ResultChar & 0xF0000000)
  138. Overflow = true;
  139. ResultChar <<= 4;
  140. ResultChar |= CharVal;
  141. }
  142. // See if any bits will be truncated when evaluated as a character.
  143. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
  144. Overflow = true;
  145. ResultChar &= ~0U >> (32-CharWidth);
  146. }
  147. // Check for overflow.
  148. if (Overflow && Diags) // Too many digits to fit in
  149. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  150. diag::err_hex_escape_too_large);
  151. break;
  152. }
  153. case '0': case '1': case '2': case '3':
  154. case '4': case '5': case '6': case '7': {
  155. // Octal escapes.
  156. --ThisTokBuf;
  157. ResultChar = 0;
  158. // Octal escapes are a series of octal digits with maximum length 3.
  159. // "\0123" is a two digit sequence equal to "\012" "3".
  160. unsigned NumDigits = 0;
  161. do {
  162. ResultChar <<= 3;
  163. ResultChar |= *ThisTokBuf++ - '0';
  164. ++NumDigits;
  165. } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
  166. ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
  167. // Check for overflow. Reject '\777', but not L'\777'.
  168. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
  169. if (Diags)
  170. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  171. diag::err_octal_escape_too_large);
  172. ResultChar &= ~0U >> (32-CharWidth);
  173. }
  174. break;
  175. }
  176. // Otherwise, these are not valid escapes.
  177. case '(': case '{': case '[': case '%':
  178. // GCC accepts these as extensions. We warn about them as such though.
  179. if (Diags)
  180. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  181. diag::ext_nonstandard_escape)
  182. << std::string(1, ResultChar);
  183. break;
  184. default:
  185. if (!Diags)
  186. break;
  187. if (isPrintable(ResultChar))
  188. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  189. diag::ext_unknown_escape)
  190. << std::string(1, ResultChar);
  191. else
  192. Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
  193. diag::ext_unknown_escape)
  194. << "x" + llvm::utohexstr(ResultChar);
  195. break;
  196. }
  197. return ResultChar;
  198. }
  199. static void appendCodePoint(unsigned Codepoint,
  200. llvm::SmallVectorImpl<char> &Str) {
  201. char ResultBuf[4];
  202. char *ResultPtr = ResultBuf;
  203. bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
  204. (void)Res;
  205. assert(Res && "Unexpected conversion failure");
  206. Str.append(ResultBuf, ResultPtr);
  207. }
  208. void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  209. for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
  210. if (*I != '\\') {
  211. Buf.push_back(*I);
  212. continue;
  213. }
  214. ++I;
  215. assert(*I == 'u' || *I == 'U');
  216. unsigned NumHexDigits;
  217. if (*I == 'u')
  218. NumHexDigits = 4;
  219. else
  220. NumHexDigits = 8;
  221. assert(I + NumHexDigits <= E);
  222. uint32_t CodePoint = 0;
  223. for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
  224. unsigned Value = llvm::hexDigitValue(*I);
  225. assert(Value != -1U);
  226. CodePoint <<= 4;
  227. CodePoint += Value;
  228. }
  229. appendCodePoint(CodePoint, Buf);
  230. --I;
  231. }
  232. }
  233. /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
  234. /// return the UTF32.
  235. static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  236. const char *ThisTokEnd,
  237. uint32_t &UcnVal, unsigned short &UcnLen,
  238. FullSourceLoc Loc, DiagnosticsEngine *Diags,
  239. const LangOptions &Features,
  240. bool in_char_string_literal = false) {
  241. const char *UcnBegin = ThisTokBuf;
  242. // Skip the '\u' char's.
  243. ThisTokBuf += 2;
  244. if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
  245. if (Diags)
  246. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  247. diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
  248. return false;
  249. }
  250. UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
  251. unsigned short UcnLenSave = UcnLen;
  252. for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
  253. int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
  254. if (CharVal == -1) break;
  255. UcnVal <<= 4;
  256. UcnVal |= CharVal;
  257. }
  258. // If we didn't consume the proper number of digits, there is a problem.
  259. if (UcnLenSave) {
  260. if (Diags)
  261. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  262. diag::err_ucn_escape_incomplete);
  263. return false;
  264. }
  265. // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
  266. if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
  267. UcnVal > 0x10FFFF) { // maximum legal UTF32 value
  268. if (Diags)
  269. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  270. diag::err_ucn_escape_invalid);
  271. return false;
  272. }
  273. // C++11 allows UCNs that refer to control characters and basic source
  274. // characters inside character and string literals
  275. if (UcnVal < 0xa0 &&
  276. (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
  277. bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
  278. if (Diags) {
  279. char BasicSCSChar = UcnVal;
  280. if (UcnVal >= 0x20 && UcnVal < 0x7f)
  281. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  282. IsError ? diag::err_ucn_escape_basic_scs :
  283. diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
  284. << StringRef(&BasicSCSChar, 1);
  285. else
  286. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  287. IsError ? diag::err_ucn_control_character :
  288. diag::warn_cxx98_compat_literal_ucn_control_character);
  289. }
  290. if (IsError)
  291. return false;
  292. }
  293. if (!Features.CPlusPlus && !Features.C99 && Diags)
  294. Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
  295. diag::warn_ucn_not_valid_in_c89_literal);
  296. return true;
  297. }
  298. /// MeasureUCNEscape - Determine the number of bytes within the resulting string
  299. /// which this UCN will occupy.
  300. static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  301. const char *ThisTokEnd, unsigned CharByteWidth,
  302. const LangOptions &Features, bool &HadError) {
  303. // UTF-32: 4 bytes per escape.
  304. if (CharByteWidth == 4)
  305. return 4;
  306. uint32_t UcnVal = 0;
  307. unsigned short UcnLen = 0;
  308. FullSourceLoc Loc;
  309. if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
  310. UcnLen, Loc, nullptr, Features, true)) {
  311. HadError = true;
  312. return 0;
  313. }
  314. // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
  315. if (CharByteWidth == 2)
  316. return UcnVal <= 0xFFFF ? 2 : 4;
  317. // UTF-8.
  318. if (UcnVal < 0x80)
  319. return 1;
  320. if (UcnVal < 0x800)
  321. return 2;
  322. if (UcnVal < 0x10000)
  323. return 3;
  324. return 4;
  325. }
  326. /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
  327. /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
  328. /// StringLiteralParser. When we decide to implement UCN's for identifiers,
  329. /// we will likely rework our support for UCN's.
  330. static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
  331. const char *ThisTokEnd,
  332. _Out_cap_(4) char *&ResultBuf, bool &HadError,
  333. FullSourceLoc Loc, unsigned CharByteWidth,
  334. DiagnosticsEngine *Diags,
  335. const LangOptions &Features) {
  336. typedef uint32_t UTF32;
  337. UTF32 UcnVal = 0;
  338. unsigned short UcnLen = 0;
  339. if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
  340. Loc, Diags, Features, true)) {
  341. HadError = true;
  342. return;
  343. }
  344. assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
  345. "only character widths of 1, 2, or 4 bytes supported");
  346. (void)UcnLen;
  347. assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
  348. if (CharByteWidth == 4) {
  349. // FIXME: Make the type of the result buffer correct instead of
  350. // using reinterpret_cast.
  351. UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
  352. *ResultPtr = UcnVal;
  353. ResultBuf += 4;
  354. return;
  355. }
  356. if (CharByteWidth == 2) {
  357. // FIXME: Make the type of the result buffer correct instead of
  358. // using reinterpret_cast.
  359. UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
  360. if (UcnVal <= (UTF32)0xFFFF) {
  361. *ResultPtr = UcnVal;
  362. ResultBuf += 2;
  363. return;
  364. }
  365. // Convert to UTF16.
  366. UcnVal -= 0x10000;
  367. *ResultPtr = 0xD800 + (UcnVal >> 10);
  368. *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
  369. ResultBuf += 4;
  370. return;
  371. }
  372. assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
  373. // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
  374. // The conversion below was inspired by:
  375. // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
  376. // First, we determine how many bytes the result will require.
  377. typedef uint8_t UTF8;
  378. unsigned short bytesToWrite = 0;
  379. if (UcnVal < (UTF32)0x80)
  380. bytesToWrite = 1;
  381. else if (UcnVal < (UTF32)0x800)
  382. bytesToWrite = 2;
  383. else if (UcnVal < (UTF32)0x10000)
  384. bytesToWrite = 3;
  385. else
  386. bytesToWrite = 4;
  387. const unsigned byteMask = 0xBF;
  388. const unsigned byteMark = 0x80;
  389. // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
  390. // into the first byte, depending on how many bytes follow.
  391. static const UTF8 firstByteMark[5] = {
  392. 0x00, 0x00, 0xC0, 0xE0, 0xF0
  393. };
  394. // Finally, we write the bytes into ResultBuf.
  395. ResultBuf += bytesToWrite;
  396. switch (bytesToWrite) { // note: everything falls through.
  397. case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  398. case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  399. case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
  400. case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
  401. }
  402. // Update the buffer.
  403. ResultBuf += bytesToWrite;
  404. }
  405. /// integer-constant: [C99 6.4.4.1]
  406. /// decimal-constant integer-suffix
  407. /// octal-constant integer-suffix
  408. /// hexadecimal-constant integer-suffix
  409. /// binary-literal integer-suffix [GNU, C++1y]
  410. /// user-defined-integer-literal: [C++11 lex.ext]
  411. /// decimal-literal ud-suffix
  412. /// octal-literal ud-suffix
  413. /// hexadecimal-literal ud-suffix
  414. /// binary-literal ud-suffix [GNU, C++1y]
  415. /// decimal-constant:
  416. /// nonzero-digit
  417. /// decimal-constant digit
  418. /// octal-constant:
  419. /// 0
  420. /// octal-constant octal-digit
  421. /// hexadecimal-constant:
  422. /// hexadecimal-prefix hexadecimal-digit
  423. /// hexadecimal-constant hexadecimal-digit
  424. /// hexadecimal-prefix: one of
  425. /// 0x 0X
  426. /// binary-literal:
  427. /// 0b binary-digit
  428. /// 0B binary-digit
  429. /// binary-literal binary-digit
  430. /// integer-suffix:
  431. /// unsigned-suffix [long-suffix]
  432. /// unsigned-suffix [long-long-suffix]
  433. /// long-suffix [unsigned-suffix]
  434. /// long-long-suffix [unsigned-suffix]
  435. /// nonzero-digit:
  436. /// 1 2 3 4 5 6 7 8 9
  437. /// octal-digit:
  438. /// 0 1 2 3 4 5 6 7
  439. /// hexadecimal-digit:
  440. /// 0 1 2 3 4 5 6 7 8 9
  441. /// a b c d e f
  442. /// A B C D E F
  443. /// binary-digit:
  444. /// 0
  445. /// 1
  446. /// unsigned-suffix: one of
  447. /// u U
  448. /// long-suffix: one of
  449. /// l L
  450. /// long-long-suffix: one of
  451. /// ll LL
  452. ///
  453. /// floating-constant: [C99 6.4.4.2]
  454. /// TODO: add rules...
  455. ///
  456. NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
  457. SourceLocation TokLoc,
  458. Preprocessor &PP)
  459. : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
  460. // This routine assumes that the range begin/end matches the regex for integer
  461. // and FP constants (specifically, the 'pp-number' regex), and assumes that
  462. // the byte at "*end" is both valid and not part of the regex. Because of
  463. // this, it doesn't have to check for 'overscan' in various places.
  464. assert(!isPreprocessingNumberBody(*ThisTokEnd) || *ThisTokEnd == '.' || *ThisTokEnd == '#' && "didn't maximally munch?"); // HLSL Change - '.' might be a second '.' for a '1.2.x' literal
  465. s = DigitsBegin = ThisTokBegin;
  466. saw_inf = false;
  467. saw_exponent = false;
  468. saw_period = false;
  469. saw_ud_suffix = false;
  470. isLong = false;
  471. isUnsigned = false;
  472. isLongLong = false;
  473. isFloat = false;
  474. isHalf = false; // HLSL Change
  475. isImaginary = false;
  476. MicrosoftInteger = 0;
  477. hadError = false;
  478. if (*s == '0') { // parse radix
  479. ParseNumberStartingWithZero(TokLoc);
  480. if (hadError)
  481. return;
  482. } else { // the first digit is non-zero
  483. radix = 10;
  484. s = SkipDigits(s);
  485. if (s == ThisTokEnd) {
  486. // Done.
  487. } else if (isHexDigit(*s) && !(*s == 'e' || *s == 'E')) {
  488. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
  489. diag::err_invalid_decimal_digit) << StringRef(s, 1);
  490. hadError = true;
  491. return;
  492. } else if (*s == '.') {
  493. checkSeparator(TokLoc, s, CSK_AfterDigits);
  494. s++;
  495. saw_period = true;
  496. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  497. s = SkipDigits(s);
  498. }
  499. if ((*s == 'e' || *s == 'E')) { // exponent
  500. checkSeparator(TokLoc, s, CSK_AfterDigits);
  501. const char *Exponent = s;
  502. s++;
  503. saw_exponent = true;
  504. if (*s == '+' || *s == '-') s++; // sign
  505. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  506. const char *first_non_digit = SkipDigits(s);
  507. if (first_non_digit != s) {
  508. s = first_non_digit;
  509. } else {
  510. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent - ThisTokBegin),
  511. diag::err_exponent_has_no_digits);
  512. hadError = true;
  513. return;
  514. }
  515. }
  516. // HLSL Change Starts
  517. else if (*s == '#') {
  518. const char *InfBegin = s;
  519. if (s[1] == 'I' && s[2] == 'N' && s[3] == 'F') {
  520. saw_inf = true;
  521. if (!saw_period) {
  522. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, InfBegin - ThisTokBegin),
  523. diag::err_invalid_suffix_integer_constant)
  524. << StringRef(InfBegin, ThisTokEnd - InfBegin);
  525. hadError = true;
  526. return;
  527. }
  528. s += 4;
  529. }
  530. }
  531. // HLSL Change Ends
  532. }
  533. SuffixBegin = s;
  534. checkSeparator(TokLoc, s, CSK_AfterDigits);
  535. // Parse the suffix. At this point we can classify whether we have an FP or
  536. // integer constant.
  537. bool isFPConstant = isFloatingLiteral();
  538. const char *ImaginarySuffixLoc = nullptr;
  539. // Loop over all of the characters of the suffix. If we see something bad,
  540. // we break out of the loop.
  541. for (; s != ThisTokEnd; ++s) {
  542. switch (*s) {
  543. case 'f': // FP Suffix for "float"
  544. case 'F':
  545. if (!isFPConstant) break; // Error for integer constant.
  546. if (isFloat || isLong) break; // FF, LF invalid.
  547. isFloat = true;
  548. continue; // Success.
  549. // HLSL Change Starts
  550. // TODO : When we support true half type, these suffixes should be treated differently from f/F
  551. case 'h':
  552. case 'H':
  553. if (!isFPConstant) break;
  554. if (isHalf) break;
  555. isHalf = true;
  556. continue;
  557. // HLSL Change Ends
  558. case 'u':
  559. case 'U':
  560. if (isFPConstant) break; // Error for floating constant.
  561. if (isUnsigned) break; // Cannot be repeated.
  562. isUnsigned = true;
  563. continue; // Success.
  564. case 'l':
  565. case 'L':
  566. if (isLong || isLongLong) break; // Cannot be repeated.
  567. if (isFloat) break; // LF invalid.
  568. // Check for long long. The L's need to be adjacent and the same case.
  569. if (s[1] == s[0]) {
  570. assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
  571. if (isFPConstant) break; // long long invalid for floats.
  572. isLongLong = true;
  573. ++s; // Eat both of them.
  574. } else {
  575. isLong = true;
  576. }
  577. continue; // Success.
  578. case 'i':
  579. case 'I':
  580. if (PP.getLangOpts().MicrosoftExt) {
  581. if (isLong || isLongLong || MicrosoftInteger)
  582. break;
  583. if (!isFPConstant) {
  584. // Allow i8, i16, i32, i64, and i128.
  585. switch (s[1]) {
  586. case '8':
  587. s += 2; // i8 suffix
  588. MicrosoftInteger = 8;
  589. break;
  590. case '1':
  591. if (s[2] == '6') {
  592. s += 3; // i16 suffix
  593. MicrosoftInteger = 16;
  594. } else if (s[2] == '2' && s[3] == '8') {
  595. s += 4; // i128 suffix
  596. MicrosoftInteger = 128;
  597. }
  598. break;
  599. case '3':
  600. if (s[2] == '2') {
  601. s += 3; // i32 suffix
  602. MicrosoftInteger = 32;
  603. }
  604. break;
  605. case '6':
  606. if (s[2] == '4') {
  607. s += 3; // i64 suffix
  608. MicrosoftInteger = 64;
  609. }
  610. break;
  611. default:
  612. break;
  613. }
  614. }
  615. if (MicrosoftInteger) {
  616. assert(s <= ThisTokEnd && "didn't maximally munch?");
  617. break;
  618. }
  619. }
  620. // "i", "if", and "il" are user-defined suffixes in C++1y.
  621. if (*s == 'i' && PP.getLangOpts().CPlusPlus14)
  622. break;
  623. // fall through.
  624. case 'j':
  625. case 'J':
  626. if (isImaginary) break; // Cannot be repeated.
  627. isImaginary = true;
  628. ImaginarySuffixLoc = s;
  629. // HLSL Change Starts.
  630. if (PP.getLangOpts().HLSL) {
  631. // Don't advance; this leaves us with an invalid suffix.
  632. // Great if imaginary literals are implemented at some point, in
  633. // the meantime catches '.#INFI' as an error rather than a suffix
  634. // on an INF literal.
  635. break;
  636. }
  637. // HLSL Change Ends.
  638. continue; // Success.
  639. }
  640. // If we reached here, there was an error or a ud-suffix.
  641. break;
  642. }
  643. if (s != ThisTokEnd) {
  644. // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
  645. expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
  646. if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
  647. // Any suffix pieces we might have parsed are actually part of the
  648. // ud-suffix.
  649. isLong = false;
  650. isUnsigned = false;
  651. isLongLong = false;
  652. isFloat = false;
  653. isImaginary = false;
  654. MicrosoftInteger = 0;
  655. saw_ud_suffix = true;
  656. return;
  657. }
  658. // Report an error if there are any.
  659. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
  660. isFPConstant ? diag::err_invalid_suffix_float_constant :
  661. diag::err_invalid_suffix_integer_constant)
  662. << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
  663. hadError = true;
  664. return;
  665. }
  666. if (isImaginary) {
  667. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc,
  668. ImaginarySuffixLoc - ThisTokBegin),
  669. diag::ext_imaginary_constant);
  670. }
  671. }
  672. /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
  673. /// suffixes as ud-suffixes, because the diagnostic experience is better if we
  674. /// treat it as an invalid suffix.
  675. bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
  676. StringRef Suffix) {
  677. if (!LangOpts.CPlusPlus11 || Suffix.empty())
  678. return false;
  679. // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
  680. if (Suffix[0] == '_')
  681. return true;
  682. // In C++11, there are no library suffixes.
  683. if (!LangOpts.CPlusPlus14)
  684. return false;
  685. // In C++1y, "s", "h", "min", "ms", "us", and "ns" are used in the library.
  686. // Per tweaked N3660, "il", "i", and "if" are also used in the library.
  687. return llvm::StringSwitch<bool>(Suffix)
  688. .Cases("h", "min", "s", true)
  689. .Cases("ms", "us", "ns", true)
  690. .Cases("il", "i", "if", true)
  691. .Default(false);
  692. }
  693. void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
  694. const char *Pos,
  695. CheckSeparatorKind IsAfterDigits) {
  696. if (IsAfterDigits == CSK_AfterDigits) {
  697. if (Pos == ThisTokBegin)
  698. return;
  699. --Pos;
  700. } else if (Pos == ThisTokEnd)
  701. return;
  702. if (isDigitSeparator(*Pos))
  703. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
  704. diag::err_digit_separator_not_between_digits)
  705. << IsAfterDigits;
  706. }
  707. /// ParseNumberStartingWithZero - This method is called when the first character
  708. /// of the number is found to be a zero. This means it is either an octal
  709. /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
  710. /// a floating point number (01239.123e4). Eat the prefix, determining the
  711. /// radix etc.
  712. void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  713. assert(s[0] == '0' && "Invalid method call");
  714. s++;
  715. int c1 = s[0];
  716. // Handle a hex number like 0x1234.
  717. if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
  718. s++;
  719. assert(s < ThisTokEnd && "didn't maximally munch?");
  720. radix = 16;
  721. DigitsBegin = s;
  722. s = SkipHexDigits(s);
  723. bool noSignificand = (s == DigitsBegin);
  724. if (s == ThisTokEnd) {
  725. // Done.
  726. } else if (*s == '.') {
  727. s++;
  728. saw_period = true;
  729. const char *floatDigitsBegin = s;
  730. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  731. s = SkipHexDigits(s);
  732. noSignificand &= (floatDigitsBegin == s);
  733. }
  734. if (noSignificand) {
  735. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
  736. diag::err_hexconstant_requires_digits);
  737. hadError = true;
  738. return;
  739. }
  740. // A binary exponent can appear with or with a '.'. If dotted, the
  741. // binary exponent is required.
  742. if (*s == 'p' || *s == 'P') {
  743. checkSeparator(TokLoc, s, CSK_AfterDigits);
  744. const char *Exponent = s;
  745. s++;
  746. saw_exponent = true;
  747. if (*s == '+' || *s == '-') s++; // sign
  748. const char *first_non_digit = SkipDigits(s);
  749. if (first_non_digit == s) {
  750. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
  751. diag::err_exponent_has_no_digits);
  752. hadError = true;
  753. return;
  754. }
  755. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  756. s = first_non_digit;
  757. if (!PP.getLangOpts().HexFloats)
  758. PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
  759. } else if (saw_period) {
  760. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
  761. diag::err_hexconstant_requires_exponent);
  762. hadError = true;
  763. }
  764. return;
  765. }
  766. // Handle simple binary numbers 0b01010
  767. if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
  768. // 0b101010 is a C++1y / GCC extension.
  769. PP.Diag(TokLoc,
  770. PP.getLangOpts().CPlusPlus14
  771. ? diag::warn_cxx11_compat_binary_literal
  772. : PP.getLangOpts().CPlusPlus
  773. ? diag::ext_binary_literal_cxx14
  774. : diag::ext_binary_literal);
  775. ++s;
  776. assert(s < ThisTokEnd && "didn't maximally munch?");
  777. radix = 2;
  778. DigitsBegin = s;
  779. s = SkipBinaryDigits(s);
  780. if (s == ThisTokEnd) {
  781. // Done.
  782. } else if (isHexDigit(*s)) {
  783. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
  784. diag::err_invalid_binary_digit) << StringRef(s, 1);
  785. hadError = true;
  786. }
  787. // Other suffixes will be diagnosed by the caller.
  788. return;
  789. }
  790. // For now, the radix is set to 8. If we discover that we have a
  791. // floating point constant, the radix will change to 10. Octal floating
  792. // point constants are not permitted (only decimal and hexadecimal).
  793. radix = 8;
  794. DigitsBegin = s;
  795. s = SkipOctalDigits(s);
  796. if (s == ThisTokEnd)
  797. return; // Done, simple octal number like 01234
  798. // If we have some other non-octal digit that *is* a decimal digit, see if
  799. // this is part of a floating point number like 094.123 or 09e1.
  800. if (isDigit(*s)) {
  801. const char *EndDecimal = SkipDigits(s);
  802. if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
  803. s = EndDecimal;
  804. radix = 10;
  805. }
  806. }
  807. // If we have a hex digit other than 'e' (which denotes a FP exponent) then
  808. // the code is using an incorrect base.
  809. if (isHexDigit(*s) && *s != 'e' && *s != 'E') {
  810. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
  811. diag::err_invalid_octal_digit) << StringRef(s, 1);
  812. hadError = true;
  813. return;
  814. }
  815. if (*s == '.') {
  816. s++;
  817. radix = 10;
  818. saw_period = true;
  819. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  820. s = SkipDigits(s); // Skip suffix.
  821. }
  822. if (*s == 'e' || *s == 'E') { // exponent
  823. checkSeparator(TokLoc, s, CSK_AfterDigits);
  824. const char *Exponent = s;
  825. s++;
  826. radix = 10;
  827. saw_exponent = true;
  828. if (*s == '+' || *s == '-') s++; // sign
  829. const char *first_non_digit = SkipDigits(s);
  830. if (first_non_digit != s) {
  831. checkSeparator(TokLoc, s, CSK_BeforeDigits);
  832. s = first_non_digit;
  833. } else {
  834. PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
  835. diag::err_exponent_has_no_digits);
  836. hadError = true;
  837. return;
  838. }
  839. }
  840. }
  841. static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
  842. switch (Radix) {
  843. case 2:
  844. return NumDigits <= 64;
  845. case 8:
  846. return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
  847. case 10:
  848. return NumDigits <= 19; // floor(log10(2^64))
  849. case 16:
  850. return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
  851. default:
  852. llvm_unreachable("impossible Radix");
  853. }
  854. }
  855. /// GetIntegerValue - Convert this numeric literal value to an APInt that
  856. /// matches Val's input width. If there is an overflow, set Val to the low bits
  857. /// of the result and return true. Otherwise, return false.
  858. bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
  859. // Fast path: Compute a conservative bound on the maximum number of
  860. // bits per digit in this radix. If we can't possibly overflow a
  861. // uint64 based on that bound then do the simple conversion to
  862. // integer. This avoids the expensive overflow checking below, and
  863. // handles the common cases that matter (small decimal integers and
  864. // hex/octal values which don't overflow).
  865. const unsigned NumDigits = SuffixBegin - DigitsBegin;
  866. if (alwaysFitsInto64Bits(radix, NumDigits)) {
  867. uint64_t N = 0;
  868. for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
  869. if (!isDigitSeparator(*Ptr))
  870. N = N * radix + llvm::hexDigitValue(*Ptr);
  871. // This will truncate the value to Val's input width. Simply check
  872. // for overflow by comparing.
  873. Val = N;
  874. return Val.getZExtValue() != N;
  875. }
  876. Val = 0;
  877. const char *Ptr = DigitsBegin;
  878. llvm::APInt RadixVal(Val.getBitWidth(), radix);
  879. llvm::APInt CharVal(Val.getBitWidth(), 0);
  880. llvm::APInt OldVal = Val;
  881. bool OverflowOccurred = false;
  882. while (Ptr < SuffixBegin) {
  883. if (isDigitSeparator(*Ptr)) {
  884. ++Ptr;
  885. continue;
  886. }
  887. unsigned C = llvm::hexDigitValue(*Ptr++);
  888. // If this letter is out of bound for this radix, reject it.
  889. assert(C < radix && "NumericLiteralParser ctor should have rejected this");
  890. CharVal = C;
  891. // Add the digit to the value in the appropriate radix. If adding in digits
  892. // made the value smaller, then this overflowed.
  893. OldVal = Val;
  894. // Multiply by radix, did overflow occur on the multiply?
  895. Val *= RadixVal;
  896. OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
  897. // Add value, did overflow occur on the value?
  898. // (a + b) ult b <=> overflow
  899. Val += CharVal;
  900. OverflowOccurred |= Val.ult(CharVal);
  901. }
  902. return OverflowOccurred;
  903. }
  904. llvm::APFloat::opStatus
  905. NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  906. using llvm::APFloat;
  907. unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
  908. llvm::SmallString<16> Buffer;
  909. StringRef Str(ThisTokBegin, n);
  910. if (Str.find('\'') != StringRef::npos) {
  911. Buffer.reserve(n);
  912. std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
  913. &isDigitSeparator);
  914. Str = Buffer;
  915. }
  916. return Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
  917. }
  918. /// \verbatim
  919. /// user-defined-character-literal: [C++11 lex.ext]
  920. /// character-literal ud-suffix
  921. /// ud-suffix:
  922. /// identifier
  923. /// character-literal: [C++11 lex.ccon]
  924. /// ' c-char-sequence '
  925. /// u' c-char-sequence '
  926. /// U' c-char-sequence '
  927. /// L' c-char-sequence '
  928. /// c-char-sequence:
  929. /// c-char
  930. /// c-char-sequence c-char
  931. /// c-char:
  932. /// any member of the source character set except the single-quote ',
  933. /// backslash \, or new-line character
  934. /// escape-sequence
  935. /// universal-character-name
  936. /// escape-sequence:
  937. /// simple-escape-sequence
  938. /// octal-escape-sequence
  939. /// hexadecimal-escape-sequence
  940. /// simple-escape-sequence:
  941. /// one of \' \" \? \\ \a \b \f \n \r \t \v
  942. /// octal-escape-sequence:
  943. /// \ octal-digit
  944. /// \ octal-digit octal-digit
  945. /// \ octal-digit octal-digit octal-digit
  946. /// hexadecimal-escape-sequence:
  947. /// \x hexadecimal-digit
  948. /// hexadecimal-escape-sequence hexadecimal-digit
  949. /// universal-character-name: [C++11 lex.charset]
  950. /// \u hex-quad
  951. /// \U hex-quad hex-quad
  952. /// hex-quad:
  953. /// hex-digit hex-digit hex-digit hex-digit
  954. /// \endverbatim
  955. ///
  956. CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
  957. SourceLocation Loc, Preprocessor &PP,
  958. tok::TokenKind kind) {
  959. // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  960. HadError = false;
  961. Kind = kind;
  962. const char *TokBegin = begin;
  963. // Skip over wide character determinant.
  964. if (Kind != tok::char_constant)
  965. ++begin;
  966. if (Kind == tok::utf8_char_constant)
  967. ++begin;
  968. // Skip over the entry quote.
  969. assert(begin[0] == '\'' && "Invalid token lexed");
  970. ++begin;
  971. // Remove an optional ud-suffix.
  972. if (end[-1] != '\'') {
  973. const char *UDSuffixEnd = end;
  974. do {
  975. --end;
  976. } while (end[-1] != '\'');
  977. // FIXME: Don't bother with this if !tok.hasUCN().
  978. expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
  979. UDSuffixOffset = end - TokBegin;
  980. }
  981. // Trim the ending quote.
  982. assert(end != begin && "Invalid token lexed");
  983. --end;
  984. // FIXME: The "Value" is an uint64_t so we can handle char literals of
  985. // up to 64-bits.
  986. // FIXME: This extensively assumes that 'char' is 8-bits.
  987. assert(PP.getTargetInfo().getCharWidth() == 8 &&
  988. "Assumes char is 8 bits");
  989. assert(PP.getTargetInfo().getIntWidth() <= 64 &&
  990. (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
  991. "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  992. assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
  993. "Assumes sizeof(wchar) on target is <= 64");
  994. SmallVector<uint32_t, 4> codepoint_buffer;
  995. codepoint_buffer.resize(end - begin);
  996. uint32_t *buffer_begin = &codepoint_buffer.front();
  997. uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
  998. // Unicode escapes representing characters that cannot be correctly
  999. // represented in a single code unit are disallowed in character literals
  1000. // by this implementation.
  1001. uint32_t largest_character_for_kind;
  1002. if (tok::wide_char_constant == Kind) {
  1003. largest_character_for_kind =
  1004. 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
  1005. } else if (tok::utf8_char_constant == Kind) {
  1006. largest_character_for_kind = 0x7F;
  1007. } else if (tok::utf16_char_constant == Kind) {
  1008. largest_character_for_kind = 0xFFFF;
  1009. } else if (tok::utf32_char_constant == Kind) {
  1010. largest_character_for_kind = 0x10FFFF;
  1011. } else {
  1012. largest_character_for_kind = 0x7Fu;
  1013. }
  1014. while (begin != end) {
  1015. // Is this a span of non-escape characters?
  1016. if (begin[0] != '\\') {
  1017. char const *start = begin;
  1018. do {
  1019. ++begin;
  1020. } while (begin != end && *begin != '\\');
  1021. char const *tmp_in_start = start;
  1022. uint32_t *tmp_out_start = buffer_begin;
  1023. ConversionResult res =
  1024. ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
  1025. reinterpret_cast<UTF8 const *>(begin),
  1026. &buffer_begin, buffer_end, strictConversion);
  1027. if (res != conversionOK) {
  1028. // If we see bad encoding for unprefixed character literals, warn and
  1029. // simply copy the byte values, for compatibility with gcc and
  1030. // older versions of clang.
  1031. bool NoErrorOnBadEncoding = isAscii();
  1032. unsigned Msg = diag::err_bad_character_encoding;
  1033. if (NoErrorOnBadEncoding)
  1034. Msg = diag::warn_bad_character_encoding;
  1035. PP.Diag(Loc, Msg);
  1036. if (NoErrorOnBadEncoding) {
  1037. start = tmp_in_start;
  1038. buffer_begin = tmp_out_start;
  1039. for (; start != begin; ++start, ++buffer_begin)
  1040. *buffer_begin = static_cast<uint8_t>(*start);
  1041. } else {
  1042. HadError = true;
  1043. }
  1044. } else {
  1045. for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
  1046. if (*tmp_out_start > largest_character_for_kind) {
  1047. HadError = true;
  1048. PP.Diag(Loc, diag::err_character_too_large);
  1049. }
  1050. }
  1051. }
  1052. continue;
  1053. }
  1054. // Is this a Universal Character Name escape?
  1055. if (begin[1] == 'u' || begin[1] == 'U') {
  1056. unsigned short UcnLen = 0;
  1057. if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
  1058. FullSourceLoc(Loc, PP.getSourceManager()),
  1059. &PP.getDiagnostics(), PP.getLangOpts(), true)) {
  1060. HadError = true;
  1061. } else if (*buffer_begin > largest_character_for_kind) {
  1062. HadError = true;
  1063. PP.Diag(Loc, diag::err_character_too_large);
  1064. }
  1065. ++buffer_begin;
  1066. continue;
  1067. }
  1068. unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
  1069. uint64_t result =
  1070. ProcessCharEscape(TokBegin, begin, end, HadError,
  1071. FullSourceLoc(Loc,PP.getSourceManager()),
  1072. CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
  1073. *buffer_begin++ = result;
  1074. }
  1075. unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
  1076. if (NumCharsSoFar > 1) {
  1077. if (isWide())
  1078. PP.Diag(Loc, diag::warn_extraneous_char_constant);
  1079. else if (isAscii() && NumCharsSoFar == 4)
  1080. PP.Diag(Loc, diag::ext_four_char_character_literal);
  1081. else if (isAscii())
  1082. PP.Diag(Loc, diag::ext_multichar_character_literal);
  1083. else
  1084. PP.Diag(Loc, diag::err_multichar_utf_character_literal);
  1085. IsMultiChar = true;
  1086. } else {
  1087. IsMultiChar = false;
  1088. }
  1089. llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
  1090. // Narrow character literals act as though their value is concatenated
  1091. // in this implementation, but warn on overflow.
  1092. bool multi_char_too_long = false;
  1093. if (isAscii() && isMultiChar()) {
  1094. LitVal = 0;
  1095. for (size_t i = 0; i < NumCharsSoFar; ++i) {
  1096. // check for enough leading zeros to shift into
  1097. multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
  1098. LitVal <<= 8;
  1099. LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
  1100. }
  1101. } else if (NumCharsSoFar > 0) {
  1102. // otherwise just take the last character
  1103. LitVal = buffer_begin[-1];
  1104. }
  1105. if (!HadError && multi_char_too_long) {
  1106. PP.Diag(Loc, diag::warn_char_constant_too_large);
  1107. }
  1108. // Transfer the value from APInt to uint64_t
  1109. Value = LitVal.getZExtValue();
  1110. // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  1111. // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
  1112. // character constants are not sign extended in the this implementation:
  1113. // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
  1114. if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
  1115. PP.getLangOpts().CharIsSigned)
  1116. Value = (signed char)Value;
  1117. }
  1118. /// \verbatim
  1119. /// string-literal: [C++0x lex.string]
  1120. /// encoding-prefix " [s-char-sequence] "
  1121. /// encoding-prefix R raw-string
  1122. /// encoding-prefix:
  1123. /// u8
  1124. /// u
  1125. /// U
  1126. /// L
  1127. /// s-char-sequence:
  1128. /// s-char
  1129. /// s-char-sequence s-char
  1130. /// s-char:
  1131. /// any member of the source character set except the double-quote ",
  1132. /// backslash \, or new-line character
  1133. /// escape-sequence
  1134. /// universal-character-name
  1135. /// raw-string:
  1136. /// " d-char-sequence ( r-char-sequence ) d-char-sequence "
  1137. /// r-char-sequence:
  1138. /// r-char
  1139. /// r-char-sequence r-char
  1140. /// r-char:
  1141. /// any member of the source character set, except a right parenthesis )
  1142. /// followed by the initial d-char-sequence (which may be empty)
  1143. /// followed by a double quote ".
  1144. /// d-char-sequence:
  1145. /// d-char
  1146. /// d-char-sequence d-char
  1147. /// d-char:
  1148. /// any member of the basic source character set except:
  1149. /// space, the left parenthesis (, the right parenthesis ),
  1150. /// the backslash \, and the control characters representing horizontal
  1151. /// tab, vertical tab, form feed, and newline.
  1152. /// escape-sequence: [C++0x lex.ccon]
  1153. /// simple-escape-sequence
  1154. /// octal-escape-sequence
  1155. /// hexadecimal-escape-sequence
  1156. /// simple-escape-sequence:
  1157. /// one of \' \" \? \\ \a \b \f \n \r \t \v
  1158. /// octal-escape-sequence:
  1159. /// \ octal-digit
  1160. /// \ octal-digit octal-digit
  1161. /// \ octal-digit octal-digit octal-digit
  1162. /// hexadecimal-escape-sequence:
  1163. /// \x hexadecimal-digit
  1164. /// hexadecimal-escape-sequence hexadecimal-digit
  1165. /// universal-character-name:
  1166. /// \u hex-quad
  1167. /// \U hex-quad hex-quad
  1168. /// hex-quad:
  1169. /// hex-digit hex-digit hex-digit hex-digit
  1170. /// \endverbatim
  1171. ///
  1172. StringLiteralParser::
  1173. StringLiteralParser(ArrayRef<Token> StringToks,
  1174. Preprocessor &PP, bool Complain)
  1175. : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
  1176. Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
  1177. MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
  1178. ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
  1179. init(StringToks);
  1180. }
  1181. void StringLiteralParser::init(ArrayRef<Token> StringToks){
  1182. // The literal token may have come from an invalid source location (e.g. due
  1183. // to a PCH error), in which case the token length will be 0.
  1184. if (StringToks.empty() || StringToks[0].getLength() < 2)
  1185. return DiagnoseLexingError(SourceLocation());
  1186. // Scan all of the string portions, remember the max individual token length,
  1187. // computing a bound on the concatenated string length, and see whether any
  1188. // piece is a wide-string. If any of the string portions is a wide-string
  1189. // literal, the result is a wide-string literal [C99 6.4.5p4].
  1190. assert(!StringToks.empty() && "expected at least one token");
  1191. MaxTokenLength = StringToks[0].getLength();
  1192. assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  1193. SizeBound = StringToks[0].getLength()-2; // -2 for "".
  1194. Kind = StringToks[0].getKind();
  1195. hadError = false;
  1196. // Implement Translation Phase #6: concatenation of string literals
  1197. /// (C99 5.1.1.2p1). The common case is only one string fragment.
  1198. for (unsigned i = 1; i != StringToks.size(); ++i) {
  1199. if (StringToks[i].getLength() < 2)
  1200. return DiagnoseLexingError(StringToks[i].getLocation());
  1201. // The string could be shorter than this if it needs cleaning, but this is a
  1202. // reasonable bound, which is all we need.
  1203. assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
  1204. SizeBound += StringToks[i].getLength()-2; // -2 for "".
  1205. // Remember maximum string piece length.
  1206. if (StringToks[i].getLength() > MaxTokenLength)
  1207. MaxTokenLength = StringToks[i].getLength();
  1208. // Remember if we see any wide or utf-8/16/32 strings.
  1209. // Also check for illegal concatenations.
  1210. if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
  1211. if (isAscii()) {
  1212. Kind = StringToks[i].getKind();
  1213. } else {
  1214. if (Diags)
  1215. Diags->Report(StringToks[i].getLocation(),
  1216. diag::err_unsupported_string_concat);
  1217. hadError = true;
  1218. }
  1219. }
  1220. }
  1221. // Include space for the null terminator.
  1222. ++SizeBound;
  1223. // TODO: K&R warning: "traditional C rejects string constant concatenation"
  1224. // Get the width in bytes of char/wchar_t/char16_t/char32_t
  1225. CharByteWidth = getCharWidth(Kind, Target);
  1226. assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  1227. CharByteWidth /= 8;
  1228. // The output buffer size needs to be large enough to hold wide characters.
  1229. // This is a worst-case assumption which basically corresponds to L"" "long".
  1230. SizeBound *= CharByteWidth;
  1231. // Size the temporary buffer to hold the result string data.
  1232. ResultBuf.resize(SizeBound);
  1233. // Likewise, but for each string piece.
  1234. SmallString<512> TokenBuf;
  1235. TokenBuf.resize(MaxTokenLength);
  1236. // Loop over all the strings, getting their spelling, and expanding them to
  1237. // wide strings as appropriate.
  1238. ResultPtr = &ResultBuf[0]; // Next byte to fill in.
  1239. Pascal = false;
  1240. SourceLocation UDSuffixTokLoc;
  1241. for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
  1242. const char *ThisTokBuf = &TokenBuf[0];
  1243. // Get the spelling of the token, which eliminates trigraphs, etc. We know
  1244. // that ThisTokBuf points to a buffer that is big enough for the whole token
  1245. // and 'spelled' tokens can only shrink.
  1246. bool StringInvalid = false;
  1247. unsigned ThisTokLen =
  1248. Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
  1249. &StringInvalid);
  1250. if (StringInvalid)
  1251. return DiagnoseLexingError(StringToks[i].getLocation());
  1252. const char *ThisTokBegin = ThisTokBuf;
  1253. const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
  1254. // Remove an optional ud-suffix.
  1255. if (ThisTokEnd[-1] != '"') {
  1256. const char *UDSuffixEnd = ThisTokEnd;
  1257. do {
  1258. --ThisTokEnd;
  1259. } while (ThisTokEnd[-1] != '"');
  1260. StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
  1261. if (UDSuffixBuf.empty()) {
  1262. if (StringToks[i].hasUCN())
  1263. expandUCNs(UDSuffixBuf, UDSuffix);
  1264. else
  1265. UDSuffixBuf.assign(UDSuffix);
  1266. UDSuffixToken = i;
  1267. UDSuffixOffset = ThisTokEnd - ThisTokBuf;
  1268. UDSuffixTokLoc = StringToks[i].getLocation();
  1269. } else {
  1270. SmallString<32> ExpandedUDSuffix;
  1271. if (StringToks[i].hasUCN()) {
  1272. expandUCNs(ExpandedUDSuffix, UDSuffix);
  1273. UDSuffix = ExpandedUDSuffix;
  1274. }
  1275. // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
  1276. // result of a concatenation involving at least one user-defined-string-
  1277. // literal, all the participating user-defined-string-literals shall
  1278. // have the same ud-suffix.
  1279. if (UDSuffixBuf != UDSuffix) {
  1280. if (Diags) {
  1281. SourceLocation TokLoc = StringToks[i].getLocation();
  1282. Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
  1283. << UDSuffixBuf << UDSuffix
  1284. << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
  1285. << SourceRange(TokLoc, TokLoc);
  1286. }
  1287. hadError = true;
  1288. }
  1289. }
  1290. }
  1291. // Strip the end quote.
  1292. --ThisTokEnd;
  1293. // TODO: Input character set mapping support.
  1294. // Skip marker for wide or unicode strings.
  1295. if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
  1296. ++ThisTokBuf;
  1297. // Skip 8 of u8 marker for utf8 strings.
  1298. if (ThisTokBuf[0] == '8')
  1299. ++ThisTokBuf;
  1300. }
  1301. // Check for raw string
  1302. if (ThisTokBuf[0] == 'R') {
  1303. ThisTokBuf += 2; // skip R"
  1304. const char *Prefix = ThisTokBuf;
  1305. while (ThisTokBuf[0] != '(')
  1306. ++ThisTokBuf;
  1307. ++ThisTokBuf; // skip '('
  1308. // Remove same number of characters from the end
  1309. ThisTokEnd -= ThisTokBuf - Prefix;
  1310. assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
  1311. // Copy the string over
  1312. if (CopyStringFragment(StringToks[i], ThisTokBegin,
  1313. StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
  1314. hadError = true;
  1315. } else {
  1316. if (ThisTokBuf[0] != '"') {
  1317. // The file may have come from PCH and then changed after loading the
  1318. // PCH; Fail gracefully.
  1319. return DiagnoseLexingError(StringToks[i].getLocation());
  1320. }
  1321. ++ThisTokBuf; // skip "
  1322. // Check if this is a pascal string
  1323. if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
  1324. ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
  1325. // If the \p sequence is found in the first token, we have a pascal string
  1326. // Otherwise, if we already have a pascal string, ignore the first \p
  1327. if (i == 0) {
  1328. ++ThisTokBuf;
  1329. Pascal = true;
  1330. } else if (Pascal)
  1331. ThisTokBuf += 2;
  1332. }
  1333. while (ThisTokBuf != ThisTokEnd) {
  1334. // Is this a span of non-escape characters?
  1335. if (ThisTokBuf[0] != '\\') {
  1336. const char *InStart = ThisTokBuf;
  1337. do {
  1338. ++ThisTokBuf;
  1339. } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
  1340. // Copy the character span over.
  1341. if (CopyStringFragment(StringToks[i], ThisTokBegin,
  1342. StringRef(InStart, ThisTokBuf - InStart)))
  1343. hadError = true;
  1344. continue;
  1345. }
  1346. // Is this a Universal Character Name escape?
  1347. if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
  1348. EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
  1349. ResultPtr, hadError,
  1350. FullSourceLoc(StringToks[i].getLocation(), SM),
  1351. CharByteWidth, Diags, Features);
  1352. continue;
  1353. }
  1354. // Otherwise, this is a non-UCN escape character. Process it.
  1355. unsigned ResultChar =
  1356. ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
  1357. FullSourceLoc(StringToks[i].getLocation(), SM),
  1358. CharByteWidth*8, Diags, Features);
  1359. if (CharByteWidth == 4) {
  1360. // FIXME: Make the type of the result buffer correct instead of
  1361. // using reinterpret_cast.
  1362. UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
  1363. *ResultWidePtr = ResultChar;
  1364. ResultPtr += 4;
  1365. } else if (CharByteWidth == 2) {
  1366. // FIXME: Make the type of the result buffer correct instead of
  1367. // using reinterpret_cast.
  1368. UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
  1369. *ResultWidePtr = ResultChar & 0xFFFF;
  1370. ResultPtr += 2;
  1371. } else {
  1372. assert(CharByteWidth == 1 && "Unexpected char width");
  1373. *ResultPtr++ = ResultChar & 0xFF;
  1374. }
  1375. }
  1376. }
  1377. }
  1378. if (Pascal) {
  1379. if (CharByteWidth == 4) {
  1380. // FIXME: Make the type of the result buffer correct instead of
  1381. // using reinterpret_cast.
  1382. UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
  1383. ResultWidePtr[0] = GetNumStringChars() - 1;
  1384. } else if (CharByteWidth == 2) {
  1385. // FIXME: Make the type of the result buffer correct instead of
  1386. // using reinterpret_cast.
  1387. UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
  1388. ResultWidePtr[0] = GetNumStringChars() - 1;
  1389. } else {
  1390. assert(CharByteWidth == 1 && "Unexpected char width");
  1391. ResultBuf[0] = GetNumStringChars() - 1;
  1392. }
  1393. // Verify that pascal strings aren't too large.
  1394. if (GetStringLength() > 256) {
  1395. if (Diags)
  1396. Diags->Report(StringToks.front().getLocation(),
  1397. diag::err_pascal_string_too_long)
  1398. << SourceRange(StringToks.front().getLocation(),
  1399. StringToks.back().getLocation());
  1400. hadError = true;
  1401. return;
  1402. }
  1403. } else if (Diags) {
  1404. // Complain if this string literal has too many characters.
  1405. unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
  1406. if (GetNumStringChars() > MaxChars)
  1407. Diags->Report(StringToks.front().getLocation(),
  1408. diag::ext_string_too_long)
  1409. << GetNumStringChars() << MaxChars
  1410. << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
  1411. << SourceRange(StringToks.front().getLocation(),
  1412. StringToks.back().getLocation());
  1413. }
  1414. }
  1415. static const char *resyncUTF8(const char *Err, const char *End) {
  1416. if (Err == End)
  1417. return End;
  1418. End = Err + std::min<unsigned>(getNumBytesForUTF8(*Err), End-Err);
  1419. while (++Err != End && (*Err & 0xC0) == 0x80)
  1420. ;
  1421. return Err;
  1422. }
  1423. /// \brief This function copies from Fragment, which is a sequence of bytes
  1424. /// within Tok's contents (which begin at TokBegin) into ResultPtr.
  1425. /// Performs widening for multi-byte characters.
  1426. bool StringLiteralParser::CopyStringFragment(const Token &Tok,
  1427. const char *TokBegin,
  1428. StringRef Fragment) {
  1429. const UTF8 *ErrorPtrTmp;
  1430. if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
  1431. return false;
  1432. // If we see bad encoding for unprefixed string literals, warn and
  1433. // simply copy the byte values, for compatibility with gcc and older
  1434. // versions of clang.
  1435. bool NoErrorOnBadEncoding = isAscii();
  1436. if (NoErrorOnBadEncoding) {
  1437. memcpy(ResultPtr, Fragment.data(), Fragment.size());
  1438. ResultPtr += Fragment.size();
  1439. }
  1440. if (Diags) {
  1441. const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
  1442. FullSourceLoc SourceLoc(Tok.getLocation(), SM);
  1443. const DiagnosticBuilder &Builder =
  1444. Diag(Diags, Features, SourceLoc, TokBegin,
  1445. ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
  1446. NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
  1447. : diag::err_bad_string_encoding);
  1448. const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
  1449. StringRef NextFragment(NextStart, Fragment.end()-NextStart);
  1450. // Decode into a dummy buffer.
  1451. SmallString<512> Dummy;
  1452. Dummy.reserve(Fragment.size() * CharByteWidth);
  1453. char *Ptr = Dummy.data();
  1454. while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
  1455. const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
  1456. NextStart = resyncUTF8(ErrorPtr, Fragment.end());
  1457. Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
  1458. ErrorPtr, NextStart);
  1459. NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
  1460. }
  1461. }
  1462. return !NoErrorOnBadEncoding;
  1463. }
  1464. void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  1465. hadError = true;
  1466. if (Diags)
  1467. Diags->Report(Loc, diag::err_lexing_string);
  1468. }
  1469. /// getOffsetOfStringByte - This function returns the offset of the
  1470. /// specified byte of the string data represented by Token. This handles
  1471. /// advancing over escape sequences in the string.
  1472. unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
  1473. unsigned ByteNo) const {
  1474. // Get the spelling of the token.
  1475. SmallString<32> SpellingBuffer;
  1476. SpellingBuffer.resize(Tok.getLength());
  1477. bool StringInvalid = false;
  1478. const char *SpellingPtr = &SpellingBuffer[0];
  1479. unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
  1480. &StringInvalid);
  1481. if (StringInvalid)
  1482. return 0;
  1483. const char *SpellingStart = SpellingPtr;
  1484. const char *SpellingEnd = SpellingPtr+TokLen;
  1485. // Handle UTF-8 strings just like narrow strings.
  1486. if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
  1487. SpellingPtr += 2;
  1488. assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
  1489. SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
  1490. // For raw string literals, this is easy.
  1491. if (SpellingPtr[0] == 'R') {
  1492. assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
  1493. // Skip 'R"'.
  1494. SpellingPtr += 2;
  1495. while (*SpellingPtr != '(') {
  1496. ++SpellingPtr;
  1497. assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
  1498. }
  1499. // Skip '('.
  1500. ++SpellingPtr;
  1501. return SpellingPtr - SpellingStart + ByteNo;
  1502. }
  1503. // Skip over the leading quote
  1504. assert(SpellingPtr[0] == '"' && "Should be a string literal!");
  1505. ++SpellingPtr;
  1506. // Skip over bytes until we find the offset we're looking for.
  1507. while (ByteNo) {
  1508. assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
  1509. // Step over non-escapes simply.
  1510. if (*SpellingPtr != '\\') {
  1511. ++SpellingPtr;
  1512. --ByteNo;
  1513. continue;
  1514. }
  1515. // Otherwise, this is an escape character. Advance over it.
  1516. bool HadError = false;
  1517. if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
  1518. const char *EscapePtr = SpellingPtr;
  1519. unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
  1520. 1, Features, HadError);
  1521. if (Len > ByteNo) {
  1522. // ByteNo is somewhere within the escape sequence.
  1523. SpellingPtr = EscapePtr;
  1524. break;
  1525. }
  1526. ByteNo -= Len;
  1527. } else {
  1528. ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
  1529. FullSourceLoc(Tok.getLocation(), SM),
  1530. CharByteWidth*8, Diags, Features);
  1531. --ByteNo;
  1532. }
  1533. assert(!HadError && "This method isn't valid on erroneous strings");
  1534. }
  1535. return SpellingPtr-SpellingStart;
  1536. }