TGLexer.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // Implement the Lexer for TableGen.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "TGLexer.h"
  14. #include "llvm/ADT/StringSwitch.h"
  15. #include "llvm/ADT/Twine.h"
  16. #include "llvm/Config/config.h" // for strtoull()/strtoll() define
  17. #include "llvm/Support/MemoryBuffer.h"
  18. #include "llvm/Support/SourceMgr.h"
  19. #include "llvm/TableGen/Error.h"
  20. #include <cctype>
  21. #include <cerrno>
  22. #include <cstdio>
  23. #include <cstdlib>
  24. #include <cstring>
  25. using namespace llvm;
  26. TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
  27. CurBuffer = SrcMgr.getMainFileID();
  28. CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  29. CurPtr = CurBuf.begin();
  30. TokStart = nullptr;
  31. }
  32. SMLoc TGLexer::getLoc() const {
  33. return SMLoc::getFromPointer(TokStart);
  34. }
  35. /// ReturnError - Set the error to the specified string at the specified
  36. /// location. This is defined to always return tgtok::Error.
  37. tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
  38. PrintError(Loc, Msg);
  39. return tgtok::Error;
  40. }
  41. int TGLexer::getNextChar() {
  42. char CurChar = *CurPtr++;
  43. switch (CurChar) {
  44. default:
  45. return (unsigned char)CurChar;
  46. case 0: {
  47. // A nul character in the stream is either the end of the current buffer or
  48. // a random nul in the file. Disambiguate that here.
  49. if (CurPtr-1 != CurBuf.end())
  50. return 0; // Just whitespace.
  51. // If this is the end of an included file, pop the parent file off the
  52. // include stack.
  53. SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
  54. if (ParentIncludeLoc != SMLoc()) {
  55. CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
  56. CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  57. CurPtr = ParentIncludeLoc.getPointer();
  58. return getNextChar();
  59. }
  60. // Otherwise, return end of file.
  61. --CurPtr; // Another call to lex will return EOF again.
  62. return EOF;
  63. }
  64. case '\n':
  65. case '\r':
  66. // Handle the newline character by ignoring it and incrementing the line
  67. // count. However, be careful about 'dos style' files with \n\r in them.
  68. // Only treat a \n\r or \r\n as a single line.
  69. if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
  70. *CurPtr != CurChar)
  71. ++CurPtr; // Eat the two char newline sequence.
  72. return '\n';
  73. }
  74. }
  75. int TGLexer::peekNextChar(int Index) {
  76. return *(CurPtr + Index);
  77. }
  78. tgtok::TokKind TGLexer::LexToken() {
  79. TokStart = CurPtr;
  80. // This always consumes at least one character.
  81. int CurChar = getNextChar();
  82. switch (CurChar) {
  83. default:
  84. // Handle letters: [a-zA-Z_]
  85. if (isalpha(CurChar) || CurChar == '_')
  86. return LexIdentifier();
  87. // Unknown character, emit an error.
  88. return ReturnError(TokStart, "Unexpected character");
  89. case EOF: return tgtok::Eof;
  90. case ':': return tgtok::colon;
  91. case ';': return tgtok::semi;
  92. case '.': return tgtok::period;
  93. case ',': return tgtok::comma;
  94. case '<': return tgtok::less;
  95. case '>': return tgtok::greater;
  96. case ']': return tgtok::r_square;
  97. case '{': return tgtok::l_brace;
  98. case '}': return tgtok::r_brace;
  99. case '(': return tgtok::l_paren;
  100. case ')': return tgtok::r_paren;
  101. case '=': return tgtok::equal;
  102. case '?': return tgtok::question;
  103. case '#': return tgtok::paste;
  104. case 0:
  105. case ' ':
  106. case '\t':
  107. case '\n':
  108. case '\r':
  109. // Ignore whitespace.
  110. return LexToken();
  111. case '/':
  112. // If this is the start of a // comment, skip until the end of the line or
  113. // the end of the buffer.
  114. if (*CurPtr == '/')
  115. SkipBCPLComment();
  116. else if (*CurPtr == '*') {
  117. if (SkipCComment())
  118. return tgtok::Error;
  119. } else // Otherwise, this is an error.
  120. return ReturnError(TokStart, "Unexpected character");
  121. return LexToken();
  122. case '-': case '+':
  123. case '0': case '1': case '2': case '3': case '4': case '5': case '6':
  124. case '7': case '8': case '9': {
  125. int NextChar = 0;
  126. if (isdigit(CurChar)) {
  127. // Allow identifiers to start with a number if it is followed by
  128. // an identifier. This can happen with paste operations like
  129. // foo#8i.
  130. int i = 0;
  131. do {
  132. NextChar = peekNextChar(i++);
  133. } while (isdigit(NextChar));
  134. if (NextChar == 'x' || NextChar == 'b') {
  135. // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
  136. // likely a number.
  137. int NextNextChar = peekNextChar(i);
  138. switch (NextNextChar) {
  139. default:
  140. break;
  141. case '0': case '1':
  142. if (NextChar == 'b')
  143. return LexNumber();
  144. // Fallthrough
  145. case '2': case '3': case '4': case '5':
  146. case '6': case '7': case '8': case '9':
  147. case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  148. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
  149. if (NextChar == 'x')
  150. return LexNumber();
  151. break;
  152. }
  153. }
  154. }
  155. if (isalpha(NextChar) || NextChar == '_')
  156. return LexIdentifier();
  157. return LexNumber();
  158. }
  159. case '"': return LexString();
  160. case '$': return LexVarName();
  161. case '[': return LexBracket();
  162. case '!': return LexExclaim();
  163. }
  164. }
  165. /// LexString - Lex "[^"]*"
  166. tgtok::TokKind TGLexer::LexString() {
  167. const char *StrStart = CurPtr;
  168. CurStrVal = "";
  169. while (*CurPtr != '"') {
  170. // If we hit the end of the buffer, report an error.
  171. if (*CurPtr == 0 && CurPtr == CurBuf.end())
  172. return ReturnError(StrStart, "End of file in string literal");
  173. if (*CurPtr == '\n' || *CurPtr == '\r')
  174. return ReturnError(StrStart, "End of line in string literal");
  175. if (*CurPtr != '\\') {
  176. CurStrVal += *CurPtr++;
  177. continue;
  178. }
  179. ++CurPtr;
  180. switch (*CurPtr) {
  181. case '\\': case '\'': case '"':
  182. // These turn into their literal character.
  183. CurStrVal += *CurPtr++;
  184. break;
  185. case 't':
  186. CurStrVal += '\t';
  187. ++CurPtr;
  188. break;
  189. case 'n':
  190. CurStrVal += '\n';
  191. ++CurPtr;
  192. break;
  193. case '\n':
  194. case '\r':
  195. return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
  196. // If we hit the end of the buffer, report an error.
  197. case '\0':
  198. if (CurPtr == CurBuf.end())
  199. return ReturnError(StrStart, "End of file in string literal");
  200. // FALL THROUGH
  201. default:
  202. return ReturnError(CurPtr, "invalid escape in string literal");
  203. }
  204. }
  205. ++CurPtr;
  206. return tgtok::StrVal;
  207. }
  208. tgtok::TokKind TGLexer::LexVarName() {
  209. if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
  210. return ReturnError(TokStart, "Invalid variable name");
  211. // Otherwise, we're ok, consume the rest of the characters.
  212. const char *VarNameStart = CurPtr++;
  213. while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
  214. ++CurPtr;
  215. CurStrVal.assign(VarNameStart, CurPtr);
  216. return tgtok::VarName;
  217. }
  218. tgtok::TokKind TGLexer::LexIdentifier() {
  219. // The first letter is [a-zA-Z_#].
  220. const char *IdentStart = TokStart;
  221. // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
  222. while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
  223. ++CurPtr;
  224. // Check to see if this identifier is a keyword.
  225. StringRef Str(IdentStart, CurPtr-IdentStart);
  226. if (Str == "include") {
  227. if (LexInclude()) return tgtok::Error;
  228. return Lex();
  229. }
  230. tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
  231. .Case("int", tgtok::Int)
  232. .Case("bit", tgtok::Bit)
  233. .Case("bits", tgtok::Bits)
  234. .Case("string", tgtok::String)
  235. .Case("list", tgtok::List)
  236. .Case("code", tgtok::Code)
  237. .Case("dag", tgtok::Dag)
  238. .Case("class", tgtok::Class)
  239. .Case("def", tgtok::Def)
  240. .Case("foreach", tgtok::Foreach)
  241. .Case("defm", tgtok::Defm)
  242. .Case("multiclass", tgtok::MultiClass)
  243. .Case("field", tgtok::Field)
  244. .Case("let", tgtok::Let)
  245. .Case("in", tgtok::In)
  246. .Default(tgtok::Id);
  247. if (Kind == tgtok::Id)
  248. CurStrVal.assign(Str.begin(), Str.end());
  249. return Kind;
  250. }
  251. /// LexInclude - We just read the "include" token. Get the string token that
  252. /// comes next and enter the include.
  253. bool TGLexer::LexInclude() {
  254. // The token after the include must be a string.
  255. tgtok::TokKind Tok = LexToken();
  256. if (Tok == tgtok::Error) return true;
  257. if (Tok != tgtok::StrVal) {
  258. PrintError(getLoc(), "Expected filename after include");
  259. return true;
  260. }
  261. // Get the string.
  262. std::string Filename = CurStrVal;
  263. std::string IncludedFile;
  264. CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
  265. IncludedFile);
  266. if (!CurBuffer) {
  267. PrintError(getLoc(), "Could not find include file '" + Filename + "'");
  268. return true;
  269. }
  270. DependenciesMapTy::const_iterator Found = Dependencies.find(IncludedFile);
  271. if (Found != Dependencies.end()) {
  272. PrintError(getLoc(),
  273. "File '" + IncludedFile + "' has already been included.");
  274. SrcMgr.PrintMessage(Found->second, SourceMgr::DK_Note,
  275. "previously included here");
  276. return true;
  277. }
  278. Dependencies.insert(std::make_pair(IncludedFile, getLoc()));
  279. // Save the line number and lex buffer of the includer.
  280. CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
  281. CurPtr = CurBuf.begin();
  282. return false;
  283. }
  284. void TGLexer::SkipBCPLComment() {
  285. ++CurPtr; // skip the second slash.
  286. while (1) {
  287. switch (*CurPtr) {
  288. case '\n':
  289. case '\r':
  290. return; // Newline is end of comment.
  291. case 0:
  292. // If this is the end of the buffer, end the comment.
  293. if (CurPtr == CurBuf.end())
  294. return;
  295. break;
  296. }
  297. // Otherwise, skip the character.
  298. ++CurPtr;
  299. }
  300. }
  301. /// SkipCComment - This skips C-style /**/ comments. The only difference from C
  302. /// is that we allow nesting.
  303. bool TGLexer::SkipCComment() {
  304. ++CurPtr; // skip the star.
  305. unsigned CommentDepth = 1;
  306. while (1) {
  307. int CurChar = getNextChar();
  308. switch (CurChar) {
  309. case EOF:
  310. PrintError(TokStart, "Unterminated comment!");
  311. return true;
  312. case '*':
  313. // End of the comment?
  314. if (CurPtr[0] != '/') break;
  315. ++CurPtr; // End the */.
  316. if (--CommentDepth == 0)
  317. return false;
  318. break;
  319. case '/':
  320. // Start of a nested comment?
  321. if (CurPtr[0] != '*') break;
  322. ++CurPtr;
  323. ++CommentDepth;
  324. break;
  325. }
  326. }
  327. }
  328. /// LexNumber - Lex:
  329. /// [-+]?[0-9]+
  330. /// 0x[0-9a-fA-F]+
  331. /// 0b[01]+
  332. tgtok::TokKind TGLexer::LexNumber() {
  333. if (CurPtr[-1] == '0') {
  334. if (CurPtr[0] == 'x') {
  335. ++CurPtr;
  336. const char *NumStart = CurPtr;
  337. while (isxdigit(CurPtr[0]))
  338. ++CurPtr;
  339. // Requires at least one hex digit.
  340. if (CurPtr == NumStart)
  341. return ReturnError(TokStart, "Invalid hexadecimal number");
  342. errno = 0;
  343. CurIntVal = strtoll(NumStart, nullptr, 16);
  344. if (errno == EINVAL)
  345. return ReturnError(TokStart, "Invalid hexadecimal number");
  346. if (errno == ERANGE) {
  347. errno = 0;
  348. CurIntVal = (int64_t)strtoull(NumStart, nullptr, 16);
  349. if (errno == EINVAL)
  350. return ReturnError(TokStart, "Invalid hexadecimal number");
  351. if (errno == ERANGE)
  352. return ReturnError(TokStart, "Hexadecimal number out of range");
  353. }
  354. return tgtok::IntVal;
  355. } else if (CurPtr[0] == 'b') {
  356. ++CurPtr;
  357. const char *NumStart = CurPtr;
  358. while (CurPtr[0] == '0' || CurPtr[0] == '1')
  359. ++CurPtr;
  360. // Requires at least one binary digit.
  361. if (CurPtr == NumStart)
  362. return ReturnError(CurPtr-2, "Invalid binary number");
  363. CurIntVal = strtoll(NumStart, nullptr, 2);
  364. return tgtok::BinaryIntVal;
  365. }
  366. }
  367. // Check for a sign without a digit.
  368. if (!isdigit(CurPtr[0])) {
  369. if (CurPtr[-1] == '-')
  370. return tgtok::minus;
  371. else if (CurPtr[-1] == '+')
  372. return tgtok::plus;
  373. }
  374. while (isdigit(CurPtr[0]))
  375. ++CurPtr;
  376. CurIntVal = strtoll(TokStart, nullptr, 10);
  377. return tgtok::IntVal;
  378. }
  379. /// LexBracket - We just read '['. If this is a code block, return it,
  380. /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
  381. tgtok::TokKind TGLexer::LexBracket() {
  382. if (CurPtr[0] != '{')
  383. return tgtok::l_square;
  384. ++CurPtr;
  385. const char *CodeStart = CurPtr;
  386. while (1) {
  387. int Char = getNextChar();
  388. if (Char == EOF) break;
  389. if (Char != '}') continue;
  390. Char = getNextChar();
  391. if (Char == EOF) break;
  392. if (Char == ']') {
  393. CurStrVal.assign(CodeStart, CurPtr-2);
  394. return tgtok::CodeFragment;
  395. }
  396. }
  397. return ReturnError(CodeStart-2, "Unterminated Code Block");
  398. }
  399. /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
  400. tgtok::TokKind TGLexer::LexExclaim() {
  401. if (!isalpha(*CurPtr))
  402. return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
  403. const char *Start = CurPtr++;
  404. while (isalpha(*CurPtr))
  405. ++CurPtr;
  406. // Check to see which operator this is.
  407. tgtok::TokKind Kind =
  408. StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
  409. .Case("eq", tgtok::XEq)
  410. .Case("if", tgtok::XIf)
  411. .Case("head", tgtok::XHead)
  412. .Case("tail", tgtok::XTail)
  413. .Case("con", tgtok::XConcat)
  414. .Case("add", tgtok::XADD)
  415. .Case("and", tgtok::XAND)
  416. .Case("shl", tgtok::XSHL)
  417. .Case("sra", tgtok::XSRA)
  418. .Case("srl", tgtok::XSRL)
  419. .Case("cast", tgtok::XCast)
  420. .Case("empty", tgtok::XEmpty)
  421. .Case("subst", tgtok::XSubst)
  422. .Case("foreach", tgtok::XForEach)
  423. .Case("listconcat", tgtok::XListConcat)
  424. .Case("strconcat", tgtok::XStrConcat)
  425. .Default(tgtok::Error);
  426. return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
  427. }