2
0

tokenizer.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640
  1. //-----------------------------------------------------------------------------
  2. // Copyright (c) 2012 GarageGames, LLC
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to
  6. // deal in the Software without restriction, including without limitation the
  7. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. // sell copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. // IN THE SOFTWARE.
  21. //-----------------------------------------------------------------------------
  22. #include "core/tokenizer.h"
  23. #include "platform/platform.h"
  24. #include "core/stream/fileStream.h"
  25. #include "core/strings/stringFunctions.h"
  26. #include "core/util/safeDelete.h"
  27. Tokenizer::Tokenizer()
  28. {
  29. dMemset(mFileName, 0, sizeof(mFileName));
  30. mpBuffer = NULL;
  31. mBufferSize = 0;
  32. mStartPos = 0;
  33. mCurrPos = 0;
  34. mTokenIsQuoted = false;
  35. dMemset(mCurrTokenBuffer, 0, sizeof(mCurrTokenBuffer));
  36. mTokenIsCurrent = false;
  37. mSingleTokens = NULL;
  38. VECTOR_SET_ASSOCIATION(mLinePositions);
  39. }
  40. Tokenizer::~Tokenizer()
  41. {
  42. clear();
  43. }
  44. bool Tokenizer::openFile(const char* pFileName)
  45. {
  46. AssertFatal(mFileName[0] == '\0', "Reuse of Tokenizers not allowed!");
  47. FileStream* pStream = new FileStream;
  48. if (pStream->open(pFileName, Torque::FS::File::Read) == false)
  49. {
  50. delete pStream;
  51. return false;
  52. }
  53. dStrcpy(mFileName, pFileName, 1024);
  54. mBufferSize = pStream->getStreamSize();
  55. mpBuffer = new char[mBufferSize];
  56. pStream->read(mBufferSize, mpBuffer);
  57. pStream->close();
  58. delete pStream;
  59. reset();
  60. buildLinePositions();
  61. return true;
  62. }
  63. bool Tokenizer::openFile(Stream* pStream)
  64. {
  65. mBufferSize = pStream->getStreamSize();
  66. mpBuffer = new char[mBufferSize];
  67. pStream->read(mBufferSize, mpBuffer);
  68. reset();
  69. buildLinePositions();
  70. return true;
  71. }
  72. void Tokenizer::setBuffer(const char* buffer, U32 bufferSize)
  73. {
  74. if (mpBuffer)
  75. {
  76. SAFE_DELETE_ARRAY(mpBuffer);
  77. mBufferSize = 0;
  78. }
  79. mBufferSize = bufferSize;
  80. mpBuffer = new char[mBufferSize + 1];
  81. dStrcpy(mpBuffer, buffer, mBufferSize + 1);
  82. reset();
  83. buildLinePositions();
  84. }
  85. void Tokenizer::setSingleTokens(const char* singleTokens)
  86. {
  87. if (mSingleTokens)
  88. {
  89. free(mSingleTokens);
  90. mSingleTokens = NULL;
  91. }
  92. if (singleTokens)
  93. mSingleTokens = dStrdup(singleTokens);
  94. }
  95. bool Tokenizer::reset()
  96. {
  97. mStartPos = 0;
  98. mCurrPos = 0;
  99. mTokenIsQuoted = false;
  100. dMemset(mCurrTokenBuffer, 0, sizeof(mCurrTokenBuffer));
  101. mTokenIsCurrent = false;
  102. return true;
  103. }
  104. bool Tokenizer::clear()
  105. {
  106. // Delete our buffer
  107. if (mpBuffer)
  108. SAFE_DELETE_ARRAY(mpBuffer);
  109. // Reset the buffer size
  110. mBufferSize = 0;
  111. // Reset our active data
  112. reset();
  113. // Clear our line positions
  114. mLinePositions.clear();
  115. // Reset our file name
  116. dMemset(mFileName, 0, 1024);
  117. // Wipe the single tokens
  118. setSingleTokens(NULL);
  119. return true;
  120. }
  121. bool Tokenizer::setCurrentPos(U32 pos)
  122. {
  123. mCurrPos = pos;
  124. mTokenIsCurrent = false;
  125. return advanceToken(true);
  126. }
  127. void Tokenizer::buildLinePositions()
  128. {
  129. if (mBufferSize == 0)
  130. return;
  131. // We can safely assume that the first line is at position 0
  132. mLinePositions.push_back(0);
  133. U32 currPos = 0;
  134. while (currPos + 1 < mBufferSize)
  135. {
  136. // Windows line ending
  137. if (mpBuffer[currPos] == '\r' && mpBuffer[currPos + 1] == '\n')
  138. {
  139. currPos += 2;
  140. mLinePositions.push_back(currPos);
  141. }
  142. // Not sure if this ever happens but just in case
  143. else if (mpBuffer[currPos] == '\n' && mpBuffer[currPos + 1] == '\r')
  144. {
  145. currPos += 2;
  146. mLinePositions.push_back(currPos);
  147. }
  148. // Unix line endings should only have a single line break character
  149. else if (mpBuffer[currPos] == '\n' || mpBuffer[currPos] == '\r')
  150. {
  151. currPos++;
  152. mLinePositions.push_back(currPos);
  153. }
  154. else
  155. currPos++;
  156. }
  157. }
  158. U32 Tokenizer::getLinePosition(const U32 pos, U32 lowIndex, S32 highIndex)
  159. {
  160. // If we have one or less lines then
  161. // the result is easy
  162. if (mLinePositions.size() <= 1)
  163. return 0;
  164. // Now that we know we have at least one position
  165. // we can do a quick test against the last line
  166. if (pos >= mLinePositions.last())
  167. return mLinePositions.size() - 1;
  168. // If this is the beginning of the search
  169. // set a good starting point (the middle)
  170. if (highIndex < 0)
  171. highIndex = mLinePositions.size() - 1;
  172. // Just in case bad values got handed in
  173. if (lowIndex > highIndex)
  174. lowIndex = highIndex;
  175. // Compute our test index (middle)
  176. U32 testIndex = (lowIndex + highIndex) / 2;
  177. // Make sure that our test indices are valid
  178. if (testIndex >= mLinePositions.size() ||
  179. testIndex + 1 >= mLinePositions.size())
  180. return mLinePositions.size() - 1;
  181. // See if we are already at the right line
  182. if (pos >= mLinePositions[testIndex] && pos < mLinePositions[testIndex + 1])
  183. return testIndex;
  184. if (pos < mLinePositions[testIndex])
  185. highIndex = testIndex;
  186. else
  187. lowIndex = testIndex;
  188. return getLinePosition(pos, lowIndex, highIndex);
  189. }
  190. U32 Tokenizer::getCurrentLine()
  191. {
  192. // Binary search for the line number whose
  193. // position is equal to or lower than the
  194. // current position
  195. return getLinePosition(mStartPos);
  196. }
  197. U32 Tokenizer::getTokenLineOffset()
  198. {
  199. U32 lineNumber = getCurrentLine();
  200. if (lineNumber >= mLinePositions.size())
  201. return 0;
  202. U32 linePosition = mLinePositions[lineNumber];
  203. if (linePosition >= mStartPos)
  204. return 0;
  205. return mStartPos - linePosition;
  206. }
  207. bool Tokenizer::advanceToken(const bool crossLine, const bool assertAvail)
  208. {
  209. if (mTokenIsCurrent == true)
  210. {
  211. AssertFatal(mCurrTokenBuffer[0] != '\0', "No token, but marked as current?");
  212. mTokenIsCurrent = false;
  213. return true;
  214. }
  215. U32 currPosition = 0;
  216. mCurrTokenBuffer[0] = '\0';
  217. mTokenIsQuoted = false;
  218. // Store the beginning of the previous advance
  219. // and the beginning of the current advance
  220. mStartPos = mCurrPos;
  221. while (mCurrPos < mBufferSize)
  222. {
  223. char c = mpBuffer[mCurrPos];
  224. bool cont = true;
  225. if (mSingleTokens && dStrchr(mSingleTokens, c))
  226. {
  227. if (currPosition == 0)
  228. {
  229. mCurrTokenBuffer[currPosition++] = c;
  230. mCurrPos++;
  231. cont = false;
  232. break;
  233. }
  234. else
  235. {
  236. // End of token
  237. cont = false;
  238. }
  239. }
  240. else
  241. {
  242. switch (c)
  243. {
  244. case ' ':
  245. case '\t':
  246. if (currPosition == 0)
  247. {
  248. // Token hasn't started yet...
  249. mCurrPos++;
  250. }
  251. else
  252. {
  253. // End of token
  254. mCurrPos++;
  255. cont = false;
  256. }
  257. break;
  258. case '\r':
  259. case '\n':
  260. if (crossLine == true)
  261. {
  262. // Windows line ending
  263. if (mpBuffer[mCurrPos] == '\r' && mpBuffer[mCurrPos + 1] == '\n')
  264. mCurrPos += 2;
  265. // Not sure if this ever happens but just in case
  266. else if (mpBuffer[mCurrPos] == '\n' && mpBuffer[mCurrPos + 1] == '\r')
  267. mCurrPos += 2;
  268. // Unix line endings should only have a single line break character
  269. else
  270. mCurrPos++;
  271. }
  272. else
  273. {
  274. cont = false;
  275. break;
  276. }
  277. break;
  278. default:
  279. if (c == '\"' || c == '\'')
  280. {
  281. // Quoted token
  282. U32 startLine = getCurrentLine();
  283. mCurrPos++;
  284. // Store the beginning of the token
  285. mStartPos = mCurrPos;
  286. while (mpBuffer[mCurrPos] != c)
  287. {
  288. AssertISV(mCurrPos < mBufferSize,
  289. avar("End of file before quote closed. Quote started: (%s: %d)",
  290. getFileName(), startLine));
  291. AssertISV((mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'),
  292. avar("End of line reached before end of quote. Quote started: (%s: %d)",
  293. getFileName(), startLine));
  294. mCurrTokenBuffer[currPosition++] = mpBuffer[mCurrPos++];
  295. }
  296. mTokenIsQuoted = true;
  297. mCurrPos++;
  298. cont = false;
  299. }
  300. else if (c == '/' && mpBuffer[mCurrPos+1] == '/')
  301. {
  302. // Line quote...
  303. if (currPosition == 0)
  304. {
  305. // continue to end of line, then let crossLine determine on the next pass
  306. while (mCurrPos < mBufferSize && (mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'))
  307. mCurrPos++;
  308. }
  309. else
  310. {
  311. // This is the end of the token. Continue to EOL
  312. while (mCurrPos < mBufferSize && (mpBuffer[mCurrPos] != '\n' && mpBuffer[mCurrPos] != '\r'))
  313. mCurrPos++;
  314. cont = false;
  315. }
  316. }
  317. else if (c == '/' && mpBuffer[mCurrPos+1] == '*')
  318. {
  319. // Block quote...
  320. if (currPosition == 0)
  321. {
  322. // continue to end of block, then let crossLine determine on the next pass
  323. while (mCurrPos < mBufferSize - 1 && (mpBuffer[mCurrPos] != '*' || mpBuffer[mCurrPos + 1] != '/'))
  324. mCurrPos++;
  325. if (mCurrPos < mBufferSize - 1)
  326. mCurrPos += 2;
  327. }
  328. else
  329. {
  330. // This is the end of the token. Continue to EOL
  331. while (mCurrPos < mBufferSize - 1 && (mpBuffer[mCurrPos] != '*' || mpBuffer[mCurrPos + 1] != '/'))
  332. mCurrPos++;
  333. if (mCurrPos < mBufferSize - 1)
  334. mCurrPos += 2;
  335. cont = false;
  336. }
  337. }
  338. else
  339. {
  340. // If this is the first non-token character then store the
  341. // beginning of the token
  342. if (currPosition == 0)
  343. mStartPos = mCurrPos;
  344. mCurrTokenBuffer[currPosition++] = c;
  345. mCurrPos++;
  346. }
  347. break;
  348. }
  349. }
  350. if (cont == false)
  351. break;
  352. }
  353. mCurrTokenBuffer[currPosition] = '\0';
  354. if (assertAvail == true)
  355. AssertISV(currPosition != 0, avar("Error parsing: %s at or around line: %d", getFileName(), getCurrentLine()));
  356. if (mCurrPos == mBufferSize)
  357. return false;
  358. return true;
  359. }
  360. bool Tokenizer::regressToken(const bool crossLine)
  361. {
  362. if (mTokenIsCurrent == true)
  363. {
  364. AssertFatal(mCurrTokenBuffer[0] != '\0', "No token, but marked as current?");
  365. mTokenIsCurrent = false;
  366. return true;
  367. }
  368. U32 currPosition = 0;
  369. mCurrTokenBuffer[0] = '\0';
  370. mTokenIsQuoted = false;
  371. // Store the beginning of the previous advance
  372. // and the beginning of the current advance
  373. mCurrPos = mStartPos;
  374. // Back up to the first character of the previous token
  375. mStartPos--;
  376. while (mStartPos > 0)
  377. {
  378. char c = mpBuffer[mStartPos];
  379. bool cont = true;
  380. if (mSingleTokens && dStrchr(mSingleTokens, c))
  381. {
  382. if (currPosition == 0)
  383. {
  384. mCurrTokenBuffer[currPosition++] = c;
  385. mStartPos--;
  386. cont = false;
  387. break;
  388. }
  389. else
  390. {
  391. // End of token
  392. cont = false;
  393. }
  394. }
  395. else
  396. {
  397. switch (c)
  398. {
  399. case ' ':
  400. case '\t':
  401. if (currPosition == 0)
  402. {
  403. // Token hasn't started yet...
  404. mStartPos--;
  405. }
  406. else
  407. {
  408. // End of token
  409. mStartPos--;
  410. cont = false;
  411. }
  412. break;
  413. case '\r':
  414. case '\n':
  415. if (crossLine == true && currPosition == 0)
  416. {
  417. // Windows line ending
  418. if (mStartPos > 0 && mpBuffer[mStartPos] == '\r' && mpBuffer[mStartPos - 1] == '\n')
  419. mStartPos -= 2;
  420. // Not sure if this ever happens but just in case
  421. else if (mStartPos > 0 && mpBuffer[mStartPos] == '\n' && mpBuffer[mStartPos - 1] == '\r')
  422. mStartPos -= 2;
  423. // Unix line endings should only have a single line break character
  424. else
  425. mStartPos--;
  426. }
  427. else
  428. {
  429. cont = false;
  430. break;
  431. }
  432. break;
  433. default:
  434. if (c == '\"' || c == '\'')
  435. {
  436. // Quoted token
  437. U32 endLine = getCurrentLine();
  438. mStartPos--;
  439. while (mpBuffer[mStartPos] != c)
  440. {
  441. AssertISV(mStartPos < 0,
  442. avar("Beginning of file reached before finding begin quote. Quote ended: (%s: %d)",
  443. getFileName(), endLine));
  444. mCurrTokenBuffer[currPosition++] = mpBuffer[mStartPos--];
  445. }
  446. mTokenIsQuoted = true;
  447. mStartPos--;
  448. cont = false;
  449. }
  450. else if (c == '/' && mStartPos > 0 && mpBuffer[mStartPos - 1] == '/')
  451. {
  452. // Line quote...
  453. // Clear out anything saved already
  454. currPosition = 0;
  455. mStartPos -= 2;
  456. }
  457. else
  458. {
  459. mCurrTokenBuffer[currPosition++] = c;
  460. mStartPos--;
  461. }
  462. break;
  463. }
  464. }
  465. if (cont == false)
  466. break;
  467. }
  468. mCurrTokenBuffer[currPosition] = '\0';
  469. // Reveres the token
  470. for (U32 i = 0; i < currPosition / 2; i++)
  471. {
  472. char c = mCurrTokenBuffer[i];
  473. mCurrTokenBuffer[i] = mCurrTokenBuffer[currPosition - i - 1];
  474. mCurrTokenBuffer[currPosition - i - 1] = c;
  475. }
  476. mStartPos++;
  477. if (mStartPos == mCurrPos)
  478. return false;
  479. return true;
  480. }
  481. bool Tokenizer::tokenAvailable()
  482. {
  483. // Note: this implies that when advanceToken(false) fails, it must cap the
  484. // token buffer.
  485. //
  486. return mCurrTokenBuffer[0] != '\0';
  487. }
  488. const char* Tokenizer::getToken() const
  489. {
  490. return mCurrTokenBuffer;
  491. }
  492. const char* Tokenizer::getNextToken()
  493. {
  494. advanceToken(true);
  495. return getToken();
  496. }
  497. bool Tokenizer::tokenICmp(const char* pCmp) const
  498. {
  499. return dStricmp(mCurrTokenBuffer, pCmp) == 0;
  500. }
  501. bool Tokenizer::findToken(U32 start, const char* pCmp)
  502. {
  503. // Move to the start
  504. setCurrentPos(start);
  505. // In case the first token is what we are looking for
  506. if (tokenICmp(pCmp))
  507. return true;
  508. // Loop through the file and see if the token exists
  509. while (advanceToken(true))
  510. {
  511. if (tokenICmp(pCmp))
  512. return true;
  513. }
  514. return false;
  515. }
  516. bool Tokenizer::findToken(const char* pCmp)
  517. {
  518. return findToken(0, pCmp);
  519. }
  520. bool Tokenizer::endOfFile()
  521. {
  522. if (mCurrPos < mBufferSize)
  523. return false;
  524. else
  525. return true;
  526. }