// CXMLReaderImpl.h
  1. // Copyright (C) 2002-2005 Nikolaus Gebhardt
  2. // This file is part of the "Irrlicht Engine" and the "irrXML" project.
  3. // For conditions of distribution and use, see copyright notice in irrlicht.h and/or irrXML.h
  4. #ifndef __ICXML_READER_IMPL_H_INCLUDED__
  5. #define __ICXML_READER_IMPL_H_INCLUDED__
  6. #include "irrXML.h"
  7. #include "irrString.h"
  8. #include "irrArray.h"
  9. using namespace Assimp;
  10. #ifdef _DEBUG
  11. #define IRR_DEBUGPRINT(x) printf((x));
  12. #else // _DEBUG
  13. #define IRR_DEBUGPRINT(x)
  14. #endif // _DEBUG
  15. namespace irr
  16. {
  17. namespace io
  18. {
  19. //! implementation of the IrrXMLReader
  20. template<class char_type, class superclass>
  21. class CXMLReaderImpl : public IIrrXMLReader<char_type, superclass>
  22. {
  23. public:
  24. //! Constructor
  25. CXMLReaderImpl(IFileReadCallBack* callback, bool deleteCallBack = true)
  26. : TextData(0), P(0), TextBegin(0), TextSize(0), CurrentNodeType(EXN_NONE),
  27. SourceFormat(ETF_ASCII), TargetFormat(ETF_ASCII)
  28. {
  29. if (!callback)
  30. return;
  31. storeTargetFormat();
  32. // read whole xml file
  33. readFile(callback);
  34. // clean up
  35. if (deleteCallBack)
  36. delete callback;
  37. // create list with special characters
  38. createSpecialCharacterList();
  39. // set pointer to text begin
  40. P = TextBegin;
  41. }
  42. //! Destructor
  43. virtual ~CXMLReaderImpl()
  44. {
  45. delete [] TextData;
  46. }
  47. //! Reads forward to the next xml node.
  48. //! \return Returns false, if there was no further node.
  49. virtual bool read()
  50. {
  51. // if not end reached, parse the node
  52. if (P && (unsigned int)(P - TextBegin) < TextSize - 1 && *P != 0)
  53. {
  54. parseCurrentNode();
  55. return true;
  56. }
  57. _IRR_IMPLEMENT_MANAGED_MARSHALLING_BUGFIX;
  58. return false;
  59. }
  60. //! Returns the type of the current XML node.
  61. virtual EXML_NODE getNodeType() const
  62. {
  63. return CurrentNodeType;
  64. }
  65. //! Returns attribute count of the current XML node.
  66. virtual int getAttributeCount() const
  67. {
  68. return Attributes.size();
  69. }
  70. //! Returns name of an attribute.
  71. virtual const char_type* getAttributeName(int idx) const
  72. {
  73. if (idx < 0 || idx >= (int)Attributes.size())
  74. return 0;
  75. return Attributes[idx].Name.c_str();
  76. }
  77. //! Returns the value of an attribute.
  78. virtual const char_type* getAttributeValue(int idx) const
  79. {
  80. if (idx < 0 || idx >= (int)Attributes.size())
  81. return 0;
  82. return Attributes[idx].Value.c_str();
  83. }
  84. //! Returns the value of an attribute.
  85. virtual const char_type* getAttributeValue(const char_type* name) const
  86. {
  87. const SAttribute* attr = getAttributeByName(name);
  88. if (!attr)
  89. return 0;
  90. return attr->Value.c_str();
  91. }
  92. //! Returns the value of an attribute
  93. virtual const char_type* getAttributeValueSafe(const char_type* name) const
  94. {
  95. const SAttribute* attr = getAttributeByName(name);
  96. if (!attr)
  97. return EmptyString.c_str();
  98. return attr->Value.c_str();
  99. }
  100. //! Returns the value of an attribute as integer.
  101. int getAttributeValueAsInt(const char_type* name) const
  102. {
  103. return (int)getAttributeValueAsFloat(name);
  104. }
  105. //! Returns the value of an attribute as integer.
  106. int getAttributeValueAsInt(int idx) const
  107. {
  108. return (int)getAttributeValueAsFloat(idx);
  109. }
  110. //! Returns the value of an attribute as float.
  111. float getAttributeValueAsFloat(const char_type* name) const
  112. {
  113. const SAttribute* attr = getAttributeByName(name);
  114. if (!attr)
  115. return 0;
  116. core::stringc c = attr->Value.c_str();
  117. return fast_atof(c.c_str());
  118. }
  119. //! Returns the value of an attribute as float.
  120. float getAttributeValueAsFloat(int idx) const
  121. {
  122. const char_type* attrvalue = getAttributeValue(idx);
  123. if (!attrvalue)
  124. return 0;
  125. core::stringc c = attrvalue;
  126. return fast_atof(c.c_str());
  127. }
  128. //! Returns the name of the current node.
  129. virtual const char_type* getNodeName() const
  130. {
  131. return NodeName.c_str();
  132. }
  133. //! Returns data of the current node.
  134. virtual const char_type* getNodeData() const
  135. {
  136. return NodeName.c_str();
  137. }
  138. //! Returns if an element is an empty element, like <foo />
  139. virtual bool isEmptyElement() const
  140. {
  141. return IsEmptyElement;
  142. }
  143. //! Returns format of the source xml file.
  144. virtual ETEXT_FORMAT getSourceFormat() const
  145. {
  146. return SourceFormat;
  147. }
  148. //! Returns format of the strings returned by the parser.
  149. virtual ETEXT_FORMAT getParserFormat() const
  150. {
  151. return TargetFormat;
  152. }
  153. private:
  154. // Reads the current xml node
  155. void parseCurrentNode()
  156. {
  157. char_type* start = P;
  158. // move forward until '<' found
  159. while(*P != L'<' && *P)
  160. ++P;
  161. if (!*P)
  162. return;
  163. if (P - start > 0)
  164. {
  165. // we found some text, store it
  166. if (setText(start, P))
  167. return;
  168. }
  169. ++P;
  170. // based on current token, parse and report next element
  171. switch(*P)
  172. {
  173. case L'/':
  174. parseClosingXMLElement();
  175. break;
  176. case L'?':
  177. ignoreDefinition();
  178. break;
  179. case L'!':
  180. if (!parseCDATA())
  181. parseComment();
  182. break;
  183. default:
  184. parseOpeningXMLElement();
  185. break;
  186. }
  187. }
  188. //! sets the state that text was found. Returns true if set should be set
  189. bool setText(char_type* start, char_type* end)
  190. {
  191. // check if text is more than 2 characters, and if not, check if there is
  192. // only white space, so that this text won't be reported
  193. if (end - start < 3)
  194. {
  195. char_type* p = start;
  196. for(; p != end; ++p)
  197. if (!isWhiteSpace(*p))
  198. break;
  199. if (p == end)
  200. return false;
  201. }
  202. // set current text to the parsed text, and replace xml special characters
  203. core::string<char_type> s(start, (int)(end - start));
  204. NodeName = replaceSpecialCharacters(s);
  205. // current XML node type is text
  206. CurrentNodeType = EXN_TEXT;
  207. return true;
  208. }
  209. //! ignores an xml definition like <?xml something />
  210. void ignoreDefinition()
  211. {
  212. CurrentNodeType = EXN_UNKNOWN;
  213. // move until end marked with '>' reached
  214. while(*P != L'>')
  215. ++P;
  216. ++P;
  217. }
  218. //! parses a comment
  219. void parseComment()
  220. {
  221. CurrentNodeType = EXN_COMMENT;
  222. P += 1;
  223. char_type *pCommentBegin = P;
  224. int count = 1;
  225. // move until end of comment reached
  226. while(count)
  227. {
  228. if (*P == L'>')
  229. --count;
  230. else
  231. if (*P == L'<')
  232. ++count;
  233. ++P;
  234. }
  235. P -= 3;
  236. NodeName = core::string<char_type>(pCommentBegin+2, (int)(P - pCommentBegin-2));
  237. P += 3;
  238. }
  239. //! parses an opening xml element and reads attributes
  240. void parseOpeningXMLElement()
  241. {
  242. CurrentNodeType = EXN_ELEMENT;
  243. IsEmptyElement = false;
  244. Attributes.clear();
  245. // find name
  246. const char_type* startName = P;
  247. // find end of element
  248. while(*P != L'>' && !isWhiteSpace(*P))
  249. ++P;
  250. const char_type* endName = P;
  251. // find Attributes
  252. while(*P != L'>')
  253. {
  254. if (isWhiteSpace(*P))
  255. ++P;
  256. else
  257. {
  258. if (*P != L'/')
  259. {
  260. // we've got an attribute
  261. // read the attribute names
  262. const char_type* attributeNameBegin = P;
  263. while(!isWhiteSpace(*P) && *P != L'=')
  264. ++P;
  265. const char_type* attributeNameEnd = P;
  266. ++P;
  267. // read the attribute value
  268. // check for quotes and single quotes, thx to murphy
  269. while( (*P != L'\"') && (*P != L'\'') && *P)
  270. ++P;
  271. if (!*P) // malformatted xml file
  272. return;
  273. const char_type attributeQuoteChar = *P;
  274. ++P;
  275. const char_type* attributeValueBegin = P;
  276. while(*P != attributeQuoteChar && *P)
  277. ++P;
  278. if (!*P) // malformatted xml file
  279. return;
  280. const char_type* attributeValueEnd = P;
  281. ++P;
  282. SAttribute attr;
  283. attr.Name = core::string<char_type>(attributeNameBegin,
  284. (int)(attributeNameEnd - attributeNameBegin));
  285. core::string<char_type> s(attributeValueBegin,
  286. (int)(attributeValueEnd - attributeValueBegin));
  287. attr.Value = replaceSpecialCharacters(s);
  288. Attributes.push_back(attr);
  289. }
  290. else
  291. {
  292. // tag is closed directly
  293. ++P;
  294. IsEmptyElement = true;
  295. break;
  296. }
  297. }
  298. }
  299. // check if this tag is closing directly
  300. if (endName > startName && *(endName-1) == L'/')
  301. {
  302. // directly closing tag
  303. IsEmptyElement = true;
  304. endName--;
  305. }
  306. NodeName = core::string<char_type>(startName, (int)(endName - startName));
  307. ++P;
  308. }
  309. //! parses an closing xml tag
  310. void parseClosingXMLElement()
  311. {
  312. CurrentNodeType = EXN_ELEMENT_END;
  313. IsEmptyElement = false;
  314. Attributes.clear();
  315. ++P;
  316. const char_type* pBeginClose = P;
  317. while(*P != L'>')
  318. ++P;
  319. // remove trailing whitespace, if any
  320. while( isspace( P[-1]))
  321. --P;
  322. NodeName = core::string<char_type>(pBeginClose, (int)(P - pBeginClose));
  323. ++P;
  324. }
  325. //! parses a possible CDATA section, returns false if begin was not a CDATA section
  326. bool parseCDATA()
  327. {
  328. if (*(P+1) != L'[')
  329. return false;
  330. CurrentNodeType = EXN_CDATA;
  331. // skip '<![CDATA['
  332. int count=0;
  333. while( *P && count<8 )
  334. {
  335. ++P;
  336. ++count;
  337. }
  338. if (!*P)
  339. return true;
  340. char_type *cDataBegin = P;
  341. char_type *cDataEnd = 0;
  342. // find end of CDATA
  343. while(*P && !cDataEnd)
  344. {
  345. if (*P == L'>' &&
  346. (*(P-1) == L']') &&
  347. (*(P-2) == L']'))
  348. {
  349. cDataEnd = P - 2;
  350. }
  351. ++P;
  352. }
  353. if ( cDataEnd )
  354. NodeName = core::string<char_type>(cDataBegin, (int)(cDataEnd - cDataBegin));
  355. else
  356. NodeName = "";
  357. return true;
  358. }
  359. // structure for storing attribute-name pairs
  360. struct SAttribute
  361. {
  362. core::string<char_type> Name;
  363. core::string<char_type> Value;
  364. };
  365. // finds a current attribute by name, returns 0 if not found
  366. const SAttribute* getAttributeByName(const char_type* name) const
  367. {
  368. if (!name)
  369. return 0;
  370. core::string<char_type> n = name;
  371. for (int i=0; i<(int)Attributes.size(); ++i)
  372. if (Attributes[i].Name == n)
  373. return &Attributes[i];
  374. return 0;
  375. }
  376. // replaces xml special characters in a string and creates a new one
  377. core::string<char_type> replaceSpecialCharacters(
  378. core::string<char_type>& origstr)
  379. {
  380. int pos = origstr.findFirst(L'&');
  381. int oldPos = 0;
  382. if (pos == -1)
  383. return origstr;
  384. core::string<char_type> newstr;
  385. while(pos != -1 && pos < origstr.size()-2)
  386. {
  387. // check if it is one of the special characters
  388. int specialChar = -1;
  389. for (int i=0; i<(int)SpecialCharacters.size(); ++i)
  390. {
  391. const char_type* p = &origstr.c_str()[pos]+1;
  392. if (equalsn(&SpecialCharacters[i][1], p, SpecialCharacters[i].size()-1))
  393. {
  394. specialChar = i;
  395. break;
  396. }
  397. }
  398. if (specialChar != -1)
  399. {
  400. newstr.append(origstr.subString(oldPos, pos - oldPos));
  401. newstr.append(SpecialCharacters[specialChar][0]);
  402. pos += SpecialCharacters[specialChar].size();
  403. }
  404. else
  405. {
  406. newstr.append(origstr.subString(oldPos, pos - oldPos + 1));
  407. pos += 1;
  408. }
  409. // find next &
  410. oldPos = pos;
  411. pos = origstr.findNext(L'&', pos);
  412. }
  413. if (oldPos < origstr.size()-1)
  414. newstr.append(origstr.subString(oldPos, origstr.size()-oldPos));
  415. return newstr;
  416. }
  417. //! reads the xml file and converts it into the wanted character format.
  418. bool readFile(IFileReadCallBack* callback)
  419. {
  420. int size = callback->getSize();
  421. size += 4; // We need two terminating 0's at the end.
  422. // For ASCII we need 1 0's, for UTF-16 2, for UTF-32 4.
  423. char* data8 = new char[size];
  424. if (!callback->read(data8, size-4))
  425. {
  426. delete [] data8;
  427. return false;
  428. }
  429. // add zeros at end
  430. data8[size-1] = 0;
  431. data8[size-2] = 0;
  432. data8[size-3] = 0;
  433. data8[size-4] = 0;
  434. char16* data16 = reinterpret_cast<char16*>(data8);
  435. char32* data32 = reinterpret_cast<char32*>(data8);
  436. // now we need to convert the data to the desired target format
  437. // based on the byte order mark.
  438. const unsigned char UTF8[] = {0xEF, 0xBB, 0xBF}; // 0xEFBBBF;
  439. const int UTF16_BE = 0xFFFE;
  440. const int UTF16_LE = 0xFEFF;
  441. const int UTF32_BE = 0xFFFE0000;
  442. const int UTF32_LE = 0x0000FEFF;
  443. // check source for all utf versions and convert to target data format
  444. if (size >= 4 && data32[0] == (char32)UTF32_BE)
  445. {
  446. // UTF-32, big endian
  447. SourceFormat = ETF_UTF32_BE;
  448. convertTextData(data32+1, data8, (size/4)); // data32+1 because we need to skip the header
  449. }
  450. else
  451. if (size >= 4 && data32[0] == (char32)UTF32_LE)
  452. {
  453. // UTF-32, little endian
  454. SourceFormat = ETF_UTF32_LE;
  455. convertTextData(data32+1, data8, (size/4)); // data32+1 because we need to skip the header
  456. }
  457. else
  458. if (size >= 2 && data16[0] == UTF16_BE)
  459. {
  460. // UTF-16, big endian
  461. SourceFormat = ETF_UTF16_BE;
  462. convertTextData(data16+1, data8, (size/2)); // data16+1 because we need to skip the header
  463. }
  464. else
  465. if (size >= 2 && data16[0] == UTF16_LE)
  466. {
  467. // UTF-16, little endian
  468. SourceFormat = ETF_UTF16_LE;
  469. convertTextData(data16+1, data8, (size/2)); // data16+1 because we need to skip the header
  470. }
  471. else
  472. if (size >= 3 && data8[0] == UTF8[0] && data8[1] == UTF8[1] && data8[2] == UTF8[2])
  473. {
  474. // UTF-8
  475. SourceFormat = ETF_UTF8;
  476. convertTextData(data8+3, data8, size); // data8+3 because we need to skip the header
  477. }
  478. else
  479. {
  480. // ASCII
  481. SourceFormat = ETF_ASCII;
  482. convertTextData(data8, data8, size);
  483. }
  484. return true;
  485. }
  486. //! converts the text file into the desired format.
  487. //! \param source: begin of the text (without byte order mark)
  488. //! \param pointerToStore: pointer to text data block which can be
  489. //! stored or deleted based on the nesessary conversion.
  490. //! \param sizeWithoutHeader: Text size in characters without header
  491. template<class src_char_type>
  492. void convertTextData(src_char_type* source, char* pointerToStore, int sizeWithoutHeader)
  493. {
  494. // convert little to big endian if necessary
  495. if (sizeof(src_char_type) > 1 &&
  496. isLittleEndian(TargetFormat) != isLittleEndian(SourceFormat))
  497. convertToLittleEndian(source);
  498. // check if conversion is necessary:
  499. if (sizeof(src_char_type) == sizeof(char_type))
  500. {
  501. // no need to convert
  502. TextBegin = (char_type*)source;
  503. TextData = (char_type*)pointerToStore;
  504. TextSize = sizeWithoutHeader;
  505. }
  506. else
  507. {
  508. // convert source into target data format.
  509. // TODO: implement a real conversion. This one just
  510. // copies bytes. This is a problem when there are
  511. // unicode symbols using more than one character.
  512. TextData = new char_type[sizeWithoutHeader];
  513. // MSVC debugger complains here about loss of data ...
  514. // FIXME - gcc complains about 'shift width larger than width of type'
  515. // for T == unsigned long. Avoid it by messing around volatile ..
  516. volatile unsigned int c = 3;
  517. const src_char_type cc = (src_char_type)((((uint64_t)1u << (sizeof( char_type)<<c)) - 1));
  518. for (int i=0; i<sizeWithoutHeader; ++i)
  519. TextData[i] = char_type( source[i] & cc);
  520. TextBegin = TextData;
  521. TextSize = sizeWithoutHeader;
  522. // delete original data because no longer needed
  523. delete [] pointerToStore;
  524. }
  525. }
  //! Swaps the byte order of every character of a zero terminated
  //! text buffer in place.
  //! \param t: Zero terminated buffer of 16 bit or 32 bit characters.
  template<class src_char_type>
  void convertToLittleEndian(src_char_type* t)
  {
    if (sizeof(src_char_type) == 4)
    {
      // 32 bit: reverse all four bytes
      for (; *t; ++t)
      {
        const src_char_type highest = (*t & 0xff000000) >> 24;
        const src_char_type upper   = (*t & 0x00ff0000) >> 8;
        const src_char_type lower   = (*t & 0x0000ff00) << 8;
        const src_char_type lowest  = (*t & 0x000000ff) << 24;

        *t = highest | upper | lower | lowest;
      }
    }
    else
    {
      // 16 bit: exchange the two bytes
      for (; *t; ++t)
        *t = (*t >> 8) | (*t << 8);
    }
  }
  552. //! returns if a format is little endian
  553. inline bool isLittleEndian(ETEXT_FORMAT f)
  554. {
  555. return f == ETF_ASCII ||
  556. f == ETF_UTF8 ||
  557. f == ETF_UTF16_LE ||
  558. f == ETF_UTF32_LE;
  559. }
  560. //! returns true if a character is whitespace
  561. inline bool isWhiteSpace(char_type c)
  562. {
  563. return (c==' ' || c=='\t' || c=='\n' || c=='\r');
  564. }
  565. //! generates a list with xml special characters
  566. void createSpecialCharacterList()
  567. {
  568. // list of strings containing special symbols,
  569. // the first character is the special character,
  570. // the following is the symbol string without trailing &.
  571. SpecialCharacters.push_back("&amp;");
  572. SpecialCharacters.push_back("<lt;");
  573. SpecialCharacters.push_back(">gt;");
  574. SpecialCharacters.push_back("\"quot;");
  575. SpecialCharacters.push_back("'apos;");
  576. }
  577. //! compares the first n characters of the strings
  578. bool equalsn(const char_type* str1, const char_type* str2, int len)
  579. {
  580. int i;
  581. for(i=0; str1[i] && str2[i] && i < len; ++i)
  582. if (str1[i] != str2[i])
  583. return false;
  584. // if one (or both) of the strings was smaller then they
  585. // are only equal if they have the same lenght
  586. return (i == len) || (str1[i] == 0 && str2[i] == 0);
  587. }
  588. //! stores the target text format
  589. void storeTargetFormat()
  590. {
  591. // get target format. We could have done this using template specialization,
  592. // but VisualStudio 6 don't like it and we want to support it.
  593. switch(sizeof(char_type))
  594. {
  595. case 1:
  596. TargetFormat = ETF_UTF8;
  597. break;
  598. case 2:
  599. TargetFormat = ETF_UTF16_LE;
  600. break;
  601. case 4:
  602. TargetFormat = ETF_UTF32_LE;
  603. break;
  604. default:
  605. TargetFormat = ETF_ASCII; // should never happen.
  606. }
  607. }
  608. // instance variables:
  609. char_type* TextData; // data block of the text file
  610. char_type* P; // current point in text to parse
  611. char_type* TextBegin; // start of text to parse
  612. unsigned int TextSize; // size of text to parse in characters, not bytes
  613. EXML_NODE CurrentNodeType; // type of the currently parsed node
  614. ETEXT_FORMAT SourceFormat; // source format of the xml file
  615. ETEXT_FORMAT TargetFormat; // output format of this parser
  616. core::string<char_type> NodeName; // name of the node currently in
  617. core::string<char_type> EmptyString; // empty string to be returned by getSafe() methods
  618. bool IsEmptyElement; // is the currently parsed node empty?
  619. core::array< core::string<char_type> > SpecialCharacters; // see createSpecialCharacterList()
  620. core::array<SAttribute> Attributes; // attributes of current element
  621. }; // end CXMLReaderImpl
  622. } // end namespace
  623. } // end namespace
  624. #endif