// CXMLReaderImpl.h
  1. // Copyright (C) 2002-2005 Nikolaus Gebhardt
  2. // This file is part of the "Irrlicht Engine" and the "irrXML" project.
  3. // For conditions of distribution and use, see copyright notice in irrlicht.h and/or irrXML.h
  4. #ifndef __ICXML_READER_IMPL_H_INCLUDED__
  5. #define __ICXML_READER_IMPL_H_INCLUDED__
  6. #include "irrXML.h"
  7. #include "irrString.h"
  8. #include "irrArray.h"
  9. #include "fast_atof.h"
  #ifdef _DEBUG
  // Debug builds print the message; it is passed as an argument instead of
  // as the format string so that a '%' in the text is printed literally.
  #define IRR_DEBUGPRINT(x) printf("%s", (x));
  #else // _DEBUG
  // Release builds compile the macro away.
  #define IRR_DEBUGPRINT(x)
  #endif // _DEBUG
  15. namespace irr
  16. {
  17. namespace io
  18. {
  19. //! implementation of the IrrXMLReader
  20. template<class char_type, class superclass>
  21. class CXMLReaderImpl : public IIrrXMLReader<char_type, superclass>
  22. {
  23. public:
  24. //! Constructor
  25. CXMLReaderImpl(IFileReadCallBack* callback, bool deleteCallBack = true)
  26. : TextData(0)
  27. , P(0)
  28. , TextBegin(0)
  29. , TextSize(0)
  30. , CurrentNodeType(EXN_NONE)
  31. , SourceFormat(ETF_ASCII)
  32. , TargetFormat(ETF_ASCII)
  33. , NodeName ()
  34. , EmptyString()
  35. , IsEmptyElement(false)
  36. , SpecialCharacters()
  37. , Attributes() {
  38. if (!callback) {
  39. return;
  40. }
  41. storeTargetFormat();
  42. // read whole xml file
  43. readFile(callback);
  44. // clean up
  45. if (deleteCallBack)
  46. delete callback;
  47. // create list with special characters
  48. createSpecialCharacterList();
  49. // set pointer to text begin
  50. P = TextBegin;
  51. }
  52. //! Destructor
  53. virtual ~CXMLReaderImpl()
  54. {
  55. delete [] TextData;
  56. }
  57. //! Reads forward to the next xml node.
  58. //! \return Returns false, if there was no further node.
  59. virtual bool read()
  60. {
  61. // if not end reached, parse the node
  62. if (P && (unsigned int)(P - TextBegin) < TextSize - 1 && *P != 0)
  63. {
  64. parseCurrentNode();
  65. return true;
  66. }
  67. _IRR_IMPLEMENT_MANAGED_MARSHALLING_BUGFIX;
  68. return false;
  69. }
  70. //! Returns the type of the current XML node.
  71. virtual EXML_NODE getNodeType() const
  72. {
  73. return CurrentNodeType;
  74. }
  75. //! Returns attribute count of the current XML node.
  76. virtual int getAttributeCount() const
  77. {
  78. return Attributes.size();
  79. }
  80. //! Returns name of an attribute.
  81. virtual const char_type* getAttributeName(int idx) const
  82. {
  83. if (idx < 0 || idx >= (int)Attributes.size())
  84. return 0;
  85. return Attributes[idx].Name.c_str();
  86. }
  87. //! Returns the value of an attribute.
  88. virtual const char_type* getAttributeValue(int idx) const
  89. {
  90. if (idx < 0 || idx >= (int)Attributes.size())
  91. return 0;
  92. return Attributes[idx].Value.c_str();
  93. }
  94. //! Returns the value of an attribute.
  95. virtual const char_type* getAttributeValue(const char_type* name) const
  96. {
  97. const SAttribute* attr = getAttributeByName(name);
  98. if (!attr)
  99. return 0;
  100. return attr->Value.c_str();
  101. }
  102. //! Returns the value of an attribute
  103. virtual const char_type* getAttributeValueSafe(const char_type* name) const
  104. {
  105. const SAttribute* attr = getAttributeByName(name);
  106. if (!attr)
  107. return EmptyString.c_str();
  108. return attr->Value.c_str();
  109. }
  110. //! Returns the value of an attribute as integer.
  111. int getAttributeValueAsInt(const char_type* name) const
  112. {
  113. return (int)getAttributeValueAsFloat(name);
  114. }
  115. //! Returns the value of an attribute as integer.
  116. int getAttributeValueAsInt(int idx) const
  117. {
  118. return (int)getAttributeValueAsFloat(idx);
  119. }
  120. //! Returns the value of an attribute as float.
  121. float getAttributeValueAsFloat(const char_type* name) const
  122. {
  123. const SAttribute* attr = getAttributeByName(name);
  124. if (!attr)
  125. return 0;
  126. core::stringc c = attr->Value.c_str();
  127. return core::fast_atof(c.c_str());
  128. }
  129. //! Returns the value of an attribute as float.
  130. float getAttributeValueAsFloat(int idx) const
  131. {
  132. const char_type* attrvalue = getAttributeValue(idx);
  133. if (!attrvalue)
  134. return 0;
  135. core::stringc c = attrvalue;
  136. return core::fast_atof(c.c_str());
  137. }
  138. //! Returns the name of the current node.
  139. virtual const char_type* getNodeName() const
  140. {
  141. return NodeName.c_str();
  142. }
  143. //! Returns data of the current node.
  144. virtual const char_type* getNodeData() const
  145. {
  146. return NodeName.c_str();
  147. }
  148. //! Returns if an element is an empty element, like <foo />
  149. virtual bool isEmptyElement() const
  150. {
  151. return IsEmptyElement;
  152. }
  153. //! Returns format of the source xml file.
  154. virtual ETEXT_FORMAT getSourceFormat() const
  155. {
  156. return SourceFormat;
  157. }
  158. //! Returns format of the strings returned by the parser.
  159. virtual ETEXT_FORMAT getParserFormat() const
  160. {
  161. return TargetFormat;
  162. }
  163. private:
  164. // Reads the current xml node
  165. void parseCurrentNode()
  166. {
  167. char_type* start = P;
  168. // more forward until '<' found
  169. while(*P != L'<' && *P)
  170. ++P;
  171. if (!*P)
  172. return;
  173. if (P - start > 0)
  174. {
  175. // we found some text, store it
  176. if (setText(start, P))
  177. return;
  178. }
  179. ++P;
  180. // based on current token, parse and report next element
  181. switch(*P)
  182. {
  183. case L'/':
  184. parseClosingXMLElement();
  185. break;
  186. case L'?':
  187. ignoreDefinition();
  188. break;
  189. case L'!':
  190. if (!parseCDATA())
  191. parseComment();
  192. break;
  193. default:
  194. parseOpeningXMLElement();
  195. break;
  196. }
  197. }
  198. //! sets the state that text was found. Returns true if set should be set
  199. bool setText(char_type* start, char_type* end)
  200. {
  201. // check if text is more than 2 characters, and if not, check if there is
  202. // only white space, so that this text won't be reported
  203. if (end - start < 3)
  204. {
  205. char_type* p = start;
  206. for(; p != end; ++p)
  207. if (!isWhiteSpace(*p))
  208. break;
  209. if (p == end)
  210. return false;
  211. }
  212. // set current text to the parsed text, and replace xml special characters
  213. core::string<char_type> s(start, (int)(end - start));
  214. NodeName = replaceSpecialCharacters(s);
  215. // current XML node type is text
  216. CurrentNodeType = EXN_TEXT;
  217. return true;
  218. }
  219. //! ignores an xml definition like <?xml something />
  220. void ignoreDefinition()
  221. {
  222. CurrentNodeType = EXN_UNKNOWN;
  223. // move until end marked with '>' reached
  224. while(*P != L'>')
  225. ++P;
  226. ++P;
  227. }
  228. //! parses a comment
  229. void parseComment()
  230. {
  231. CurrentNodeType = EXN_COMMENT;
  232. P += 1;
  233. char_type *pCommentBegin = P;
  234. int count = 1;
  235. // move until end of comment reached
  236. while(count)
  237. {
  238. if (*P == L'>')
  239. --count;
  240. else
  241. if (*P == L'<')
  242. ++count;
  243. ++P;
  244. }
  245. P -= 3;
  246. NodeName = core::string<char_type>(pCommentBegin+2, (int)(P - pCommentBegin-2));
  247. P += 3;
  248. }
  249. //! parses an opening xml element and reads attributes
  250. void parseOpeningXMLElement()
  251. {
  252. CurrentNodeType = EXN_ELEMENT;
  253. IsEmptyElement = false;
  254. Attributes.clear();
  255. // find name
  256. const char_type* startName = P;
  257. // find end of element
  258. while(*P != L'>' && !isWhiteSpace(*P))
  259. ++P;
  260. const char_type* endName = P;
  261. // find Attributes
  262. while(*P != L'>')
  263. {
  264. if (isWhiteSpace(*P))
  265. ++P;
  266. else
  267. {
  268. if (*P != L'/')
  269. {
  270. // we've got an attribute
  271. // read the attribute names
  272. const char_type* attributeNameBegin = P;
  273. while(!isWhiteSpace(*P) && *P != L'=')
  274. ++P;
  275. const char_type* attributeNameEnd = P;
  276. ++P;
  277. // read the attribute value
  278. // check for quotes and single quotes, thx to murphy
  279. while( (*P != L'\"') && (*P != L'\'') && *P)
  280. ++P;
  281. if (!*P) // malformatted xml file
  282. return;
  283. const char_type attributeQuoteChar = *P;
  284. ++P;
  285. const char_type* attributeValueBegin = P;
  286. while(*P != attributeQuoteChar && *P)
  287. ++P;
  288. if (!*P) // malformatted xml file
  289. return;
  290. const char_type* attributeValueEnd = P;
  291. ++P;
  292. SAttribute attr;
  293. attr.Name = core::string<char_type>(attributeNameBegin,
  294. (int)(attributeNameEnd - attributeNameBegin));
  295. core::string<char_type> s(attributeValueBegin,
  296. (int)(attributeValueEnd - attributeValueBegin));
  297. attr.Value = replaceSpecialCharacters(s);
  298. Attributes.push_back(attr);
  299. }
  300. else
  301. {
  302. // tag is closed directly
  303. ++P;
  304. IsEmptyElement = true;
  305. break;
  306. }
  307. }
  308. }
  309. // check if this tag is closing directly
  310. if (endName > startName && *(endName-1) == L'/')
  311. {
  312. // directly closing tag
  313. IsEmptyElement = true;
  314. endName--;
  315. }
  316. NodeName = core::string<char_type>(startName, (int)(endName - startName));
  317. ++P;
  318. }
  319. //! parses an closing xml tag
  320. void parseClosingXMLElement()
  321. {
  322. CurrentNodeType = EXN_ELEMENT_END;
  323. IsEmptyElement = false;
  324. Attributes.clear();
  325. ++P;
  326. const char_type* pBeginClose = P;
  327. while(*P != L'>')
  328. ++P;
  329. NodeName = core::string<char_type>(pBeginClose, (int)(P - pBeginClose));
  330. ++P;
  331. }
  332. //! parses a possible CDATA section, returns false if begin was not a CDATA section
  333. bool parseCDATA()
  334. {
  335. if (*(P+1) != L'[')
  336. return false;
  337. CurrentNodeType = EXN_CDATA;
  338. // skip '<![CDATA['
  339. int count=0;
  340. while( *P && count<8 )
  341. {
  342. ++P;
  343. ++count;
  344. }
  345. if (!*P)
  346. return true;
  347. char_type *cDataBegin = P;
  348. char_type *cDataEnd = 0;
  349. // find end of CDATA
  350. while(*P && !cDataEnd)
  351. {
  352. if (*P == L'>' &&
  353. (*(P-1) == L']') &&
  354. (*(P-2) == L']'))
  355. {
  356. cDataEnd = P - 2;
  357. }
  358. ++P;
  359. }
  360. if ( cDataEnd )
  361. NodeName = core::string<char_type>(cDataBegin, (int)(cDataEnd - cDataBegin));
  362. else
  363. NodeName = "";
  364. return true;
  365. }
  366. // structure for storing attribute-name pairs
  367. struct SAttribute
  368. {
  369. core::string<char_type> Name;
  370. core::string<char_type> Value;
  371. };
  372. // finds a current attribute by name, returns 0 if not found
  373. const SAttribute* getAttributeByName(const char_type* name) const
  374. {
  375. if (!name)
  376. return 0;
  377. core::string<char_type> n = name;
  378. for (int i=0; i<(int)Attributes.size(); ++i)
  379. if (Attributes[i].Name == n)
  380. return &Attributes[i];
  381. return 0;
  382. }
  383. // replaces xml special characters in a string and creates a new one
  384. core::string<char_type> replaceSpecialCharacters(
  385. core::string<char_type>& origstr)
  386. {
  387. int pos = origstr.findFirst(L'&');
  388. int oldPos = 0;
  389. if (pos == -1)
  390. return origstr;
  391. core::string<char_type> newstr;
  392. while(pos != -1 && pos < origstr.size()-2)
  393. {
  394. // check if it is one of the special characters
  395. int specialChar = -1;
  396. for (int i=0; i<(int)SpecialCharacters.size(); ++i)
  397. {
  398. const char_type* p = &origstr.c_str()[pos]+1;
  399. if (equalsn(&SpecialCharacters[i][1], p, SpecialCharacters[i].size()-1))
  400. {
  401. specialChar = i;
  402. break;
  403. }
  404. }
  405. if (specialChar != -1)
  406. {
  407. newstr.append(origstr.subString(oldPos, pos - oldPos));
  408. newstr.append(SpecialCharacters[specialChar][0]);
  409. pos += SpecialCharacters[specialChar].size();
  410. }
  411. else
  412. {
  413. newstr.append(origstr.subString(oldPos, pos - oldPos + 1));
  414. pos += 1;
  415. }
  416. // find next &
  417. oldPos = pos;
  418. pos = origstr.findNext(L'&', pos);
  419. }
  420. if (oldPos < origstr.size()-1)
  421. newstr.append(origstr.subString(oldPos, origstr.size()-oldPos));
  422. return newstr;
  423. }
  424. //! reads the xml file and converts it into the wanted character format.
  425. bool readFile(IFileReadCallBack* callback)
  426. {
  427. int size = callback->getSize();
  428. size += 4; // We need two terminating 0's at the end.
  429. // For ASCII we need 1 0's, for UTF-16 2, for UTF-32 4.
  430. char* data8 = new char[size];
  431. if (!callback->read(data8, size-4))
  432. {
  433. delete [] data8;
  434. return false;
  435. }
  436. // add zeros at end
  437. data8[size-1] = 0;
  438. data8[size-2] = 0;
  439. data8[size-3] = 0;
  440. data8[size-4] = 0;
  441. char16* data16 = reinterpret_cast<char16*>(data8);
  442. char32* data32 = reinterpret_cast<char32*>(data8);
  443. // now we need to convert the data to the desired target format
  444. // based on the byte order mark.
  445. const unsigned char UTF8[] = {0xEF, 0xBB, 0xBF}; // 0xEFBBBF;
  446. const int UTF16_BE = 0xFFFE;
  447. const int UTF16_LE = 0xFEFF;
  448. const int UTF32_BE = 0xFFFE0000;
  449. const int UTF32_LE = 0x0000FEFF;
  450. // check source for all utf versions and convert to target data format
  451. if (size >= 4 && data32[0] == (char32)UTF32_BE)
  452. {
  453. // UTF-32, big endian
  454. SourceFormat = ETF_UTF32_BE;
  455. convertTextData(data32+1, data8, (size/4)); // data32+1 because we need to skip the header
  456. }
  457. else
  458. if (size >= 4 && data32[0] == (char32)UTF32_LE)
  459. {
  460. // UTF-32, little endian
  461. SourceFormat = ETF_UTF32_LE;
  462. convertTextData(data32+1, data8, (size/4)); // data32+1 because we need to skip the header
  463. }
  464. else
  465. if (size >= 2 && data16[0] == UTF16_BE)
  466. {
  467. // UTF-16, big endian
  468. SourceFormat = ETF_UTF16_BE;
  469. convertTextData(data16+1, data8, (size/2)); // data16+1 because we need to skip the header
  470. }
  471. else
  472. if (size >= 2 && data16[0] == UTF16_LE)
  473. {
  474. // UTF-16, little endian
  475. SourceFormat = ETF_UTF16_LE;
  476. convertTextData(data16+1, data8, (size/2)); // data16+1 because we need to skip the header
  477. }
  478. else
  479. if (size >= 3 && data8[0] == UTF8[0] && data8[1] == UTF8[1] && data8[2] == UTF8[2])
  480. {
  481. // UTF-8
  482. SourceFormat = ETF_UTF8;
  483. convertTextData(data8+3, data8, size); // data8+3 because we need to skip the header
  484. }
  485. else
  486. {
  487. // ASCII
  488. SourceFormat = ETF_ASCII;
  489. convertTextData(data8, data8, size);
  490. }
  491. return true;
  492. }
  493. //! converts the text file into the desired format.
  494. //! \param source: begin of the text (without byte order mark)
  495. //! \param pointerToStore: pointer to text data block which can be
  496. //! stored or deleted based on the nesessary conversion.
  497. //! \param sizeWithoutHeader: Text size in characters without header
  498. template<class src_char_type>
  499. void convertTextData(src_char_type* source, char* pointerToStore, int sizeWithoutHeader)
  500. {
  501. // convert little to big endian if necessary
  502. if (sizeof(src_char_type) > 1 &&
  503. isLittleEndian(TargetFormat) != isLittleEndian(SourceFormat))
  504. convertToLittleEndian(source);
  505. // check if conversion is necessary:
  506. if (sizeof(src_char_type) == sizeof(char_type))
  507. {
  508. // no need to convert
  509. TextBegin = (char_type*)source;
  510. TextData = (char_type*)pointerToStore;
  511. TextSize = sizeWithoutHeader;
  512. }
  513. else
  514. {
  515. // convert source into target data format.
  516. // TODO: implement a real conversion. This one just
  517. // copies bytes. This is a problem when there are
  518. // unicode symbols using more than one character.
  519. TextData = new char_type[sizeWithoutHeader];
  520. for (int i=0; i<sizeWithoutHeader; ++i)
  521. TextData[i] = (char_type)source[i];
  522. TextBegin = TextData;
  523. TextSize = sizeWithoutHeader;
  524. // delete original data because no longer needed
  525. delete [] pointerToStore;
  526. }
  527. }
  //! Reverses the byte order of every character of a zero terminated buffer.
  //! Works in place and for any character width; characters are swapped
  //! byte by byte, so signed character types are handled correctly too.
  //! \param t: first character of the buffer; processing stops at the
  //! first character that is zero.
  template<class src_char_type>
  void convertToLittleEndian(src_char_type* t)
  {
      const unsigned int width = (unsigned int)sizeof(src_char_type);

      for (; *t; ++t)
      {
          unsigned char* bytes = reinterpret_cast<unsigned char*>(t);

          // swap the outermost bytes and move inwards
          for (unsigned int lo = 0, hi = width - 1; lo < hi; ++lo, --hi)
          {
              const unsigned char tmp = bytes[lo];
              bytes[lo] = bytes[hi];
              bytes[hi] = tmp;
          }
      }
  }
  554. //! returns if a format is little endian
  555. inline bool isLittleEndian(ETEXT_FORMAT f)
  556. {
  557. return f == ETF_ASCII ||
  558. f == ETF_UTF8 ||
  559. f == ETF_UTF16_LE ||
  560. f == ETF_UTF32_LE;
  561. }
  562. //! returns true if a character is whitespace
  563. inline bool isWhiteSpace(char_type c)
  564. {
  565. return (c==' ' || c=='\t' || c=='\n' || c=='\r');
  566. }
  567. //! generates a list with xml special characters
  568. void createSpecialCharacterList()
  569. {
  570. // list of strings containing special symbols,
  571. // the first character is the special character,
  572. // the following is the symbol string without trailing &.
  573. SpecialCharacters.push_back("&amp;");
  574. SpecialCharacters.push_back("<lt;");
  575. SpecialCharacters.push_back(">gt;");
  576. SpecialCharacters.push_back("\"quot;");
  577. SpecialCharacters.push_back("'apos;");
  578. }
  579. //! compares the first n characters of the strings
  580. bool equalsn(const char_type* str1, const char_type* str2, int len)
  581. {
  582. int i;
  583. for(i=0; str1[i] && str2[i] && i < len; ++i)
  584. if (str1[i] != str2[i])
  585. return false;
  586. // if one (or both) of the strings was smaller then they
  587. // are only equal if they have the same lenght
  588. return (i == len) || (str1[i] == 0 && str2[i] == 0);
  589. }
  590. //! stores the target text format
  591. void storeTargetFormat()
  592. {
  593. // get target format. We could have done this using template specialization,
  594. // but VisualStudio 6 don't like it and we want to support it.
  595. switch(sizeof(char_type))
  596. {
  597. case 1:
  598. TargetFormat = ETF_UTF8;
  599. break;
  600. case 2:
  601. TargetFormat = ETF_UTF16_LE;
  602. break;
  603. case 4:
  604. TargetFormat = ETF_UTF32_LE;
  605. break;
  606. default:
  607. TargetFormat = ETF_ASCII; // should never happen.
  608. }
  609. }
  610. // instance variables:
  611. char_type* TextData; // data block of the text file
  612. char_type* P; // current point in text to parse
  613. char_type* TextBegin; // start of text to parse
  614. unsigned int TextSize; // size of text to parse in characters, not bytes
  615. EXML_NODE CurrentNodeType; // type of the currently parsed node
  616. ETEXT_FORMAT SourceFormat; // source format of the xml file
  617. ETEXT_FORMAT TargetFormat; // output format of this parser
  618. core::string<char_type> NodeName; // name of the node currently in
  619. core::string<char_type> EmptyString; // empty string to be returned by getSafe() methods
  620. bool IsEmptyElement; // is the currently parsed node empty?
  621. core::array< core::string<char_type> > SpecialCharacters; // see createSpecialCharacterList()
  622. core::array<SAttribute> Attributes; // attributes of current element
  623. }; // end CXMLReaderImpl
  624. } // end namespace
  625. } // end namespace
  626. #endif