textEncoder.cxx 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323
  1. // Filename: textEncoder.cxx
  2. // Created by: drose (26Mar03)
  3. //
  4. ////////////////////////////////////////////////////////////////////
  5. //
  6. // PANDA 3D SOFTWARE
  7. // Copyright (c) 2001 - 2004, Disney Enterprises, Inc. All rights reserved
  8. //
  9. // All use of this software is subject to the terms of the Panda 3d
  10. // Software license. You should have received a copy of this license
  11. // along with this source code; you will also find a current copy of
  12. // the license at http://etc.cmu.edu/panda3d/docs/license/ .
  13. //
  14. // To contact the maintainers of this program write to
  15. // [email protected] .
  16. //
  17. ////////////////////////////////////////////////////////////////////
  18. #include "textEncoder.h"
  19. #include "stringDecoder.h"
  20. #include "unicodeLatinMap.h"
// Definition (storage) of TextEncoder's static _type_handle member.
TypeHandle TextEncoder::_type_handle;
// Definition (storage) of TextEncoder's static _default_encoding
// member.  No initializer is supplied here, so as an object with static
// storage duration it starts out zero-initialized.
// NOTE(review): confirm in the header which Encoding enumerator has the
// value 0, since that is the effective default until something else
// assigns this member.
TextEncoder::Encoding TextEncoder::_default_encoding;
  23. ////////////////////////////////////////////////////////////////////
  24. // Function: TextEncoder::make_upper
  25. // Access: Published
  26. // Description: Adjusts the text stored within the encoder to all
  27. // uppercase letters (preserving accent marks
  28. // correctly).
  29. ////////////////////////////////////////////////////////////////////
  30. void TextEncoder::
  31. make_upper() {
  32. get_wtext();
  33. wstring::iterator si;
  34. for (si = _wtext.begin(); si != _wtext.end(); ++si) {
  35. (*si) = unicode_toupper(*si);
  36. }
  37. _flags &= ~F_got_text;
  38. }
  39. ////////////////////////////////////////////////////////////////////
  40. // Function: TextEncoder::make_lower
  41. // Access: Published
  42. // Description: Adjusts the text stored within the encoder to all
  43. // lowercase letters (preserving accent marks
  44. // correctly).
  45. ////////////////////////////////////////////////////////////////////
  46. void TextEncoder::
  47. make_lower() {
  48. get_wtext();
  49. wstring::iterator si;
  50. for (si = _wtext.begin(); si != _wtext.end(); ++si) {
  51. (*si) = unicode_tolower(*si);
  52. }
  53. _flags &= ~F_got_text;
  54. }
  55. ////////////////////////////////////////////////////////////////////
  56. // Function: TextEncoder::get_wtext_as_ascii
  57. // Access: Public
  58. // Description: Returns the text associated with the node, converted
  59. // as nearly as possible to a fully-ASCII
  60. // representation. This means replacing accented
  61. // letters with their unaccented ASCII equivalents.
  62. //
  63. // It is possible that some characters in the string
  64. // cannot be converted to ASCII. (The string may
  65. // involve symbols like the copyright symbol, for
  66. // instance, or it might involve letters in some other
  67. // alphabet such as Greek or Cyrillic, or even Latin
  68. // letters like thorn or eth that are not part of the
  69. // ASCII character set.) In this case, as much of the
  70. // string as possible will be converted to ASCII, and
  71. // the nonconvertible characters will remain in their
  72. // original form.
  73. ////////////////////////////////////////////////////////////////////
  74. wstring TextEncoder::
  75. get_wtext_as_ascii() const {
  76. get_wtext();
  77. wstring result;
  78. wstring::const_iterator si;
  79. for (si = _wtext.begin(); si != _wtext.end(); ++si) {
  80. wchar_t character = (*si);
  81. const UnicodeLatinMap::Entry *map_entry =
  82. UnicodeLatinMap::look_up(character);
  83. if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
  84. result += (wchar_t)map_entry->_ascii_equiv;
  85. if (map_entry->_ascii_additional != 0) {
  86. result += (wchar_t)map_entry->_ascii_additional;
  87. }
  88. } else {
  89. result += character;
  90. }
  91. }
  92. return result;
  93. }
  94. ////////////////////////////////////////////////////////////////////
  95. // Function: TextEncoder::encode_wchar
  96. // Access: Public, Static
  97. // Description: Encodes a single wide char into a one-, two-, or
  98. // three-byte string, according to the given encoding
  99. // system.
  100. ////////////////////////////////////////////////////////////////////
  101. string TextEncoder::
  102. encode_wchar(wchar_t ch, TextEncoder::Encoding encoding) {
  103. switch (encoding) {
  104. case E_iso8859:
  105. if (ch < 0x100) {
  106. return string(1, (char)ch);
  107. } else {
  108. // The character won't fit in the 8-bit ISO 8859. See if we can
  109. // make it fit by reducing it to its ascii equivalent
  110. // (essentially stripping off an unusual accent mark).
  111. const UnicodeLatinMap::Entry *map_entry =
  112. UnicodeLatinMap::look_up(ch);
  113. if (map_entry != NULL && map_entry->_ascii_equiv != 0) {
  114. // Yes, it has an ascii equivalent.
  115. if (map_entry->_ascii_additional != 0) {
  116. // In fact, it has two of them.
  117. return
  118. string(1, map_entry->_ascii_equiv) +
  119. string(1, map_entry->_ascii_additional);
  120. }
  121. return string(1, map_entry->_ascii_equiv);
  122. }
  123. // Nope; return "." for lack of anything better.
  124. return ".";
  125. }
  126. case E_utf8:
  127. if (ch < 0x80) {
  128. return string(1, (char)ch);
  129. } else if (ch < 0x800) {
  130. return
  131. string(1, (char)((ch >> 6) | 0xc0)) +
  132. string(1, (char)((ch & 0x3f) | 0x80));
  133. } else {
  134. return
  135. string(1, (char)((ch >> 12) | 0xe0)) +
  136. string(1, (char)(((ch >> 6) & 0x3f) | 0x80)) +
  137. string(1, (char)((ch & 0x3f) | 0x80));
  138. }
  139. case E_unicode:
  140. return
  141. string(1, (char)(ch >> 8)) +
  142. string(1, (char)(ch & 0xff));
  143. }
  144. return "";
  145. }
  146. ////////////////////////////////////////////////////////////////////
  147. // Function: TextEncoder::encode_wtext
  148. // Access: Public, Static
  149. // Description: Encodes a wide-text string into a single-char string,
  150. // according to the given encoding.
  151. ////////////////////////////////////////////////////////////////////
  152. string TextEncoder::
  153. encode_wtext(const wstring &wtext, TextEncoder::Encoding encoding) {
  154. string result;
  155. for (wstring::const_iterator pi = wtext.begin(); pi != wtext.end(); ++pi) {
  156. result += encode_wchar(*pi, encoding);
  157. }
  158. return result;
  159. }
  160. ////////////////////////////////////////////////////////////////////
  161. // Function: TextEncoder::decode_text
  162. // Access: Public, Static
  163. // Description: Returns the given wstring decoded to a single-byte
  164. // string, via the given encoding system.
  165. ////////////////////////////////////////////////////////////////////
  166. wstring TextEncoder::
  167. decode_text(const string &text, TextEncoder::Encoding encoding) {
  168. switch (encoding) {
  169. case E_utf8:
  170. {
  171. StringUtf8Decoder decoder(text);
  172. return decode_text_impl(decoder);
  173. }
  174. case E_unicode:
  175. {
  176. StringUnicodeDecoder decoder(text);
  177. return decode_text_impl(decoder);
  178. }
  179. case E_iso8859:
  180. default:
  181. {
  182. StringDecoder decoder(text);
  183. return decode_text_impl(decoder);
  184. }
  185. };
  186. }
  187. ////////////////////////////////////////////////////////////////////
  188. // Function: TextEncoder::decode_text_impl
  189. // Access: Private, Static
  190. // Description: Decodes the eight-bit stream from the indicated
  191. // decoder, returning the decoded wide-char string.
  192. ////////////////////////////////////////////////////////////////////
  193. wstring TextEncoder::
  194. decode_text_impl(StringDecoder &decoder) {
  195. wstring result;
  196. // bool expand_amp = get_expand_amp();
  197. wchar_t character = decoder.get_next_character();
  198. while (!decoder.is_eof()) {
  199. /*
  200. if (character == '&' && expand_amp) {
  201. // An ampersand in expand_amp mode is treated as an escape
  202. // character.
  203. character = expand_amp_sequence(decoder);
  204. }
  205. */
  206. result += character;
  207. character = decoder.get_next_character();
  208. }
  209. return result;
  210. }
  211. /*
  212. ////////////////////////////////////////////////////////////////////
  213. // Function: TextEncoder::expand_amp_sequence
  214. // Access: Private
  215. // Description: Given that we have just read an ampersand from the
  216. // StringDecoder, and that we have expand_amp in effect
  217. // and are therefore expected to expand the sequence
  218. // that this ampersand begins into a single unicode
  219. // character, do the expansion and return the character.
  220. ////////////////////////////////////////////////////////////////////
  221. int TextEncoder::
  222. expand_amp_sequence(StringDecoder &decoder) const {
  223. int result = 0;
  224. int character = decoder.get_next_character();
  225. if (!decoder.is_eof() && character == '#') {
  226. // An explicit numeric sequence: &#nnn;
  227. result = 0;
  228. character = decoder.get_next_character();
  229. while (!decoder.is_eof() && character < 128 && isdigit((unsigned int)character)) {
  230. result = (result * 10) + (character - '0');
  231. character = decoder.get_next_character();
  232. }
  233. if (character != ';') {
  234. // Invalid sequence.
  235. return 0;
  236. }
  237. return result;
  238. }
  239. string sequence;
  240. // Some non-numeric sequence.
  241. while (!decoder.is_eof() && character < 128 && isalpha((unsigned int)character)) {
  242. sequence += character;
  243. character = decoder.get_next_character();
  244. }
  245. if (character != ';') {
  246. // Invalid sequence.
  247. return 0;
  248. }
  249. static const struct {
  250. const char *name;
  251. int code;
  252. } tokens[] = {
  253. { "amp", '&' }, { "lt", '<' }, { "gt", '>' }, { "quot", '"' },
  254. { "nbsp", ' ' },
  255. { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 }, { "curren", 164 },
  256. { "yen", 165 }, { "brvbar", 166 }, { "brkbar", 166 }, { "sect", 167 },
  257. { "uml", 168 }, { "die", 168 }, { "copy", 169 }, { "ordf", 170 },
  258. { "laquo", 171 }, { "not", 172 }, { "shy", 173 }, { "reg", 174 },
  259. { "macr", 175 }, { "hibar", 175 }, { "deg", 176 }, { "plusmn", 177 },
  260. { "sup2", 178 }, { "sup3", 179 }, { "acute", 180 }, { "micro", 181 },
  261. { "para", 182 }, { "middot", 183 }, { "cedil", 184 }, { "sup1", 185 },
  262. { "ordm", 186 }, { "raquo", 187 }, { "frac14", 188 }, { "frac12", 189 },
  263. { "frac34", 190 }, { "iquest", 191 }, { "Agrave", 192 }, { "Aacute", 193 },
  264. { "Acirc", 194 }, { "Atilde", 195 }, { "Auml", 196 }, { "Aring", 197 },
  265. { "AElig", 198 }, { "Ccedil", 199 }, { "Egrave", 200 }, { "Eacute", 201 },
  266. { "Ecirc", 202 }, { "Euml", 203 }, { "Igrave", 204 }, { "Iacute", 205 },
  267. { "Icirc", 206 }, { "Iuml", 207 }, { "ETH", 208 }, { "Dstrok", 208 },
  268. { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 }, { "Ocirc", 212 },
  269. { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 }, { "Oslash", 216 },
  270. { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 }, { "Uuml", 220 },
  271. { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 }, { "agrave", 224 },
  272. { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 }, { "auml", 228 },
  273. { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 }, { "egrave", 232 },
  274. { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 }, { "igrave", 236 },
  275. { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 }, { "eth", 240 },
  276. { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 }, { "ocirc", 244 },
  277. { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 }, { "oslash", 248 },
  278. { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 }, { "uuml", 252 },
  279. { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },
  280. { NULL, 0 },
  281. };
  282. for (int i = 0; tokens[i].name != NULL; i++) {
  283. if (sequence == tokens[i].name) {
  284. // Here's a match.
  285. return tokens[i].code;
  286. }
  287. }
  288. // Some unrecognized sequence.
  289. return 0;
  290. }
  291. */