// Unicode.h 5.2 KB
  //
  // Unicode.h
  //
  // $Id: //poco/1.4/Foundation/include/Poco/Unicode.h#2 $
  //
  // Library: Foundation
  // Package: Text
  // Module: Unicode
  //
  // Definition of the Unicode class.
  //
  // Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
  // and Contributors.
  //
  // SPDX-License-Identifier: BSL-1.0
  //
  17. #ifndef Foundation_Unicode_INCLUDED
  18. #define Foundation_Unicode_INCLUDED
  19. #include "Poco/Foundation.h"
  20. namespace Poco {
  21. class Foundation_API Unicode
  22. /// This class contains enumerations and static
  23. /// utility functions for dealing with Unicode characters
  24. /// and their properties.
  25. ///
  26. /// For more information on Unicode, see <http://www.unicode.org>.
  27. ///
  28. /// The implementation is based on the Unicode support
  29. /// functions in PCRE.
  30. {
  31. public:
  32. // Implementation note: the following definitions must be kept
  33. // in sync with those from ucp.h (PCRE).
  34. enum CharacterCategory
  35. /// Unicode 5.0 character categories.
  36. {
  37. UCP_OTHER,
  38. UCP_LETTER,
  39. UCP_MARK,
  40. UCP_NUMBER,
  41. UCP_PUNCTUATION,
  42. UCP_SYMBOL,
  43. UCP_SEPARATOR
  44. };
  45. enum CharacterType
  46. /// Unicode 5.0 character types.
  47. {
  48. UCP_CONTROL,
  49. UCP_FORMAT,
  50. UCP_UNASSIGNED,
  51. UCP_PRIVATE_USE,
  52. UCP_SURROGATE,
  53. UCP_LOWER_CASE_LETTER,
  54. UCP_MODIFIER_LETTER,
  55. UCP_OTHER_LETTER,
  56. UCP_TITLE_CASE_LETTER,
  57. UCP_UPPER_CASE_LETTER,
  58. UCP_SPACING_MARK,
  59. UCP_ENCLOSING_MARK,
  60. UCP_NON_SPACING_MARK,
  61. UCP_DECIMAL_NUMBER,
  62. UCP_LETTER_NUMBER,
  63. UCP_OTHER_NUMBER,
  64. UCP_CONNECTOR_PUNCTUATION,
  65. UCP_DASH_PUNCTUATION,
  66. UCP_CLOSE_PUNCTUATION,
  67. UCP_FINAL_PUNCTUATION,
  68. UCP_INITIAL_PUNCTUATION,
  69. UCP_OTHER_PUNCTUATION,
  70. UCP_OPEN_PUNCTUATION,
  71. UCP_CURRENCY_SYMBOL,
  72. UCP_MODIFIER_SYMBOL,
  73. UCP_MATHEMATICAL_SYMBOL,
  74. UCP_OTHER_SYMBOL,
  75. UCP_LINE_SEPARATOR,
  76. UCP_PARAGRAPH_SEPARATOR,
  77. UCP_SPACE_SEPARATOR
  78. };
  79. enum Script
  80. /// Unicode 5.0 scripts.
  81. {
  82. UCP_ARABIC,
  83. UCP_ARMENIAN,
  84. UCP_BENGALI,
  85. UCP_BOPOMOFO,
  86. UCP_BRAILLE,
  87. UCP_BUGINESE,
  88. UCP_BUHID,
  89. UCP_CANADIAN_ABORIGINAL,
  90. UCP_CHEROKEE,
  91. UCP_COMMON,
  92. UCP_COPTIC,
  93. UCP_CYPRIOT,
  94. UCP_CYRILLIC,
  95. UCP_DESERET,
  96. UCP_DEVANAGARI,
  97. UCP_ETHIOPIC,
  98. UCP_GEORGIAN,
  99. UCP_GLAGOLITIC,
  100. UCP_GOTHIC,
  101. UCP_GREEK,
  102. UCP_GUJARATI,
  103. UCP_GURMUKHI,
  104. UCP_HAN,
  105. UCP_HANGUL,
  106. UCP_HANUNOO,
  107. UCP_HEBREW,
  108. UCP_HIRAGANA,
  109. UCP_INHERITED,
  110. UCP_KANNADA,
  111. UCP_KATAKANA,
  112. UCP_KHAROSHTHI,
  113. UCP_KHMER,
  114. UCP_LAO,
  115. UCP_LATIN,
  116. UCP_LIMBU,
  117. UCP_LINEAR_B,
  118. UCP_MALAYALAM,
  119. UCP_MONGOLIAN,
  120. UCP_MYANMAR,
  121. UCP_NEW_TAI_LUE,
  122. UCP_OGHAM,
  123. UCP_OLD_ITALIC,
  124. UCP_OLD_PERSIAN,
  125. UCP_ORIYA,
  126. UCP_OSMANYA,
  127. UCP_RUNIC,
  128. UCP_SHAVIAN,
  129. UCP_SINHALA,
  130. UCP_SYLOTI_NAGRI,
  131. UCP_SYRIAC,
  132. UCP_TAGALOG,
  133. UCP_TAGBANWA,
  134. UCP_TAI_LE,
  135. UCP_TAMIL,
  136. UCP_TELUGU,
  137. UCP_THAANA,
  138. UCP_THAI,
  139. UCP_TIBETAN,
  140. UCP_TIFINAGH,
  141. UCP_UGARITIC,
  142. UCP_YI,
  143. UCP_BALINESE,
  144. UCP_CUNEIFORM,
  145. UCP_NKO,
  146. UCP_PHAGS_PA,
  147. UCP_PHOENICIAN,
  148. UCP_CARIAN,
  149. UCP_CHAM,
  150. UCP_KAYAH_LI,
  151. UCP_LEPCHA,
  152. UCP_LYCIAN,
  153. UCP_LYDIAN,
  154. UCP_OL_CHIKI,
  155. UCP_REJANG,
  156. UCP_SAURASHTRA,
  157. UCP_SUNDANESE,
  158. UCP_VAI
  159. };
  160. enum
  161. {
  162. UCP_MAX_CODEPOINT = 0x10FFFF
  163. };
  164. struct CharacterProperties
  165. /// This structure holds the character properties
  166. /// of an Unicode character.
  167. {
  168. CharacterCategory category;
  169. CharacterType type;
  170. Script script;
  171. };
  172. static void properties(int ch, CharacterProperties& props);
  173. /// Return the Unicode character properties for the
  174. /// character with the given Unicode value.
  175. static bool isSpace(int ch);
  176. /// Returns true iff the given character is a separator.
  177. static bool isDigit(int ch);
  178. /// Returns true iff the given character is a numeric character.
  179. static bool isPunct(int ch);
  180. /// Returns true iff the given character is a punctuation character.
  181. static bool isAlpha(int ch);
  182. /// Returns true iff the given character is a letter.
  183. static bool isLower(int ch);
  184. /// Returns true iff the given character is a lowercase
  185. /// character.
  186. static bool isUpper(int ch);
  187. /// Returns true iff the given character is an uppercase
  188. /// character.
  189. static int toLower(int ch);
  190. /// If the given character is an uppercase character,
  191. /// return its lowercase counterpart, otherwise return
  192. /// the character.
  193. static int toUpper(int ch);
  194. /// If the given character is a lowercase character,
  195. /// return its uppercase counterpart, otherwise return
  196. /// the character.
  197. };
  //
  // inlines
  //
  201. inline bool Unicode::isSpace(int ch)
  202. {
  203. CharacterProperties props;
  204. properties(ch, props);
  205. return props.category == UCP_SEPARATOR;
  206. }
  207. inline bool Unicode::isDigit(int ch)
  208. {
  209. CharacterProperties props;
  210. properties(ch, props);
  211. return props.category == UCP_NUMBER;
  212. }
  213. inline bool Unicode::isPunct(int ch)
  214. {
  215. CharacterProperties props;
  216. properties(ch, props);
  217. return props.category == UCP_PUNCTUATION;
  218. }
  219. inline bool Unicode::isAlpha(int ch)
  220. {
  221. CharacterProperties props;
  222. properties(ch, props);
  223. return props.category == UCP_LETTER;
  224. }
  225. inline bool Unicode::isLower(int ch)
  226. {
  227. CharacterProperties props;
  228. properties(ch, props);
  229. return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
  230. }
  231. inline bool Unicode::isUpper(int ch)
  232. {
  233. CharacterProperties props;
  234. properties(ch, props);
  235. return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
  236. }
  237. } // namespace Poco
  238. #endif // Foundation_Unicode_INCLUDED