category.cs 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: category.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. //
  9. // Permission is hereby granted, free of charge, to any person obtaining
  10. // a copy of this software and associated documentation files (the
  11. // "Software"), to deal in the Software without restriction, including
  12. // without limitation the rights to use, copy, modify, merge, publish,
  13. // distribute, sublicense, and/or sell copies of the Software, and to
  14. // permit persons to whom the Software is furnished to do so, subject to
  15. // the following conditions:
  16. //
  17. // The above copyright notice and this permission notice shall be
  18. // included in all copies or substantial portions of the Software.
  19. //
  20. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  21. // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  22. // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  23. // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  24. // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  25. // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  26. // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  27. //
  28. using System;
  29. using System.Globalization;
  30. namespace System.Text.RegularExpressions {
  // Identifies a character class understood by the regular-expression engine.
  //
  // Members rely on their implicit, declaration-order numeric values, so new
  // members must not be inserted in a way that renumbers existing ones, and
  // LastValue must stay at the end.
  enum Category : ushort {
    None,

    // ---- canonical classes ----
    Any,                // .   any character except newline
    AnySingleline,      // .   any character (s option)
    Word,               // \w  any word character
    Digit,              // \d  any digit character
    WhiteSpace,         // \s  any whitespace character

    // ---- ECMAScript classes ----
    EcmaAny,
    EcmaAnySingleline,
    EcmaWord,           // [a-zA-Z_0-9]
    EcmaDigit,          // [0-9]
    EcmaWhiteSpace,     // [ \f\n\r\t\v]

    // ---- Unicode general categories (major classes) ----
    UnicodeL,           // Letter
    UnicodeM,           // Mark
    UnicodeN,           // Number
    UnicodeZ,           // Separator
    UnicodeP,           // Punctuation
    UnicodeS,           // Symbol
    UnicodeC,           // Other

    // ---- Unicode general categories (subclasses) ----
    UnicodeLu,          // UppercaseLetter
    UnicodeLl,          // LowercaseLetter
    UnicodeLt,          // TitlecaseLetter
    UnicodeLm,          // ModifierLetter
    UnicodeLo,          // OtherLetter

    UnicodeMn,          // NonspacingMark
    UnicodeMe,          // EnclosingMark
    UnicodeMc,          // SpacingMark

    UnicodeNd,          // DecimalNumber
    UnicodeNl,          // LetterNumber
    UnicodeNo,          // OtherNumber

    UnicodeZs,          // SpaceSeparator
    UnicodeZl,          // LineSeparator
    UnicodeZp,          // ParagraphSeparator

    UnicodePd,          // DashPunctuation
    UnicodePs,          // OpenPunctuation
    UnicodePi,          // InitialPunctuation
    UnicodePe,          // ClosePunctuation
    UnicodePf,          // FinalPunctuation
    UnicodePc,          // ConnectorPunctuation
    UnicodePo,          // OtherPunctuation

    UnicodeSm,          // MathSymbol
    UnicodeSc,          // CurrencySymbol
    UnicodeSk,          // ModifierSymbol
    UnicodeSo,          // OtherSymbol

    UnicodeCc,          // Control
    UnicodeCf,          // Format
    UnicodeCo,          // PrivateUse
    UnicodeCs,          // Surrogate
    UnicodeCn,          // Unassigned

    // ---- Unicode block ranges ----
    // Note: members marked with a star (*) are valid Unicode block ranges,
    // but do not seem to be accepted by the MS parser when written in the
    // \p{...} form. The reason is unknown.
    UnicodeBasicLatin,
    UnicodeLatin1Supplement,                        // *
    UnicodeLatinExtendedA,                          // *
    UnicodeLatinExtendedB,                          // *
    UnicodeIPAExtensions,
    UnicodeSpacingModifierLetters,
    UnicodeCombiningDiacriticalMarks,
    UnicodeGreek,
    UnicodeCyrillic,
    UnicodeArmenian,
    UnicodeHebrew,
    UnicodeArabic,
    UnicodeSyriac,
    UnicodeThaana,
    UnicodeDevanagari,
    UnicodeBengali,
    UnicodeGurmukhi,
    UnicodeGujarati,
    UnicodeOriya,
    UnicodeTamil,
    UnicodeTelugu,
    UnicodeKannada,
    UnicodeMalayalam,
    UnicodeSinhala,
    UnicodeThai,
    UnicodeLao,
    UnicodeTibetan,
    UnicodeMyanmar,
    UnicodeGeorgian,
    UnicodeHangulJamo,
    UnicodeEthiopic,
    UnicodeCherokee,
    UnicodeUnifiedCanadianAboriginalSyllabics,
    UnicodeOgham,
    UnicodeRunic,
    UnicodeKhmer,
    UnicodeMongolian,
    UnicodeLatinExtendedAdditional,
    UnicodeGreekExtended,
    UnicodeGeneralPunctuation,
    UnicodeSuperscriptsandSubscripts,
    UnicodeCurrencySymbols,
    UnicodeCombiningMarksforSymbols,
    UnicodeLetterlikeSymbols,
    UnicodeNumberForms,
    UnicodeArrows,
    UnicodeMathematicalOperators,
    UnicodeMiscellaneousTechnical,
    UnicodeControlPictures,
    UnicodeOpticalCharacterRecognition,
    UnicodeEnclosedAlphanumerics,
    UnicodeBoxDrawing,
    UnicodeBlockElements,
    UnicodeGeometricShapes,
    UnicodeMiscellaneousSymbols,
    UnicodeDingbats,
    UnicodeBraillePatterns,
    UnicodeCJKRadicalsSupplement,
    UnicodeKangxiRadicals,
    UnicodeIdeographicDescriptionCharacters,
    UnicodeCJKSymbolsandPunctuation,
    UnicodeHiragana,
    UnicodeKatakana,
    UnicodeBopomofo,
    UnicodeHangulCompatibilityJamo,
    UnicodeKanbun,
    UnicodeBopomofoExtended,
    UnicodeEnclosedCJKLettersandMonths,
    UnicodeCJKCompatibility,
    UnicodeCJKUnifiedIdeographsExtensionA,
    UnicodeCJKUnifiedIdeographs,
    UnicodeYiSyllables,
    UnicodeYiRadicals,
    UnicodeHangulSyllables,
    UnicodeHighSurrogates,
    UnicodeHighPrivateUseSurrogates,
    UnicodeLowSurrogates,
    UnicodePrivateUse,
    UnicodeCJKCompatibilityIdeographs,
    UnicodeAlphabeticPresentationForms,
    UnicodeArabicPresentationFormsA,                // *
    UnicodeCombiningHalfMarks,
    UnicodeCJKCompatibilityForms,
    UnicodeSmallFormVariants,
    UnicodeArabicPresentationFormsB,                // *
    UnicodeSpecials,
    UnicodeHalfwidthandFullwidthForms,
    UnicodeOldItalic,
    UnicodeGothic,
    UnicodeDeseret,
    UnicodeByzantineMusicalSymbols,
    UnicodeMusicalSymbols,
    UnicodeMathematicalAlphanumericSymbols,
    UnicodeCJKUnifiedIdeographsExtensionB,
    UnicodeCJKCompatibilityIdeographsSupplement,
    UnicodeTags,

    // Sentinel: keep this as the final member so that it always carries the
    // highest value in the enumeration.
    LastValue
  }
  // Utility routines for the Category enumeration: resolving a category or
  // block name to a Category value, and testing whether a single char
  // (one UTF-16 code unit) belongs to a given Category.
  class CategoryUtils {
    // Resolves a Unicode category or block-range name to its Category member.
    //
    // name: the bare name, e.g. "Lu" or "Greek". A leading "Is" (as in
    //       "IsGreek") is stripped before the lookup.
    //
    // Returns the member called "Unicode" + name, or Category.None when
    // name is null or no member has exactly that name. The match is
    // case-sensitive.
    public static Category CategoryFromName (string name) {
      if (name == null)
        return Category.None;

      // Remove the "Is" prefix used for block ranges. The comparison is
      // done per character so that it does not depend on the current culture.
      if (name.Length >= 2 && name [0] == 'I' && name [1] == 's')
        name = name.Substring (2);

      string member = "Unicode" + name;

      // Enum.IsDefined performs an exact name match. Checking it first
      // avoids using an exception for the ordinary "unknown name" case and
      // rejects input that Enum.Parse alone would tolerate, such as
      // comma-separated lists or surrounding whitespace.
      if (!Enum.IsDefined (typeof (Category), member))
        return Category.None;

      return (Category) Enum.Parse (typeof (Category), member);
    }

    // Tests whether the character c is a member of the category cat.
    //
    // Returns false for Category.None, for the block ranges that lie above
    // U+FFFF (a single char cannot represent those code points), and for any
    // value that has no case below.
    public static bool IsCategory (Category cat, char c) {
      switch (cat) {
      case Category.None:
        return false;

      // canonical classes

      case Category.Any:
        return c != '\n';

      case Category.AnySingleline:
        return true;

      case Category.Word:
        return
          Char.IsLetterOrDigit (c) ||
          IsCategory (UnicodeCategory.ConnectorPunctuation, c);

      case Category.Digit:
        return Char.IsDigit (c);

      case Category.WhiteSpace:
        return Char.IsWhiteSpace (c);

      // ECMA categories

      case Category.EcmaAny:
        return c != '\n';

      case Category.EcmaAnySingleline:
        return true;

      case Category.EcmaWord:
        return
          InRange (c, 'a', 'z') ||
          InRange (c, 'A', 'Z') ||
          InRange (c, '0', '9') ||
          c == '_';

      case Category.EcmaDigit:
        // The upper bound must be the character '9', not the integer 9;
        // with the integer the test could never succeed.
        return InRange (c, '0', '9');

      case Category.EcmaWhiteSpace:
        return
          c == ' ' ||
          c == '\f' ||
          c == '\n' ||
          c == '\r' ||
          c == '\t' ||
          c == '\v';

      // Unicode categories...

      // letter
      case Category.UnicodeLu: return IsCategory (UnicodeCategory.UppercaseLetter, c);
      case Category.UnicodeLl: return IsCategory (UnicodeCategory.LowercaseLetter, c);
      case Category.UnicodeLt: return IsCategory (UnicodeCategory.TitlecaseLetter, c);
      case Category.UnicodeLm: return IsCategory (UnicodeCategory.ModifierLetter, c);
      case Category.UnicodeLo: return IsCategory (UnicodeCategory.OtherLetter, c);

      // mark
      case Category.UnicodeMn: return IsCategory (UnicodeCategory.NonSpacingMark, c);
      case Category.UnicodeMe: return IsCategory (UnicodeCategory.EnclosingMark, c);
      case Category.UnicodeMc: return IsCategory (UnicodeCategory.SpacingCombiningMark, c);

      // number
      case Category.UnicodeNd: return IsCategory (UnicodeCategory.DecimalDigitNumber, c);
      case Category.UnicodeNl: return IsCategory (UnicodeCategory.LetterNumber, c);
      case Category.UnicodeNo: return IsCategory (UnicodeCategory.OtherNumber, c);

      // separator
      case Category.UnicodeZs: return IsCategory (UnicodeCategory.SpaceSeparator, c);
      case Category.UnicodeZl: return IsCategory (UnicodeCategory.LineSeparator, c);
      case Category.UnicodeZp: return IsCategory (UnicodeCategory.ParagraphSeparator, c);

      // punctuation
      case Category.UnicodePd: return IsCategory (UnicodeCategory.DashPunctuation, c);
      case Category.UnicodePs: return IsCategory (UnicodeCategory.OpenPunctuation, c);
      case Category.UnicodePi: return IsCategory (UnicodeCategory.InitialQuotePunctuation, c);
      case Category.UnicodePe: return IsCategory (UnicodeCategory.ClosePunctuation, c);
      case Category.UnicodePf: return IsCategory (UnicodeCategory.FinalQuotePunctuation, c);
      case Category.UnicodePc: return IsCategory (UnicodeCategory.ConnectorPunctuation, c);
      case Category.UnicodePo: return IsCategory (UnicodeCategory.OtherPunctuation, c);

      // symbol
      case Category.UnicodeSm: return IsCategory (UnicodeCategory.MathSymbol, c);
      case Category.UnicodeSc: return IsCategory (UnicodeCategory.CurrencySymbol, c);
      case Category.UnicodeSk: return IsCategory (UnicodeCategory.ModifierSymbol, c);
      case Category.UnicodeSo: return IsCategory (UnicodeCategory.OtherSymbol, c);

      // other
      case Category.UnicodeCc: return IsCategory (UnicodeCategory.Control, c);
      case Category.UnicodeCf: return IsCategory (UnicodeCategory.Format, c);
      case Category.UnicodeCo: return IsCategory (UnicodeCategory.PrivateUse, c);
      case Category.UnicodeCs: return IsCategory (UnicodeCategory.Surrogate, c);
      case Category.UnicodeCn: return IsCategory (UnicodeCategory.OtherNotAssigned, c);

      // major classes: the union of the corresponding subclasses

      case Category.UnicodeL: // letter
        return
          IsCategory (UnicodeCategory.UppercaseLetter, c) ||
          IsCategory (UnicodeCategory.LowercaseLetter, c) ||
          IsCategory (UnicodeCategory.TitlecaseLetter, c) ||
          IsCategory (UnicodeCategory.ModifierLetter, c) ||
          IsCategory (UnicodeCategory.OtherLetter, c);

      case Category.UnicodeM: // mark
        return
          IsCategory (UnicodeCategory.NonSpacingMark, c) ||
          IsCategory (UnicodeCategory.EnclosingMark, c) ||
          IsCategory (UnicodeCategory.SpacingCombiningMark, c);

      case Category.UnicodeN: // number
        return
          IsCategory (UnicodeCategory.DecimalDigitNumber, c) ||
          IsCategory (UnicodeCategory.LetterNumber, c) ||
          IsCategory (UnicodeCategory.OtherNumber, c);

      case Category.UnicodeZ: // separator
        return
          IsCategory (UnicodeCategory.SpaceSeparator, c) ||
          IsCategory (UnicodeCategory.LineSeparator, c) ||
          IsCategory (UnicodeCategory.ParagraphSeparator, c);

      case Category.UnicodeP: // punctuation
        return
          IsCategory (UnicodeCategory.DashPunctuation, c) ||
          IsCategory (UnicodeCategory.OpenPunctuation, c) ||
          IsCategory (UnicodeCategory.InitialQuotePunctuation, c) ||
          IsCategory (UnicodeCategory.ClosePunctuation, c) ||
          IsCategory (UnicodeCategory.FinalQuotePunctuation, c) ||
          IsCategory (UnicodeCategory.ConnectorPunctuation, c) ||
          IsCategory (UnicodeCategory.OtherPunctuation, c);

      case Category.UnicodeS: // symbol
        return
          IsCategory (UnicodeCategory.MathSymbol, c) ||
          IsCategory (UnicodeCategory.CurrencySymbol, c) ||
          IsCategory (UnicodeCategory.ModifierSymbol, c) ||
          IsCategory (UnicodeCategory.OtherSymbol, c);

      case Category.UnicodeC: // other
        return
          IsCategory (UnicodeCategory.Control, c) ||
          IsCategory (UnicodeCategory.Format, c) ||
          IsCategory (UnicodeCategory.PrivateUse, c) ||
          IsCategory (UnicodeCategory.Surrogate, c) ||
          IsCategory (UnicodeCategory.OtherNotAssigned, c);

      // Unicode block ranges...

      case Category.UnicodeBasicLatin:
        return InRange (c, '\u0000', '\u007F');
      case Category.UnicodeLatin1Supplement:
        return InRange (c, '\u0080', '\u00FF');
      case Category.UnicodeLatinExtendedA:
        return InRange (c, '\u0100', '\u017F');
      case Category.UnicodeLatinExtendedB:
        return InRange (c, '\u0180', '\u024F');
      case Category.UnicodeIPAExtensions:
        return InRange (c, '\u0250', '\u02AF');
      case Category.UnicodeSpacingModifierLetters:
        return InRange (c, '\u02B0', '\u02FF');
      case Category.UnicodeCombiningDiacriticalMarks:
        return InRange (c, '\u0300', '\u036F');
      case Category.UnicodeGreek:
        return InRange (c, '\u0370', '\u03FF');
      case Category.UnicodeCyrillic:
        return InRange (c, '\u0400', '\u04FF');
      case Category.UnicodeArmenian:
        return InRange (c, '\u0530', '\u058F');
      case Category.UnicodeHebrew:
        return InRange (c, '\u0590', '\u05FF');
      case Category.UnicodeArabic:
        return InRange (c, '\u0600', '\u06FF');
      case Category.UnicodeSyriac:
        return InRange (c, '\u0700', '\u074F');
      case Category.UnicodeThaana:
        return InRange (c, '\u0780', '\u07BF');
      case Category.UnicodeDevanagari:
        return InRange (c, '\u0900', '\u097F');
      case Category.UnicodeBengali:
        return InRange (c, '\u0980', '\u09FF');
      case Category.UnicodeGurmukhi:
        return InRange (c, '\u0A00', '\u0A7F');
      case Category.UnicodeGujarati:
        return InRange (c, '\u0A80', '\u0AFF');
      case Category.UnicodeOriya:
        return InRange (c, '\u0B00', '\u0B7F');
      case Category.UnicodeTamil:
        return InRange (c, '\u0B80', '\u0BFF');
      case Category.UnicodeTelugu:
        return InRange (c, '\u0C00', '\u0C7F');
      case Category.UnicodeKannada:
        return InRange (c, '\u0C80', '\u0CFF');
      case Category.UnicodeMalayalam:
        return InRange (c, '\u0D00', '\u0D7F');
      case Category.UnicodeSinhala:
        return InRange (c, '\u0D80', '\u0DFF');
      case Category.UnicodeThai:
        return InRange (c, '\u0E00', '\u0E7F');
      case Category.UnicodeLao:
        return InRange (c, '\u0E80', '\u0EFF');
      case Category.UnicodeTibetan:
        return InRange (c, '\u0F00', '\u0FFF');
      case Category.UnicodeMyanmar:
        return InRange (c, '\u1000', '\u109F');
      case Category.UnicodeGeorgian:
        return InRange (c, '\u10A0', '\u10FF');
      case Category.UnicodeHangulJamo:
        return InRange (c, '\u1100', '\u11FF');
      case Category.UnicodeEthiopic:
        return InRange (c, '\u1200', '\u137F');
      case Category.UnicodeCherokee:
        return InRange (c, '\u13A0', '\u13FF');
      case Category.UnicodeUnifiedCanadianAboriginalSyllabics:
        return InRange (c, '\u1400', '\u167F');
      case Category.UnicodeOgham:
        return InRange (c, '\u1680', '\u169F');
      case Category.UnicodeRunic:
        return InRange (c, '\u16A0', '\u16FF');
      case Category.UnicodeKhmer:
        return InRange (c, '\u1780', '\u17FF');
      case Category.UnicodeMongolian:
        return InRange (c, '\u1800', '\u18AF');
      case Category.UnicodeLatinExtendedAdditional:
        return InRange (c, '\u1E00', '\u1EFF');
      case Category.UnicodeGreekExtended:
        return InRange (c, '\u1F00', '\u1FFF');
      case Category.UnicodeGeneralPunctuation:
        return InRange (c, '\u2000', '\u206F');
      case Category.UnicodeSuperscriptsandSubscripts:
        return InRange (c, '\u2070', '\u209F');
      case Category.UnicodeCurrencySymbols:
        return InRange (c, '\u20A0', '\u20CF');
      case Category.UnicodeCombiningMarksforSymbols:
        return InRange (c, '\u20D0', '\u20FF');
      case Category.UnicodeLetterlikeSymbols:
        return InRange (c, '\u2100', '\u214F');
      case Category.UnicodeNumberForms:
        return InRange (c, '\u2150', '\u218F');
      case Category.UnicodeArrows:
        return InRange (c, '\u2190', '\u21FF');
      case Category.UnicodeMathematicalOperators:
        return InRange (c, '\u2200', '\u22FF');
      case Category.UnicodeMiscellaneousTechnical:
        return InRange (c, '\u2300', '\u23FF');
      case Category.UnicodeControlPictures:
        return InRange (c, '\u2400', '\u243F');
      case Category.UnicodeOpticalCharacterRecognition:
        return InRange (c, '\u2440', '\u245F');
      case Category.UnicodeEnclosedAlphanumerics:
        return InRange (c, '\u2460', '\u24FF');
      case Category.UnicodeBoxDrawing:
        return InRange (c, '\u2500', '\u257F');
      case Category.UnicodeBlockElements:
        return InRange (c, '\u2580', '\u259F');
      case Category.UnicodeGeometricShapes:
        return InRange (c, '\u25A0', '\u25FF');
      case Category.UnicodeMiscellaneousSymbols:
        return InRange (c, '\u2600', '\u26FF');
      case Category.UnicodeDingbats:
        return InRange (c, '\u2700', '\u27BF');
      case Category.UnicodeBraillePatterns:
        return InRange (c, '\u2800', '\u28FF');
      case Category.UnicodeCJKRadicalsSupplement:
        return InRange (c, '\u2E80', '\u2EFF');
      case Category.UnicodeKangxiRadicals:
        return InRange (c, '\u2F00', '\u2FDF');
      case Category.UnicodeIdeographicDescriptionCharacters:
        return InRange (c, '\u2FF0', '\u2FFF');
      case Category.UnicodeCJKSymbolsandPunctuation:
        return InRange (c, '\u3000', '\u303F');
      case Category.UnicodeHiragana:
        return InRange (c, '\u3040', '\u309F');
      case Category.UnicodeKatakana:
        return InRange (c, '\u30A0', '\u30FF');
      case Category.UnicodeBopomofo:
        return InRange (c, '\u3100', '\u312F');
      case Category.UnicodeHangulCompatibilityJamo:
        return InRange (c, '\u3130', '\u318F');
      case Category.UnicodeKanbun:
        return InRange (c, '\u3190', '\u319F');
      case Category.UnicodeBopomofoExtended:
        return InRange (c, '\u31A0', '\u31BF');
      case Category.UnicodeEnclosedCJKLettersandMonths:
        return InRange (c, '\u3200', '\u32FF');
      case Category.UnicodeCJKCompatibility:
        return InRange (c, '\u3300', '\u33FF');
      case Category.UnicodeCJKUnifiedIdeographsExtensionA:
        return InRange (c, '\u3400', '\u4DB5');
      case Category.UnicodeCJKUnifiedIdeographs:
        return InRange (c, '\u4E00', '\u9FFF');
      case Category.UnicodeYiSyllables:
        return InRange (c, '\uA000', '\uA48F');
      case Category.UnicodeYiRadicals:
        return InRange (c, '\uA490', '\uA4CF');
      case Category.UnicodeHangulSyllables:
        return InRange (c, '\uAC00', '\uD7A3');
      case Category.UnicodeHighSurrogates:
        return InRange (c, '\uD800', '\uDB7F');
      case Category.UnicodeHighPrivateUseSurrogates:
        return InRange (c, '\uDB80', '\uDBFF');
      case Category.UnicodeLowSurrogates:
        return InRange (c, '\uDC00', '\uDFFF');
      case Category.UnicodePrivateUse:
        return InRange (c, '\uE000', '\uF8FF');
      case Category.UnicodeCJKCompatibilityIdeographs:
        return InRange (c, '\uF900', '\uFAFF');
      case Category.UnicodeAlphabeticPresentationForms:
        return InRange (c, '\uFB00', '\uFB4F');
      case Category.UnicodeArabicPresentationFormsA:
        return InRange (c, '\uFB50', '\uFDFF');
      case Category.UnicodeCombiningHalfMarks:
        return InRange (c, '\uFE20', '\uFE2F');
      case Category.UnicodeCJKCompatibilityForms:
        return InRange (c, '\uFE30', '\uFE4F');
      case Category.UnicodeSmallFormVariants:
        return InRange (c, '\uFE50', '\uFE6F');
      case Category.UnicodeArabicPresentationFormsB:
        return InRange (c, '\uFE70', '\uFEFE');
      case Category.UnicodeHalfwidthandFullwidthForms:
        return InRange (c, '\uFF00', '\uFFEF');
      case Category.UnicodeSpecials:
        // this block is split in two: U+FEFF alone, plus U+FFF0..U+FFFD
        return
          c == '\uFEFF' ||
          InRange (c, '\uFFF0', '\uFFFD');

      // these block ranges begin above 0x10000, so no single char can
      // fall inside them
      case Category.UnicodeOldItalic:
      case Category.UnicodeGothic:
      case Category.UnicodeDeseret:
      case Category.UnicodeByzantineMusicalSymbols:
      case Category.UnicodeMusicalSymbols:
      case Category.UnicodeMathematicalAlphanumericSymbols:
      case Category.UnicodeCJKUnifiedIdeographsExtensionB:
      case Category.UnicodeCJKCompatibilityIdeographsSupplement:
      case Category.UnicodeTags:
        return false;

      default:
        return false;
      }
    }

    // Returns true when the Unicode general category reported for c by
    // Char.GetUnicodeCategory is exactly uc.
    private static bool IsCategory (UnicodeCategory uc, char c) {
      return Char.GetUnicodeCategory (c) == uc;
    }

    // Returns true when c lies in the inclusive range first..last.
    private static bool InRange (char c, char first, char last) {
      return first <= c && c <= last;
    }
  }
  514. }