category.cs 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637
  1. //
  2. // assembly: System
  3. // namespace: System.Text.RegularExpressions
  4. // file: category.cs
  5. //
  6. // author: Dan Lewis ([email protected])
  7. // (c) 2002
  8. using System;
  9. using System.Globalization;
  10. namespace System.Text.RegularExpressions {
  // Identifies a character class that the regex engine can test a character against.
  //
  // Values are implicit and sequential (None == 0), stored as ushort, so the
  // declaration order below must not change. The Unicode* member names are also
  // looked up by name in CategoryUtils.CategoryFromName ("Unicode" + the \p{...}
  // name), so renaming a member changes which pattern names are recognised.
  enum Category : ushort {
    None,

    // canonical classes
    Any,                // any character except newline   .
    AnySingleline,      // any character                  . (s option)
    Word,               // any word character             \w
    Digit,              // any digit character            \d
    WhiteSpace,         // any whitespace character       \s

    // ECMAScript classes
    EcmaAny,
    EcmaAnySingleline,
    EcmaWord,           // [a-zA-Z_0-9]
    EcmaDigit,          // [0-9]
    EcmaWhiteSpace,     // [ \f\n\r\t\v]

    // unicode categories
    UnicodeL,           // Letter
    UnicodeM,           // Mark
    UnicodeN,           // Number
    UnicodeZ,           // Separator
    UnicodeP,           // Punctuation
    UnicodeS,           // Symbol
    UnicodeC,           // Other

    UnicodeLu,          // UppercaseLetter
    UnicodeLl,          // LowercaseLetter
    UnicodeLt,          // TitlecaseLetter
    UnicodeLm,          // ModifierLetter
    UnicodeLo,          // OtherLetter

    UnicodeMn,          // NonspacingMark
    UnicodeMe,          // EnclosingMark
    UnicodeMc,          // SpacingMark

    UnicodeNd,          // DecimalNumber
    UnicodeNl,          // LetterNumber
    UnicodeNo,          // OtherNumber

    UnicodeZs,          // SpaceSeparator
    UnicodeZl,          // LineSeparator
    UnicodeZp,          // ParagraphSeparator

    UnicodePd,          // DashPunctuation
    UnicodePs,          // OpenPunctuation
    UnicodePi,          // InitialPunctuation
    UnicodePe,          // ClosePunctuation
    UnicodePf,          // FinalPunctuation
    UnicodePc,          // ConnectorPunctuation
    UnicodePo,          // OtherPunctuation

    UnicodeSm,          // MathSymbol
    UnicodeSc,          // CurrencySymbol
    UnicodeSk,          // ModifierSymbol
    UnicodeSo,          // OtherSymbol

    UnicodeCc,          // Control
    UnicodeCf,          // Format
    UnicodeCo,          // PrivateUse
    UnicodeCs,          // Surrogate
    UnicodeCn,          // Unassigned

    // unicode block ranges
    //
    // notes: the categories marked with a star are valid unicode block ranges,
    // but don't seem to be accepted by the MS parser using the \p{...} format.
    // any ideas?
    UnicodeBasicLatin,
    UnicodeLatin1Supplement,                // *
    UnicodeLatinExtendedA,                  // *
    UnicodeLatinExtendedB,                  // *
    UnicodeIPAExtensions,
    UnicodeSpacingModifierLetters,
    UnicodeCombiningDiacriticalMarks,
    UnicodeGreek,
    UnicodeCyrillic,
    UnicodeArmenian,
    UnicodeHebrew,
    UnicodeArabic,
    UnicodeSyriac,
    UnicodeThaana,
    UnicodeDevanagari,
    UnicodeBengali,
    UnicodeGurmukhi,
    UnicodeGujarati,
    UnicodeOriya,
    UnicodeTamil,
    UnicodeTelugu,
    UnicodeKannada,
    UnicodeMalayalam,
    UnicodeSinhala,
    UnicodeThai,
    UnicodeLao,
    UnicodeTibetan,
    UnicodeMyanmar,
    UnicodeGeorgian,
    UnicodeHangulJamo,
    UnicodeEthiopic,
    UnicodeCherokee,
    UnicodeUnifiedCanadianAboriginalSyllabics,
    UnicodeOgham,
    UnicodeRunic,
    UnicodeKhmer,
    UnicodeMongolian,
    UnicodeLatinExtendedAdditional,
    UnicodeGreekExtended,
    UnicodeGeneralPunctuation,
    UnicodeSuperscriptsandSubscripts,
    UnicodeCurrencySymbols,
    UnicodeCombiningMarksforSymbols,
    UnicodeLetterlikeSymbols,
    UnicodeNumberForms,
    UnicodeArrows,
    UnicodeMathematicalOperators,
    UnicodeMiscellaneousTechnical,
    UnicodeControlPictures,
    UnicodeOpticalCharacterRecognition,
    UnicodeEnclosedAlphanumerics,
    UnicodeBoxDrawing,
    UnicodeBlockElements,
    UnicodeGeometricShapes,
    UnicodeMiscellaneousSymbols,
    UnicodeDingbats,
    UnicodeBraillePatterns,
    UnicodeCJKRadicalsSupplement,
    UnicodeKangxiRadicals,
    UnicodeIdeographicDescriptionCharacters,
    UnicodeCJKSymbolsandPunctuation,
    UnicodeHiragana,
    UnicodeKatakana,
    UnicodeBopomofo,
    UnicodeHangulCompatibilityJamo,
    UnicodeKanbun,
    UnicodeBopomofoExtended,
    UnicodeEnclosedCJKLettersandMonths,
    UnicodeCJKCompatibility,
    UnicodeCJKUnifiedIdeographsExtensionA,
    UnicodeCJKUnifiedIdeographs,
    UnicodeYiSyllables,
    UnicodeYiRadicals,
    UnicodeHangulSyllables,
    UnicodeHighSurrogates,
    UnicodeHighPrivateUseSurrogates,
    UnicodeLowSurrogates,
    UnicodePrivateUse,
    UnicodeCJKCompatibilityIdeographs,
    UnicodeAlphabeticPresentationForms,
    UnicodeArabicPresentationFormsA,        // *
    UnicodeCombiningHalfMarks,
    UnicodeCJKCompatibilityForms,
    UnicodeSmallFormVariants,
    UnicodeArabicPresentationFormsB,        // *
    UnicodeSpecials,
    UnicodeHalfwidthandFullwidthForms,

    // the remaining block ranges begin above U+FFFF
    UnicodeOldItalic,
    UnicodeGothic,
    UnicodeDeseret,
    UnicodeByzantineMusicalSymbols,
    UnicodeMusicalSymbols,
    UnicodeMathematicalAlphanumericSymbols,
    UnicodeCJKUnifiedIdeographsExtensionB,
    UnicodeCJKCompatibilityIdeographsSupplement,
    UnicodeTags
  }
  // Helpers for the Category enum: resolving a \p{...} name to a Category and
  // testing whether a single char belongs to a Category.
  class CategoryUtils {
    // Resolves the name used inside \p{...} to a Category.
    //
    // name: a general category ("L", "Lu", ...) or a block range name, optionally
    //       carrying the "Is" prefix ("IsGreek"). Matching is case-sensitive.
    // Returns the matching Unicode* member, or Category.None when the name is
    // null or does not correspond to a member.
    public static Category CategoryFromName (string name) {
      if (name == null)
        return Category.None;

      // Enum.Parse treats ',' as a list separator and ORs the parts together, which
      // would turn a malformed name into an unrelated Category instead of failing.
      if (name.IndexOf (',') >= 0)
        return Category.None;

      // remove prefix from block range. StartsWith is used rather than
      // Substring (0, 2) because the latter throws for one-letter names such as
      // "L"; that exception was swallowed below and yielded Category.None.
      if (name.StartsWith ("Is", StringComparison.Ordinal))
        name = name.Substring (2);

      try {
        return (Category)Enum.Parse (typeof (Category), "Unicode" + name);
      }
      catch (ArgumentException) {
        // no enum member with that name
        return Category.None;
      }
    }

    // Tests whether c belongs to the character class cat.
    //
    // c is a single UTF-16 code unit, so block ranges that lie entirely above
    // U+FFFF can never match and always return false. Category.None and any
    // value without a case below also return false.
    public static bool IsCategory (Category cat, char c) {
      switch (cat) {
      case Category.None:
        return false;

      // canonical classes
      case Category.Any:
        return c != '\n';
      case Category.AnySingleline:
        return true;
      case Category.Word:
        return
          Char.IsLetterOrDigit (c) ||
          IsCategory (UnicodeCategory.ConnectorPunctuation, c);
      case Category.Digit:
        return Char.IsDigit (c);
      case Category.WhiteSpace:
        return Char.IsWhiteSpace (c);

      // ECMA categories
      case Category.EcmaAny:
        return c != '\n';
      case Category.EcmaAnySingleline:
        return true;
      case Category.EcmaWord:
        return
          'a' <= c && c <= 'z' ||
          'A' <= c && c <= 'Z' ||
          '0' <= c && c <= '9' ||
          '_' == c;
      case Category.EcmaDigit:
        // upper bound must be the character '9'; comparing against the integer 9
        // made this test false for every digit ('0' is already 48)
        return '0' <= c && c <= '9';
      case Category.EcmaWhiteSpace:
        return
          c == ' ' ||
          c == '\f' ||
          c == '\n' ||
          c == '\r' ||
          c == '\t' ||
          c == '\v';

      // Unicode categories...

      // letter
      case Category.UnicodeLu: return IsCategory (UnicodeCategory.UppercaseLetter, c);
      case Category.UnicodeLl: return IsCategory (UnicodeCategory.LowercaseLetter, c);
      case Category.UnicodeLt: return IsCategory (UnicodeCategory.TitlecaseLetter, c);
      case Category.UnicodeLm: return IsCategory (UnicodeCategory.ModifierLetter, c);
      case Category.UnicodeLo: return IsCategory (UnicodeCategory.OtherLetter, c);

      // mark
      case Category.UnicodeMn: return IsCategory (UnicodeCategory.NonSpacingMark, c);
      case Category.UnicodeMe: return IsCategory (UnicodeCategory.EnclosingMark, c);
      case Category.UnicodeMc: return IsCategory (UnicodeCategory.SpacingCombiningMark, c);

      // number
      case Category.UnicodeNd: return IsCategory (UnicodeCategory.DecimalDigitNumber, c);
      case Category.UnicodeNl: return IsCategory (UnicodeCategory.LetterNumber, c);
      case Category.UnicodeNo: return IsCategory (UnicodeCategory.OtherNumber, c);

      // separator
      case Category.UnicodeZs: return IsCategory (UnicodeCategory.SpaceSeparator, c);
      case Category.UnicodeZl: return IsCategory (UnicodeCategory.LineSeparator, c);
      case Category.UnicodeZp: return IsCategory (UnicodeCategory.ParagraphSeparator, c);

      // punctuation
      case Category.UnicodePd: return IsCategory (UnicodeCategory.DashPunctuation, c);
      case Category.UnicodePs: return IsCategory (UnicodeCategory.OpenPunctuation, c);
      case Category.UnicodePi: return IsCategory (UnicodeCategory.InitialQuotePunctuation, c);
      case Category.UnicodePe: return IsCategory (UnicodeCategory.ClosePunctuation, c);
      case Category.UnicodePf: return IsCategory (UnicodeCategory.FinalQuotePunctuation, c);
      case Category.UnicodePc: return IsCategory (UnicodeCategory.ConnectorPunctuation, c);
      case Category.UnicodePo: return IsCategory (UnicodeCategory.OtherPunctuation, c);

      // symbol
      case Category.UnicodeSm: return IsCategory (UnicodeCategory.MathSymbol, c);
      case Category.UnicodeSc: return IsCategory (UnicodeCategory.CurrencySymbol, c);
      case Category.UnicodeSk: return IsCategory (UnicodeCategory.ModifierSymbol, c);
      case Category.UnicodeSo: return IsCategory (UnicodeCategory.OtherSymbol, c);

      // other
      case Category.UnicodeCc: return IsCategory (UnicodeCategory.Control, c);
      case Category.UnicodeCf: return IsCategory (UnicodeCategory.Format, c);
      case Category.UnicodeCo: return IsCategory (UnicodeCategory.PrivateUse, c);
      case Category.UnicodeCs: return IsCategory (UnicodeCategory.Surrogate, c);
      case Category.UnicodeCn: return IsCategory (UnicodeCategory.OtherNotAssigned, c);

      // Major (one-letter) categories: look the character's category up once
      // and test it against every member of the group.

      case Category.UnicodeL: // letter
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.UppercaseLetter:
        case UnicodeCategory.LowercaseLetter:
        case UnicodeCategory.TitlecaseLetter:
        case UnicodeCategory.ModifierLetter:
        case UnicodeCategory.OtherLetter:
          return true;
        default:
          return false;
        }

      case Category.UnicodeM: // mark
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.NonSpacingMark:
        case UnicodeCategory.EnclosingMark:
        case UnicodeCategory.SpacingCombiningMark:
          return true;
        default:
          return false;
        }

      case Category.UnicodeN: // number
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.DecimalDigitNumber:
        case UnicodeCategory.LetterNumber:
        case UnicodeCategory.OtherNumber:
          return true;
        default:
          return false;
        }

      case Category.UnicodeZ: // separator
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.SpaceSeparator:
        case UnicodeCategory.LineSeparator:
        case UnicodeCategory.ParagraphSeparator:
          return true;
        default:
          return false;
        }

      case Category.UnicodeP: // punctuation
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.DashPunctuation:
        case UnicodeCategory.OpenPunctuation:
        case UnicodeCategory.InitialQuotePunctuation:
        case UnicodeCategory.ClosePunctuation:
        case UnicodeCategory.FinalQuotePunctuation:
        case UnicodeCategory.ConnectorPunctuation:
        case UnicodeCategory.OtherPunctuation:
          return true;
        default:
          return false;
        }

      case Category.UnicodeS: // symbol
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.MathSymbol:
        case UnicodeCategory.CurrencySymbol:
        case UnicodeCategory.ModifierSymbol:
        case UnicodeCategory.OtherSymbol:
          return true;
        default:
          return false;
        }

      case Category.UnicodeC: // other
        switch (Char.GetUnicodeCategory (c)) {
        case UnicodeCategory.Control:
        case UnicodeCategory.Format:
        case UnicodeCategory.PrivateUse:
        case UnicodeCategory.Surrogate:
        case UnicodeCategory.OtherNotAssigned:
          return true;
        default:
          return false;
        }

      // Unicode block ranges...
      case Category.UnicodeBasicLatin: return '\u0000' <= c && c <= '\u007F';
      case Category.UnicodeLatin1Supplement: return '\u0080' <= c && c <= '\u00FF';
      case Category.UnicodeLatinExtendedA: return '\u0100' <= c && c <= '\u017F';
      case Category.UnicodeLatinExtendedB: return '\u0180' <= c && c <= '\u024F';
      case Category.UnicodeIPAExtensions: return '\u0250' <= c && c <= '\u02AF';
      case Category.UnicodeSpacingModifierLetters: return '\u02B0' <= c && c <= '\u02FF';
      case Category.UnicodeCombiningDiacriticalMarks: return '\u0300' <= c && c <= '\u036F';
      case Category.UnicodeGreek: return '\u0370' <= c && c <= '\u03FF';
      case Category.UnicodeCyrillic: return '\u0400' <= c && c <= '\u04FF';
      case Category.UnicodeArmenian: return '\u0530' <= c && c <= '\u058F';
      case Category.UnicodeHebrew: return '\u0590' <= c && c <= '\u05FF';
      case Category.UnicodeArabic: return '\u0600' <= c && c <= '\u06FF';
      case Category.UnicodeSyriac: return '\u0700' <= c && c <= '\u074F';
      case Category.UnicodeThaana: return '\u0780' <= c && c <= '\u07BF';
      case Category.UnicodeDevanagari: return '\u0900' <= c && c <= '\u097F';
      case Category.UnicodeBengali: return '\u0980' <= c && c <= '\u09FF';
      case Category.UnicodeGurmukhi: return '\u0A00' <= c && c <= '\u0A7F';
      case Category.UnicodeGujarati: return '\u0A80' <= c && c <= '\u0AFF';
      case Category.UnicodeOriya: return '\u0B00' <= c && c <= '\u0B7F';
      case Category.UnicodeTamil: return '\u0B80' <= c && c <= '\u0BFF';
      case Category.UnicodeTelugu: return '\u0C00' <= c && c <= '\u0C7F';
      case Category.UnicodeKannada: return '\u0C80' <= c && c <= '\u0CFF';
      case Category.UnicodeMalayalam: return '\u0D00' <= c && c <= '\u0D7F';
      case Category.UnicodeSinhala: return '\u0D80' <= c && c <= '\u0DFF';
      case Category.UnicodeThai: return '\u0E00' <= c && c <= '\u0E7F';
      case Category.UnicodeLao: return '\u0E80' <= c && c <= '\u0EFF';
      case Category.UnicodeTibetan: return '\u0F00' <= c && c <= '\u0FFF';
      case Category.UnicodeMyanmar: return '\u1000' <= c && c <= '\u109F';
      case Category.UnicodeGeorgian: return '\u10A0' <= c && c <= '\u10FF';
      case Category.UnicodeHangulJamo: return '\u1100' <= c && c <= '\u11FF';
      case Category.UnicodeEthiopic: return '\u1200' <= c && c <= '\u137F';
      case Category.UnicodeCherokee: return '\u13A0' <= c && c <= '\u13FF';
      case Category.UnicodeUnifiedCanadianAboriginalSyllabics: return '\u1400' <= c && c <= '\u167F';
      case Category.UnicodeOgham: return '\u1680' <= c && c <= '\u169F';
      case Category.UnicodeRunic: return '\u16A0' <= c && c <= '\u16FF';
      case Category.UnicodeKhmer: return '\u1780' <= c && c <= '\u17FF';
      case Category.UnicodeMongolian: return '\u1800' <= c && c <= '\u18AF';
      case Category.UnicodeLatinExtendedAdditional: return '\u1E00' <= c && c <= '\u1EFF';
      case Category.UnicodeGreekExtended: return '\u1F00' <= c && c <= '\u1FFF';
      case Category.UnicodeGeneralPunctuation: return '\u2000' <= c && c <= '\u206F';
      case Category.UnicodeSuperscriptsandSubscripts: return '\u2070' <= c && c <= '\u209F';
      case Category.UnicodeCurrencySymbols: return '\u20A0' <= c && c <= '\u20CF';
      case Category.UnicodeCombiningMarksforSymbols: return '\u20D0' <= c && c <= '\u20FF';
      case Category.UnicodeLetterlikeSymbols: return '\u2100' <= c && c <= '\u214F';
      case Category.UnicodeNumberForms: return '\u2150' <= c && c <= '\u218F';
      case Category.UnicodeArrows: return '\u2190' <= c && c <= '\u21FF';
      case Category.UnicodeMathematicalOperators: return '\u2200' <= c && c <= '\u22FF';
      case Category.UnicodeMiscellaneousTechnical: return '\u2300' <= c && c <= '\u23FF';
      case Category.UnicodeControlPictures: return '\u2400' <= c && c <= '\u243F';
      case Category.UnicodeOpticalCharacterRecognition: return '\u2440' <= c && c <= '\u245F';
      case Category.UnicodeEnclosedAlphanumerics: return '\u2460' <= c && c <= '\u24FF';
      case Category.UnicodeBoxDrawing: return '\u2500' <= c && c <= '\u257F';
      case Category.UnicodeBlockElements: return '\u2580' <= c && c <= '\u259F';
      case Category.UnicodeGeometricShapes: return '\u25A0' <= c && c <= '\u25FF';
      case Category.UnicodeMiscellaneousSymbols: return '\u2600' <= c && c <= '\u26FF';
      case Category.UnicodeDingbats: return '\u2700' <= c && c <= '\u27BF';
      case Category.UnicodeBraillePatterns: return '\u2800' <= c && c <= '\u28FF';
      case Category.UnicodeCJKRadicalsSupplement: return '\u2E80' <= c && c <= '\u2EFF';
      case Category.UnicodeKangxiRadicals: return '\u2F00' <= c && c <= '\u2FDF';
      case Category.UnicodeIdeographicDescriptionCharacters: return '\u2FF0' <= c && c <= '\u2FFF';
      case Category.UnicodeCJKSymbolsandPunctuation: return '\u3000' <= c && c <= '\u303F';
      case Category.UnicodeHiragana: return '\u3040' <= c && c <= '\u309F';
      case Category.UnicodeKatakana: return '\u30A0' <= c && c <= '\u30FF';
      case Category.UnicodeBopomofo: return '\u3100' <= c && c <= '\u312F';
      case Category.UnicodeHangulCompatibilityJamo: return '\u3130' <= c && c <= '\u318F';
      case Category.UnicodeKanbun: return '\u3190' <= c && c <= '\u319F';
      case Category.UnicodeBopomofoExtended: return '\u31A0' <= c && c <= '\u31BF';
      case Category.UnicodeEnclosedCJKLettersandMonths: return '\u3200' <= c && c <= '\u32FF';
      case Category.UnicodeCJKCompatibility: return '\u3300' <= c && c <= '\u33FF';
      case Category.UnicodeCJKUnifiedIdeographsExtensionA: return '\u3400' <= c && c <= '\u4DB5';
      case Category.UnicodeCJKUnifiedIdeographs: return '\u4E00' <= c && c <= '\u9FFF';
      case Category.UnicodeYiSyllables: return '\uA000' <= c && c <= '\uA48F';
      case Category.UnicodeYiRadicals: return '\uA490' <= c && c <= '\uA4CF';
      case Category.UnicodeHangulSyllables: return '\uAC00' <= c && c <= '\uD7A3';
      case Category.UnicodeHighSurrogates: return '\uD800' <= c && c <= '\uDB7F';
      case Category.UnicodeHighPrivateUseSurrogates: return '\uDB80' <= c && c <= '\uDBFF';
      case Category.UnicodeLowSurrogates: return '\uDC00' <= c && c <= '\uDFFF';
      case Category.UnicodePrivateUse: return '\uE000' <= c && c <= '\uF8FF';
      case Category.UnicodeCJKCompatibilityIdeographs: return '\uF900' <= c && c <= '\uFAFF';
      case Category.UnicodeAlphabeticPresentationForms: return '\uFB00' <= c && c <= '\uFB4F';
      case Category.UnicodeArabicPresentationFormsA: return '\uFB50' <= c && c <= '\uFDFF';
      case Category.UnicodeCombiningHalfMarks: return '\uFE20' <= c && c <= '\uFE2F';
      case Category.UnicodeCJKCompatibilityForms: return '\uFE30' <= c && c <= '\uFE4F';
      case Category.UnicodeSmallFormVariants: return '\uFE50' <= c && c <= '\uFE6F';
      case Category.UnicodeArabicPresentationFormsB: return '\uFE70' <= c && c <= '\uFEFE';
      case Category.UnicodeHalfwidthandFullwidthForms: return '\uFF00' <= c && c <= '\uFFEF';
      case Category.UnicodeSpecials:
        // the block is split: U+FEFF on its own, plus U+FFF0..U+FFFD
        return
          c == '\uFEFF' ||
          '\uFFF0' <= c && c <= '\uFFFD';

      // these block ranges begin above 0x10000, which a single char cannot hold
      case Category.UnicodeOldItalic:
      case Category.UnicodeGothic:
      case Category.UnicodeDeseret:
      case Category.UnicodeByzantineMusicalSymbols:
      case Category.UnicodeMusicalSymbols:
      case Category.UnicodeMathematicalAlphanumericSymbols:
      case Category.UnicodeCJKUnifiedIdeographsExtensionB:
      case Category.UnicodeCJKCompatibilityIdeographsSupplement:
      case Category.UnicodeTags:
        return false;

      default:
        return false;
      }
    }

    // True when the Unicode general category of c is exactly uc.
    private static bool IsCategory (UnicodeCategory uc, char c) {
      return Char.GetUnicodeCategory (c) == uc;
    }
  }
  493. }