| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637 |
- //
- // assembly: System
- // namespace: System.Text.RegularExpressions
- // file: category.cs
- //
- // author: Dan Lewis ([email protected])
- // (c) 2002
- using System;
- using System.Globalization;
- namespace System.Text.RegularExpressions {
// Character classes that CategoryUtils.IsCategory can test a char against.
//
// Members are numbered implicitly, in declaration order, starting at
// None = 0; reordering or inserting members changes their numeric values.
// All Unicode* member names are also looked up by name (via Enum.Parse in
// CategoryUtils.CategoryFromName), so renaming one changes what is accepted.
enum Category : ushort {
    None = 0,

    // ---- canonical classes ----

    Any,                    // .   any character except newline
    AnySingleline,          // .   any character (s option)
    Word,                   // \w  any word character
    Digit,                  // \d  any digit character
    WhiteSpace,             // \s  any whitespace character

    // ---- ECMAScript classes ----

    EcmaAny,
    EcmaAnySingleline,
    EcmaWord,               // [a-zA-Z_0-9]
    EcmaDigit,              // [0-9]
    EcmaWhiteSpace,         // [ \f\n\r\t\v]

    // ---- Unicode general categories: major classes ----

    UnicodeL,               // Letter
    UnicodeM,               // Mark
    UnicodeN,               // Number
    UnicodeZ,               // Separator
    UnicodeP,               // Punctuation
    UnicodeS,               // Symbol
    UnicodeC,               // Other

    // ---- Unicode general categories: subclasses ----

    UnicodeLu,              // UppercaseLetter
    UnicodeLl,              // LowercaseLetter
    UnicodeLt,              // TitlecaseLetter
    UnicodeLm,              // ModifierLetter
    UnicodeLo,              // OtherLetter
    UnicodeMn,              // NonspacingMark
    UnicodeMe,              // EnclosingMark
    UnicodeMc,              // SpacingMark
    UnicodeNd,              // DecimalNumber
    UnicodeNl,              // LetterNumber
    UnicodeNo,              // OtherNumber
    UnicodeZs,              // SpaceSeparator
    UnicodeZl,              // LineSeparator
    UnicodeZp,              // ParagraphSeparator
    UnicodePd,              // DashPunctuation
    UnicodePs,              // OpenPunctuation
    UnicodePi,              // InitialPunctuation
    UnicodePe,              // ClosePunctuation
    UnicodePf,              // FinalPunctuation
    UnicodePc,              // ConnectorPunctuation
    UnicodePo,              // OtherPunctuation
    UnicodeSm,              // MathSymbol
    UnicodeSc,              // CurrencySymbol
    UnicodeSk,              // ModifierSymbol
    UnicodeSo,              // OtherSymbol
    UnicodeCc,              // Control
    UnicodeCf,              // Format
    UnicodeCo,              // PrivateUse
    UnicodeCs,              // Surrogate
    UnicodeCn,              // Unassigned

    // ---- Unicode block ranges ----
    //
    // Note: entries marked (*) are valid Unicode block ranges, but they do
    // not seem to be accepted by the MS parser in the \p{...} format.
    // The reason is unknown.

    UnicodeBasicLatin,
    UnicodeLatin1Supplement,                // (*)
    UnicodeLatinExtendedA,                  // (*)
    UnicodeLatinExtendedB,                  // (*)
    UnicodeIPAExtensions,
    UnicodeSpacingModifierLetters,
    UnicodeCombiningDiacriticalMarks,
    UnicodeGreek,
    UnicodeCyrillic,
    UnicodeArmenian,
    UnicodeHebrew,
    UnicodeArabic,
    UnicodeSyriac,
    UnicodeThaana,
    UnicodeDevanagari,
    UnicodeBengali,
    UnicodeGurmukhi,
    UnicodeGujarati,
    UnicodeOriya,
    UnicodeTamil,
    UnicodeTelugu,
    UnicodeKannada,
    UnicodeMalayalam,
    UnicodeSinhala,
    UnicodeThai,
    UnicodeLao,
    UnicodeTibetan,
    UnicodeMyanmar,
    UnicodeGeorgian,
    UnicodeHangulJamo,
    UnicodeEthiopic,
    UnicodeCherokee,
    UnicodeUnifiedCanadianAboriginalSyllabics,
    UnicodeOgham,
    UnicodeRunic,
    UnicodeKhmer,
    UnicodeMongolian,
    UnicodeLatinExtendedAdditional,
    UnicodeGreekExtended,
    UnicodeGeneralPunctuation,
    UnicodeSuperscriptsandSubscripts,
    UnicodeCurrencySymbols,
    UnicodeCombiningMarksforSymbols,
    UnicodeLetterlikeSymbols,
    UnicodeNumberForms,
    UnicodeArrows,
    UnicodeMathematicalOperators,
    UnicodeMiscellaneousTechnical,
    UnicodeControlPictures,
    UnicodeOpticalCharacterRecognition,
    UnicodeEnclosedAlphanumerics,
    UnicodeBoxDrawing,
    UnicodeBlockElements,
    UnicodeGeometricShapes,
    UnicodeMiscellaneousSymbols,
    UnicodeDingbats,
    UnicodeBraillePatterns,
    UnicodeCJKRadicalsSupplement,
    UnicodeKangxiRadicals,
    UnicodeIdeographicDescriptionCharacters,
    UnicodeCJKSymbolsandPunctuation,
    UnicodeHiragana,
    UnicodeKatakana,
    UnicodeBopomofo,
    UnicodeHangulCompatibilityJamo,
    UnicodeKanbun,
    UnicodeBopomofoExtended,
    UnicodeEnclosedCJKLettersandMonths,
    UnicodeCJKCompatibility,
    UnicodeCJKUnifiedIdeographsExtensionA,
    UnicodeCJKUnifiedIdeographs,
    UnicodeYiSyllables,
    UnicodeYiRadicals,
    UnicodeHangulSyllables,
    UnicodeHighSurrogates,
    UnicodeHighPrivateUseSurrogates,
    UnicodeLowSurrogates,
    UnicodePrivateUse,
    UnicodeCJKCompatibilityIdeographs,
    UnicodeAlphabeticPresentationForms,
    UnicodeArabicPresentationFormsA,        // (*)
    UnicodeCombiningHalfMarks,
    UnicodeCJKCompatibilityForms,
    UnicodeSmallFormVariants,
    UnicodeArabicPresentationFormsB,        // (*)
    UnicodeSpecials,
    UnicodeHalfwidthandFullwidthForms,

    // ---- Unicode block ranges that begin above 0x10000 ----

    UnicodeOldItalic,
    UnicodeGothic,
    UnicodeDeseret,
    UnicodeByzantineMusicalSymbols,
    UnicodeMusicalSymbols,
    UnicodeMathematicalAlphanumericSymbols,
    UnicodeCJKUnifiedIdeographsExtensionB,
    UnicodeCJKCompatibilityIdeographsSupplement,
    UnicodeTags
}
// Helpers for the Category enum: resolving a category or block name to a
// Category value, and testing whether a char belongs to a Category.
class CategoryUtils {
    // Resolves a Unicode category or block-range name to its Category member.
    //
    // The name is matched against the "Unicode"-prefixed enum members, so
    // "Lu" yields Category.UnicodeLu and "Greek" yields Category.UnicodeGreek.
    // A leading "Is" (as in "IsGreek") is stripped before the lookup.
    //
    // Returns Category.None when name is null or does not name a member.
    public static Category CategoryFromName (string name) {
        // Enum.Parse treats ',' as a separator and ORs the listed members
        // together, which would yield an unrelated category. A category name
        // is a single identifier, so such input is simply unknown.
        if (name == null || name.IndexOf (',') >= 0)
            return Category.None;

        // Remove the "Is" prefix of block-range names. The length is checked
        // first: calling Substring (0, 2) on a one-letter name such as "L"
        // throws ArgumentOutOfRangeException, which is an ArgumentException
        // and was previously swallowed below, making every major category
        // ("L", "M", "N", ...) resolve to Category.None.
        if (name.Length >= 2 && name [0] == 'I' && name [1] == 's')
            name = name.Substring (2);

        try {
            return (Category) Enum.Parse (typeof (Category), "Unicode" + name);
        }
        catch (ArgumentException) {
            // no enum member carries that name
            return Category.None;
        }
    }

    // Tests whether the character c belongs to the category cat.
    //
    // Category.None, the block ranges that begin above 0x10000 and any
    // value not handled here never match.
    public static bool IsCategory (Category cat, char c) {
        switch (cat) {
        case Category.None:
            return false;

        // canonical classes

        case Category.Any:
            return c != '\n';

        case Category.AnySingleline:
            return true;

        case Category.Word:
            return
                Char.IsLetterOrDigit (c) ||
                IsCategory (UnicodeCategory.ConnectorPunctuation, c);

        case Category.Digit:
            return Char.IsDigit (c);

        case Category.WhiteSpace:
            return Char.IsWhiteSpace (c);

        // ECMA categories

        case Category.EcmaAny:
            return c != '\n';

        case Category.EcmaAnySingleline:
            return true;

        case Category.EcmaWord:
            return
                'a' <= c && c <= 'z' ||
                'A' <= c && c <= 'Z' ||
                '0' <= c && c <= '9' ||
                '_' == c;

        case Category.EcmaDigit:
            // The upper bound must be the character '9'. Comparing against
            // the integer 9 made this range empty, so no digit ever matched.
            return '0' <= c && c <= '9';

        case Category.EcmaWhiteSpace:
            return
                c == ' ' ||
                c == '\f' ||
                c == '\n' ||
                c == '\r' ||
                c == '\t' ||
                c == '\v';

        // Unicode categories...

        // letter
        case Category.UnicodeLu: return IsCategory (UnicodeCategory.UppercaseLetter, c);
        case Category.UnicodeLl: return IsCategory (UnicodeCategory.LowercaseLetter, c);
        case Category.UnicodeLt: return IsCategory (UnicodeCategory.TitlecaseLetter, c);
        case Category.UnicodeLm: return IsCategory (UnicodeCategory.ModifierLetter, c);
        case Category.UnicodeLo: return IsCategory (UnicodeCategory.OtherLetter, c);

        // mark
        case Category.UnicodeMn: return IsCategory (UnicodeCategory.NonSpacingMark, c);
        case Category.UnicodeMe: return IsCategory (UnicodeCategory.EnclosingMark, c);
        case Category.UnicodeMc: return IsCategory (UnicodeCategory.SpacingCombiningMark, c);

        // number
        case Category.UnicodeNd: return IsCategory (UnicodeCategory.DecimalDigitNumber, c);
        case Category.UnicodeNl: return IsCategory (UnicodeCategory.LetterNumber, c);
        case Category.UnicodeNo: return IsCategory (UnicodeCategory.OtherNumber, c);

        // separator
        case Category.UnicodeZs: return IsCategory (UnicodeCategory.SpaceSeparator, c);
        case Category.UnicodeZl: return IsCategory (UnicodeCategory.LineSeparator, c);
        case Category.UnicodeZp: return IsCategory (UnicodeCategory.ParagraphSeparator, c);

        // punctuation
        case Category.UnicodePd: return IsCategory (UnicodeCategory.DashPunctuation, c);
        case Category.UnicodePs: return IsCategory (UnicodeCategory.OpenPunctuation, c);
        case Category.UnicodePi: return IsCategory (UnicodeCategory.InitialQuotePunctuation, c);
        case Category.UnicodePe: return IsCategory (UnicodeCategory.ClosePunctuation, c);
        case Category.UnicodePf: return IsCategory (UnicodeCategory.FinalQuotePunctuation, c);
        case Category.UnicodePc: return IsCategory (UnicodeCategory.ConnectorPunctuation, c);
        case Category.UnicodePo: return IsCategory (UnicodeCategory.OtherPunctuation, c);

        // symbol
        case Category.UnicodeSm: return IsCategory (UnicodeCategory.MathSymbol, c);
        case Category.UnicodeSc: return IsCategory (UnicodeCategory.CurrencySymbol, c);
        case Category.UnicodeSk: return IsCategory (UnicodeCategory.ModifierSymbol, c);
        case Category.UnicodeSo: return IsCategory (UnicodeCategory.OtherSymbol, c);

        // other
        case Category.UnicodeCc: return IsCategory (UnicodeCategory.Control, c);
        case Category.UnicodeCf: return IsCategory (UnicodeCategory.Format, c);
        case Category.UnicodeCo: return IsCategory (UnicodeCategory.PrivateUse, c);
        case Category.UnicodeCs: return IsCategory (UnicodeCategory.Surrogate, c);
        case Category.UnicodeCn: return IsCategory (UnicodeCategory.OtherNotAssigned, c);

        // major classes: the union of their subcategories

        case Category.UnicodeL: // letter
            return
                IsCategory (UnicodeCategory.UppercaseLetter, c) ||
                IsCategory (UnicodeCategory.LowercaseLetter, c) ||
                IsCategory (UnicodeCategory.TitlecaseLetter, c) ||
                IsCategory (UnicodeCategory.ModifierLetter, c) ||
                IsCategory (UnicodeCategory.OtherLetter, c);

        case Category.UnicodeM: // mark
            return
                IsCategory (UnicodeCategory.NonSpacingMark, c) ||
                IsCategory (UnicodeCategory.EnclosingMark, c) ||
                IsCategory (UnicodeCategory.SpacingCombiningMark, c);

        case Category.UnicodeN: // number
            return
                IsCategory (UnicodeCategory.DecimalDigitNumber, c) ||
                IsCategory (UnicodeCategory.LetterNumber, c) ||
                IsCategory (UnicodeCategory.OtherNumber, c);

        case Category.UnicodeZ: // separator
            return
                IsCategory (UnicodeCategory.SpaceSeparator, c) ||
                IsCategory (UnicodeCategory.LineSeparator, c) ||
                IsCategory (UnicodeCategory.ParagraphSeparator, c);

        case Category.UnicodeP: // punctuation
            return
                IsCategory (UnicodeCategory.DashPunctuation, c) ||
                IsCategory (UnicodeCategory.OpenPunctuation, c) ||
                IsCategory (UnicodeCategory.InitialQuotePunctuation, c) ||
                IsCategory (UnicodeCategory.ClosePunctuation, c) ||
                IsCategory (UnicodeCategory.FinalQuotePunctuation, c) ||
                IsCategory (UnicodeCategory.ConnectorPunctuation, c) ||
                IsCategory (UnicodeCategory.OtherPunctuation, c);

        case Category.UnicodeS: // symbol
            return
                IsCategory (UnicodeCategory.MathSymbol, c) ||
                IsCategory (UnicodeCategory.CurrencySymbol, c) ||
                IsCategory (UnicodeCategory.ModifierSymbol, c) ||
                IsCategory (UnicodeCategory.OtherSymbol, c);

        case Category.UnicodeC: // other
            return
                IsCategory (UnicodeCategory.Control, c) ||
                IsCategory (UnicodeCategory.Format, c) ||
                IsCategory (UnicodeCategory.PrivateUse, c) ||
                IsCategory (UnicodeCategory.Surrogate, c) ||
                IsCategory (UnicodeCategory.OtherNotAssigned, c);

        // Unicode block ranges...

        case Category.UnicodeBasicLatin:
            return '\u0000' <= c && c <= '\u007F';
        case Category.UnicodeLatin1Supplement:
            return '\u0080' <= c && c <= '\u00FF';
        case Category.UnicodeLatinExtendedA:
            return '\u0100' <= c && c <= '\u017F';
        case Category.UnicodeLatinExtendedB:
            return '\u0180' <= c && c <= '\u024F';
        case Category.UnicodeIPAExtensions:
            return '\u0250' <= c && c <= '\u02AF';
        case Category.UnicodeSpacingModifierLetters:
            return '\u02B0' <= c && c <= '\u02FF';
        case Category.UnicodeCombiningDiacriticalMarks:
            return '\u0300' <= c && c <= '\u036F';
        case Category.UnicodeGreek:
            return '\u0370' <= c && c <= '\u03FF';
        case Category.UnicodeCyrillic:
            return '\u0400' <= c && c <= '\u04FF';
        case Category.UnicodeArmenian:
            return '\u0530' <= c && c <= '\u058F';
        case Category.UnicodeHebrew:
            return '\u0590' <= c && c <= '\u05FF';
        case Category.UnicodeArabic:
            return '\u0600' <= c && c <= '\u06FF';
        case Category.UnicodeSyriac:
            return '\u0700' <= c && c <= '\u074F';
        case Category.UnicodeThaana:
            return '\u0780' <= c && c <= '\u07BF';
        case Category.UnicodeDevanagari:
            return '\u0900' <= c && c <= '\u097F';
        case Category.UnicodeBengali:
            return '\u0980' <= c && c <= '\u09FF';
        case Category.UnicodeGurmukhi:
            return '\u0A00' <= c && c <= '\u0A7F';
        case Category.UnicodeGujarati:
            return '\u0A80' <= c && c <= '\u0AFF';
        case Category.UnicodeOriya:
            return '\u0B00' <= c && c <= '\u0B7F';
        case Category.UnicodeTamil:
            return '\u0B80' <= c && c <= '\u0BFF';
        case Category.UnicodeTelugu:
            return '\u0C00' <= c && c <= '\u0C7F';
        case Category.UnicodeKannada:
            return '\u0C80' <= c && c <= '\u0CFF';
        case Category.UnicodeMalayalam:
            return '\u0D00' <= c && c <= '\u0D7F';
        case Category.UnicodeSinhala:
            return '\u0D80' <= c && c <= '\u0DFF';
        case Category.UnicodeThai:
            return '\u0E00' <= c && c <= '\u0E7F';
        case Category.UnicodeLao:
            return '\u0E80' <= c && c <= '\u0EFF';
        case Category.UnicodeTibetan:
            return '\u0F00' <= c && c <= '\u0FFF';
        case Category.UnicodeMyanmar:
            return '\u1000' <= c && c <= '\u109F';
        case Category.UnicodeGeorgian:
            return '\u10A0' <= c && c <= '\u10FF';
        case Category.UnicodeHangulJamo:
            return '\u1100' <= c && c <= '\u11FF';
        case Category.UnicodeEthiopic:
            return '\u1200' <= c && c <= '\u137F';
        case Category.UnicodeCherokee:
            return '\u13A0' <= c && c <= '\u13FF';
        case Category.UnicodeUnifiedCanadianAboriginalSyllabics:
            return '\u1400' <= c && c <= '\u167F';
        case Category.UnicodeOgham:
            return '\u1680' <= c && c <= '\u169F';
        case Category.UnicodeRunic:
            return '\u16A0' <= c && c <= '\u16FF';
        case Category.UnicodeKhmer:
            return '\u1780' <= c && c <= '\u17FF';
        case Category.UnicodeMongolian:
            return '\u1800' <= c && c <= '\u18AF';
        case Category.UnicodeLatinExtendedAdditional:
            return '\u1E00' <= c && c <= '\u1EFF';
        case Category.UnicodeGreekExtended:
            return '\u1F00' <= c && c <= '\u1FFF';
        case Category.UnicodeGeneralPunctuation:
            return '\u2000' <= c && c <= '\u206F';
        case Category.UnicodeSuperscriptsandSubscripts:
            return '\u2070' <= c && c <= '\u209F';
        case Category.UnicodeCurrencySymbols:
            return '\u20A0' <= c && c <= '\u20CF';
        case Category.UnicodeCombiningMarksforSymbols:
            return '\u20D0' <= c && c <= '\u20FF';
        case Category.UnicodeLetterlikeSymbols:
            return '\u2100' <= c && c <= '\u214F';
        case Category.UnicodeNumberForms:
            return '\u2150' <= c && c <= '\u218F';
        case Category.UnicodeArrows:
            return '\u2190' <= c && c <= '\u21FF';
        case Category.UnicodeMathematicalOperators:
            return '\u2200' <= c && c <= '\u22FF';
        case Category.UnicodeMiscellaneousTechnical:
            return '\u2300' <= c && c <= '\u23FF';
        case Category.UnicodeControlPictures:
            return '\u2400' <= c && c <= '\u243F';
        case Category.UnicodeOpticalCharacterRecognition:
            return '\u2440' <= c && c <= '\u245F';
        case Category.UnicodeEnclosedAlphanumerics:
            return '\u2460' <= c && c <= '\u24FF';
        case Category.UnicodeBoxDrawing:
            return '\u2500' <= c && c <= '\u257F';
        case Category.UnicodeBlockElements:
            return '\u2580' <= c && c <= '\u259F';
        case Category.UnicodeGeometricShapes:
            return '\u25A0' <= c && c <= '\u25FF';
        case Category.UnicodeMiscellaneousSymbols:
            return '\u2600' <= c && c <= '\u26FF';
        case Category.UnicodeDingbats:
            return '\u2700' <= c && c <= '\u27BF';
        case Category.UnicodeBraillePatterns:
            return '\u2800' <= c && c <= '\u28FF';
        case Category.UnicodeCJKRadicalsSupplement:
            return '\u2E80' <= c && c <= '\u2EFF';
        case Category.UnicodeKangxiRadicals:
            return '\u2F00' <= c && c <= '\u2FDF';
        case Category.UnicodeIdeographicDescriptionCharacters:
            return '\u2FF0' <= c && c <= '\u2FFF';
        case Category.UnicodeCJKSymbolsandPunctuation:
            return '\u3000' <= c && c <= '\u303F';
        case Category.UnicodeHiragana:
            return '\u3040' <= c && c <= '\u309F';
        case Category.UnicodeKatakana:
            return '\u30A0' <= c && c <= '\u30FF';
        case Category.UnicodeBopomofo:
            return '\u3100' <= c && c <= '\u312F';
        case Category.UnicodeHangulCompatibilityJamo:
            return '\u3130' <= c && c <= '\u318F';
        case Category.UnicodeKanbun:
            return '\u3190' <= c && c <= '\u319F';
        case Category.UnicodeBopomofoExtended:
            return '\u31A0' <= c && c <= '\u31BF';
        case Category.UnicodeEnclosedCJKLettersandMonths:
            return '\u3200' <= c && c <= '\u32FF';
        case Category.UnicodeCJKCompatibility:
            return '\u3300' <= c && c <= '\u33FF';
        case Category.UnicodeCJKUnifiedIdeographsExtensionA:
            return '\u3400' <= c && c <= '\u4DB5';
        case Category.UnicodeCJKUnifiedIdeographs:
            return '\u4E00' <= c && c <= '\u9FFF';
        case Category.UnicodeYiSyllables:
            return '\uA000' <= c && c <= '\uA48F';
        case Category.UnicodeYiRadicals:
            return '\uA490' <= c && c <= '\uA4CF';
        case Category.UnicodeHangulSyllables:
            return '\uAC00' <= c && c <= '\uD7A3';
        case Category.UnicodeHighSurrogates:
            return '\uD800' <= c && c <= '\uDB7F';
        case Category.UnicodeHighPrivateUseSurrogates:
            return '\uDB80' <= c && c <= '\uDBFF';
        case Category.UnicodeLowSurrogates:
            return '\uDC00' <= c && c <= '\uDFFF';
        case Category.UnicodePrivateUse:
            return '\uE000' <= c && c <= '\uF8FF';
        case Category.UnicodeCJKCompatibilityIdeographs:
            return '\uF900' <= c && c <= '\uFAFF';
        case Category.UnicodeAlphabeticPresentationForms:
            return '\uFB00' <= c && c <= '\uFB4F';
        case Category.UnicodeArabicPresentationFormsA:
            return '\uFB50' <= c && c <= '\uFDFF';
        case Category.UnicodeCombiningHalfMarks:
            return '\uFE20' <= c && c <= '\uFE2F';
        case Category.UnicodeCJKCompatibilityForms:
            return '\uFE30' <= c && c <= '\uFE4F';
        case Category.UnicodeSmallFormVariants:
            return '\uFE50' <= c && c <= '\uFE6F';
        case Category.UnicodeArabicPresentationFormsB:
            return '\uFE70' <= c && c <= '\uFEFE';
        case Category.UnicodeHalfwidthandFullwidthForms:
            return '\uFF00' <= c && c <= '\uFFEF';
        case Category.UnicodeSpecials:
            // two disjoint pieces: the single code point U+FEFF and U+FFF0..U+FFFD
            return
                c == '\uFEFF' ||
                '\uFFF0' <= c && c <= '\uFFFD';

        // These block ranges begin above 0x10000, beyond what a single
        // 16-bit char can hold, so they can never match here.
        case Category.UnicodeOldItalic:
        case Category.UnicodeGothic:
        case Category.UnicodeDeseret:
        case Category.UnicodeByzantineMusicalSymbols:
        case Category.UnicodeMusicalSymbols:
        case Category.UnicodeMathematicalAlphanumericSymbols:
        case Category.UnicodeCJKUnifiedIdeographsExtensionB:
        case Category.UnicodeCJKCompatibilityIdeographsSupplement:
        case Category.UnicodeTags:
            return false;

        default:
            return false;
        }
    }

    // Returns true when the Unicode general category reported by
    // Char.GetUnicodeCategory for c is exactly uc.
    private static bool IsCategory (UnicodeCategory uc, char c) {
        return Char.GetUnicodeCategory (c) == uc;
    }
}
- }
|