Unicode.h 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. //===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file defines functions that allow querying certain properties of Unicode
  11. // characters.
  12. //
  13. //===----------------------------------------------------------------------===//
  14. #ifndef LLVM_SUPPORT_UNICODE_H
  15. #define LLVM_SUPPORT_UNICODE_H
  16. #include "llvm/ADT/StringRef.h"
  17. namespace llvm {
  18. namespace sys {
  19. namespace unicode {
  /// Negative sentinel values used to report column-width errors.
  /// NOTE(review): presumably these are returned by columnWidthUTF8 in place of
  /// a width; only ErrorNonPrintableCharacter is mentioned in that function's
  /// documentation -- confirm against the implementation.
  20. enum ColumnWidthErrors {
  21. ErrorInvalidUTF8 = -2, ///< NOTE(review): name suggests malformed UTF-8 input -- confirm.
  22. ErrorNonPrintableCharacter = -1 ///< Text contains a character that is not printable (see isPrintable).
  23. };
  24. /// Determines if a character is likely to be displayed correctly on the
  25. /// terminal. An exact implementation would have to depend on the specific
  26. /// terminal, so we define semantics that should suit the generic case of a
  27. /// terminal capable of displaying Unicode characters.
  28. ///
  29. /// All characters from the Unicode code point range are considered printable
  30. /// except for:
  31. /// * C0 and C1 control character ranges;
  32. /// * default ignorable code points as per 5.21 of
  33. /// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
  34. /// except for U+00AD SOFT HYPHEN, as it's actually displayed on most
  35. /// terminals;
  36. /// * format characters (category = Cf);
  37. /// * surrogates (category = Cs);
  38. /// * unassigned characters (category = Cn).
  ///
  /// \param UCS the character to test. NOTE(review): presumably a Unicode code
  /// point value; the declaration alone does not say -- confirm.
  39. /// \return true if the character is considered printable.
  40. bool isPrintable(int UCS);
  41. /// Gets the number of positions the UTF-8-encoded \p Text is likely to occupy
  42. /// when output on a terminal ("character width"). This depends on the
  43. /// implementation of the terminal, and there's no standard definition of
  44. /// character width.
  45. ///
  46. /// The implementation defines it in a way that is expected to be compatible
  47. /// with a generic Unicode-capable terminal.
  48. ///
  /// \param Text the UTF-8-encoded text to measure.
  49. /// \return Character width:
  50. /// * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
  51. /// characters (as identified by isPrintable);
  52. /// * 0 for each non-spacing and enclosing combining mark;
  53. /// * 2 for each CJK character excluding halfwidth forms;
  54. /// * 1 for each of the remaining characters.
  ///
  /// NOTE(review): ColumnWidthErrors also declares ErrorInvalidUTF8 (-2), which
  /// this list does not mention; presumably it is returned when \p Text is not
  /// valid UTF-8 -- confirm against the implementation and document it here.
  55. int columnWidthUTF8(StringRef Text);
  56. } // namespace unicode
  57. } // namespace sys
  58. } // namespace llvm
  59. #endif