UnicodeCharRanges.h 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. //===--- UnicodeCharRanges.h - Types and functions for character ranges ---===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. #ifndef LLVM_SUPPORT_UNICODECHARRANGES_H
  10. #define LLVM_SUPPORT_UNICODECHARRANGES_H
  11. #include "llvm/ADT/ArrayRef.h"
  12. #include "llvm/ADT/SmallPtrSet.h"
  13. #include "llvm/Support/Compiler.h"
  14. #include "llvm/Support/Debug.h"
  15. #include "llvm/Support/Mutex.h"
  16. #include "llvm/Support/MutexGuard.h"
  17. #include "llvm/Support/raw_ostream.h"
  18. #include <algorithm>
  19. namespace llvm {
  20. namespace sys {
  21. #define DEBUG_TYPE "unicode"
  /// \brief An inclusive interval of Unicode code points, [Lower, Upper].
  ///
  /// Both end points belong to the range. The struct is a plain aggregate so
  /// that tables of ranges can be written as brace-initialized arrays.
  struct UnicodeCharRange {
    /// First code point that belongs to the range.
    uint32_t Lower;
    /// Last code point that belongs to the range (inclusive).
    uint32_t Upper;
  };
  27. inline bool operator<(uint32_t Value, UnicodeCharRange Range) {
  28. return Value < Range.Lower;
  29. }
  30. inline bool operator<(UnicodeCharRange Range, uint32_t Value) {
  31. return Range.Upper < Value;
  32. }
  33. /// \brief Holds a reference to an ordered array of UnicodeCharRange and allows
  34. /// to quickly check if a code point is contained in the set represented by this
  35. /// array.
  36. class UnicodeCharSet {
  37. public:
  38. typedef ArrayRef<UnicodeCharRange> CharRanges;
  39. /// \brief Constructs a UnicodeCharSet instance from an array of
  40. /// UnicodeCharRanges.
  41. ///
  42. /// Array pointed by \p Ranges should have the lifetime at least as long as
  43. /// the UnicodeCharSet instance, and should not change. Array is validated by
  44. /// the constructor, so it makes sense to create as few UnicodeCharSet
  45. /// instances per each array of ranges, as possible.
  46. #ifdef NDEBUG
  47. LLVM_CONSTEXPR UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {}
  48. #else
  49. UnicodeCharSet(CharRanges Ranges) : Ranges(Ranges) {
  50. assert(rangesAreValid());
  51. }
  52. #endif
  53. /// \brief Returns true if the character set contains the Unicode code point
  54. /// \p C.
  55. bool contains(uint32_t C) const {
  56. return std::binary_search(Ranges.begin(), Ranges.end(), C);
  57. }
  58. private:
  59. /// \brief Returns true if each of the ranges is a proper closed range
  60. /// [min, max], and if the ranges themselves are ordered and non-overlapping.
  61. bool rangesAreValid() const {
  62. uint32_t Prev = 0;
  63. for (CharRanges::const_iterator I = Ranges.begin(), E = Ranges.end();
  64. I != E; ++I) {
  65. if (I != Ranges.begin() && Prev >= I->Lower) {
  66. DEBUG(dbgs() << "Upper bound 0x");
  67. DEBUG(dbgs().write_hex(Prev));
  68. DEBUG(dbgs() << " should be less than succeeding lower bound 0x");
  69. DEBUG(dbgs().write_hex(I->Lower) << "\n");
  70. return false;
  71. }
  72. if (I->Upper < I->Lower) {
  73. DEBUG(dbgs() << "Upper bound 0x");
  74. DEBUG(dbgs().write_hex(I->Lower));
  75. DEBUG(dbgs() << " should not be less than lower bound 0x");
  76. DEBUG(dbgs().write_hex(I->Upper) << "\n");
  77. return false;
  78. }
  79. Prev = I->Upper;
  80. }
  81. return true;
  82. }
  83. const CharRanges Ranges;
  84. };
  85. #undef DEBUG_TYPE // "unicode"
  86. } // namespace sys
  87. } // namespace llvm
  88. #endif // LLVM_SUPPORT_UNICODECHARRANGES_H