UnicodeString.hx 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. /*
  2. * Copyright (C)2005-2019 Haxe Foundation
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20. * DEALINGS IN THE SOFTWARE.
  21. */
  22. import haxe.io.Bytes;
  23. import haxe.io.Encoding;
  24. import haxe.iterators.StringIteratorUnicode;
  25. import haxe.iterators.StringKeyValueIteratorUnicode;
  26. /**
  27. This abstract provides consistent cross-target unicode support.
  28. @see https://haxe.org/manual/std-UnicodeString.html
  29. **/
  30. @:forward
  31. @:access(StringTools)
  32. abstract UnicodeString(String) from String to String {
  33. /**
  34. Tells if `b` is a correctly encoded UTF8 byte sequence.
  35. **/
  36. static public function validate(b:Bytes, encoding:Encoding) : Bool {
  37. switch(encoding) {
  38. case RawNative: throw "UnicodeString.validate: RawNative encoding is not supported";
  39. case UTF8:
  40. var data = b.getData();
  41. var pos = 0;
  42. var max = b.length;
  43. while( pos < max) {
  44. var c:Int = Bytes.fastGet(data, pos++);
  45. if(c < 0x80) {
  46. } else if(c < 0xC2) {
  47. return false;
  48. } else if(c < 0xE0) {
  49. if(pos + 1 > max) {
  50. return false;
  51. }
  52. var c2:Int = Bytes.fastGet(data, pos++);
  53. if(c2 < 0x80 || c2 > 0xBF) {
  54. return false;
  55. }
  56. } else if(c < 0xF0) {
  57. if(pos + 2 > max) {
  58. return false;
  59. }
  60. var c2:Int = Bytes.fastGet(data, pos++);
  61. if(c == 0xE0) {
  62. if(c2 < 0xA0 || c2 > 0xBF) return false;
  63. } else {
  64. if(c2 < 0x80 || c2 > 0xBF) return false;
  65. }
  66. var c3:Int = Bytes.fastGet(data, pos++);
  67. if(c3 < 0x80 || c3 > 0xBF) {
  68. return false;
  69. }
  70. c = (c << 16) | (c2 << 8) | c3;
  71. if(0xEDA080 <= c && c <= 0xEDBFBF) { //surrogate pairs
  72. return false;
  73. }
  74. } else if(c > 0xF4) {
  75. return false;
  76. } else {
  77. if(pos + 3 > max) {
  78. return false;
  79. }
  80. var c2:Int = Bytes.fastGet(data, pos++);
  81. if(c == 0xF0) {
  82. if(c2 < 0x90 || c2 > 0xBF) return false;
  83. } else if(c == 0xF4) {
  84. if(c2 < 0x80 || c2 > 0x8F) return false;
  85. } else {
  86. if(c2 < 0x80 || c2 > 0xBF) return false;
  87. }
  88. var c3:Int = Bytes.fastGet(data, pos++);
  89. if(c3 < 0x80 || c3 > 0xBF) {
  90. return false;
  91. }
  92. var c4:Int = Bytes.fastGet(data, pos++);
  93. if(c4 < 0x80 || c4 > 0xBF) {
  94. return false;
  95. }
  96. }
  97. }
  98. return true;
  99. }
  100. }
  101. #if target.unicode
  102. /**
  103. The number of characters in `this` String.
  104. **/
  105. public var length(get,never):Int;
  106. /**
  107. Creates an instance of UnicodeString.
  108. **/
  109. public inline function new(string:String):Void {
  110. this = string;
  111. }
  112. /**
  113. Returns the character at position `index` of `this` String.
  114. If `index` is negative or exceeds `this.length`, the empty String `""`
  115. is returned.
  116. **/
  117. #if !utf16 inline #end
  118. public function charAt(index:Int):String {
  119. #if utf16
  120. if(index < 0) return '';
  121. var unicodeOffset = 0;
  122. var nativeOffset = 0;
  123. while(nativeOffset < this.length) {
  124. var c = StringTools.utf16CodePointAt(this, nativeOffset++);
  125. if(unicodeOffset == index) {
  126. return String.fromCharCode(c);
  127. }
  128. if(c >= StringTools.MIN_SURROGATE_CODE_POINT) {
  129. nativeOffset++;
  130. }
  131. unicodeOffset++;
  132. }
  133. return '';
  134. #else
  135. return this.charAt(index);
  136. #end
  137. }
  138. /**
  139. Returns the character code at position `index` of `this` String.
  140. If `index` is negative or exceeds `this.length`, `null` is returned.
  141. **/
  142. #if !utf16 inline #end
  143. public function charCodeAt(index:Int):Null<Int> {
  144. #if utf16
  145. if(index < 0) return null;
  146. var unicodeOffset = 0;
  147. var nativeOffset = 0;
  148. while(nativeOffset < this.length) {
  149. var c = StringTools.utf16CodePointAt(this, nativeOffset++);
  150. if(unicodeOffset == index) {
  151. return c;
  152. }
  153. if(c >= StringTools.MIN_SURROGATE_CODE_POINT) {
  154. nativeOffset++;
  155. }
  156. unicodeOffset++;
  157. }
  158. return null;
  159. #else
  160. return this.charCodeAt(index);
  161. #end
  162. }
  163. /**
  164. Returns an iterator of the unicode code points.
  165. **/
  166. public inline function iterator():StringIteratorUnicode {
  167. return new StringIteratorUnicode(this);
  168. }
  169. /**
  170. Returns an iterator of the code point indices and unicode code points.
  171. **/
  172. public inline function keyValueIterator():StringKeyValueIteratorUnicode {
  173. return new StringKeyValueIteratorUnicode(this);
  174. }
  175. #if !utf16 inline #end
  176. function get_length():Int {
  177. #if utf16
  178. var l = 0;
  179. for(c in new StringIteratorUnicode(this)) {
  180. l++;
  181. }
  182. return l;
  183. #else
  184. return this.length;
  185. #end
  186. }
  187. #end
  188. @:op(A < B) static function lt(a:UnicodeString, b:UnicodeString):Bool;
  189. @:op(A <= B) static function lte(a:UnicodeString, b:UnicodeString):Bool;
  190. @:op(A > B) static function gt(a:UnicodeString, b:UnicodeString):Bool;
  191. @:op(A >= B) static function gte(a:UnicodeString, b:UnicodeString):Bool;
  192. @:op(A == B) static function eq(a:UnicodeString, b:UnicodeString):Bool;
  193. @:op(A != B) static function neq(a:UnicodeString, b:UnicodeString):Bool;
  194. @:op(A + B) static function add(a:UnicodeString, b:UnicodeString):UnicodeString;
  195. @:op(A += B) static function assignAdd(a:UnicodeString, b:UnicodeString):UnicodeString;
  196. @:op(A + B) @:commutative static function add(a:UnicodeString, b:String):UnicodeString;
  197. @:op(A += B) @:commutative static function assignAdd(a:UnicodeString, b:String):UnicodeString;
  198. }