2
0

UnicodeString.hx 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443
  1. /*
  2. * Copyright (C)2005-2019 Haxe Foundation
  3. *
  4. * Permission is hereby granted, free of charge, to any person obtaining a
  5. * copy of this software and associated documentation files (the "Software"),
  6. * to deal in the Software without restriction, including without limitation
  7. * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8. * and/or sell copies of the Software, and to permit persons to whom the
  9. * Software is furnished to do so, subject to the following conditions:
  10. *
  11. * The above copyright notice and this permission notice shall be included in
  12. * all copies or substantial portions of the Software.
  13. *
  14. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20. * DEALINGS IN THE SOFTWARE.
  21. */
  22. import haxe.io.Bytes;
  23. import haxe.io.Encoding;
  24. import haxe.iterators.StringIteratorUnicode;
  25. import haxe.iterators.StringKeyValueIteratorUnicode;
  /**
    This abstract provides consistent cross-target unicode support.

    On targets whose native strings are UTF-16 encoded, the positional methods
    below are re-implemented so that every index and length counts Unicode
    code points rather than native UTF-16 code units.

    @see https://haxe.org/manual/std-UnicodeString.html
  **/
  @:forward
  @:access(StringTools)
  abstract UnicodeString(String) from String to String {
    /**
      Tells if `b` is a correctly encoded UTF8 byte sequence.

      Overlong encodings, encoded surrogates (U+D800..U+DFFF), code points
      above U+10FFFF and truncated sequences are all rejected.

      Only `UTF8` is supported for `encoding`; passing `RawNative` throws a
      String error.
    **/
    static public function validate(b:Bytes, encoding:Encoding):Bool {
      switch (encoding) {
        case RawNative:
          throw "UnicodeString.validate: RawNative encoding is not supported";
        case UTF8:
          var data = b.getData();
          var pos = 0;
          var max = b.length;
          while (pos < max) {
            var c:Int = Bytes.fastGet(data, pos++);
            if (c < 0x80) {
              // single-byte (ASCII) sequence
            } else if (c < 0xC2) {
              // stray continuation byte (0x80..0xBF) or overlong lead (0xC0, 0xC1)
              return false;
            } else if (c < 0xE0) {
              // two-byte sequence: one continuation byte must follow
              if (pos + 1 > max) {
                return false;
              }
              var c2:Int = Bytes.fastGet(data, pos++);
              if (c2 < 0x80 || c2 > 0xBF) {
                return false;
              }
            } else if (c < 0xF0) {
              // three-byte sequence: two continuation bytes must follow
              if (pos + 2 > max) {
                return false;
              }
              var c2:Int = Bytes.fastGet(data, pos++);
              if (c == 0xE0) {
                // 0xE0 followed by less than 0xA0 would be an overlong encoding
                if (c2 < 0xA0 || c2 > 0xBF)
                  return false;
              } else {
                if (c2 < 0x80 || c2 > 0xBF)
                  return false;
              }
              var c3:Int = Bytes.fastGet(data, pos++);
              if (c3 < 0x80 || c3 > 0xBF) {
                return false;
              }
              c = (c << 16) | (c2 << 8) | c3;
              if (0xEDA080 <= c && c <= 0xEDBFBF) { // surrogate pairs
                return false;
              }
            } else if (c > 0xF4) {
              // lead bytes above 0xF4 would encode code points beyond U+10FFFF
              return false;
            } else {
              // four-byte sequence: three continuation bytes must follow
              if (pos + 3 > max) {
                return false;
              }
              var c2:Int = Bytes.fastGet(data, pos++);
              if (c == 0xF0) {
                // 0xF0 followed by less than 0x90 would be an overlong encoding
                if (c2 < 0x90 || c2 > 0xBF)
                  return false;
              } else if (c == 0xF4) {
                // 0xF4 followed by more than 0x8F would exceed U+10FFFF
                if (c2 < 0x80 || c2 > 0x8F)
                  return false;
              } else {
                if (c2 < 0x80 || c2 > 0xBF)
                  return false;
              }
              var c3:Int = Bytes.fastGet(data, pos++);
              if (c3 < 0x80 || c3 > 0xBF) {
                return false;
              }
              var c4:Int = Bytes.fastGet(data, pos++);
              if (c4 < 0x80 || c4 > 0xBF) {
                return false;
              }
            }
          }
          return true;
      }
    }

    #if target.unicode
    /**
      Creates an instance of UnicodeString.
    **/
    public inline function new(string:String):Void {
      this = string;
    }

    /**
      Returns an iterator of the unicode code points.
    **/
    public inline function iterator():StringIteratorUnicode {
      return new StringIteratorUnicode(this);
    }

    /**
      Returns an iterator of the code point indices and unicode code points.
    **/
    public inline function keyValueIterator():StringKeyValueIteratorUnicode {
      return new StringKeyValueIteratorUnicode(this);
    }

    #if target.utf16
    /**
      The number of characters (Unicode code points) in `this` String.

      The value is computed by iterating over the whole String on every
      access, so cache it when it is needed repeatedly.
    **/
    public var length(get, never):Int;

    /**
      Returns the character at position `index` of `this` String.

      `index` counts Unicode code points.

      If `index` is negative or exceeds `this.length`, the empty String `""`
      is returned.
    **/
    public function charAt(index:Int):String {
      if (index < 0)
        return '';
      var unicodeOffset = 0;
      var nativeOffset = 0;
      while (nativeOffset < this.length) {
        var c = StringTools.utf16CodePointAt(this, nativeOffset++);
        if (unicodeOffset == index) {
          return String.fromCharCode(c);
        }
        // such a code point occupies two native units: skip the second one
        if (c >= StringTools.MIN_SURROGATE_CODE_POINT) {
          nativeOffset++;
        }
        unicodeOffset++;
      }
      return '';
    }

    /**
      Returns the character code at position `index` of `this` String.

      `index` counts Unicode code points.

      If `index` is negative or exceeds `this.length`, `null` is returned.
    **/
    public function charCodeAt(index:Int):Null<Int> {
      if (index < 0)
        return null;
      var unicodeOffset = 0;
      var nativeOffset = 0;
      while (nativeOffset < this.length) {
        var c = StringTools.utf16CodePointAt(this, nativeOffset++);
        if (unicodeOffset == index) {
          return c;
        }
        // such a code point occupies two native units: skip the second one
        if (c >= StringTools.MIN_SURROGATE_CODE_POINT) {
          nativeOffset++;
        }
        unicodeOffset++;
      }
      return null;
    }

    /**
      Returns the position of the leftmost occurrence of `str` within `this`
      String.

      If `startIndex` is given, the search is performed within the substring
      of `this` String starting from `startIndex` (if `startIndex` is positive
      or 0) or `max(this.length + startIndex, 0)` (if `startIndex` is negative).

      If `startIndex` exceeds `this.length`, -1 is returned.

      Otherwise the search is performed within `this` String. In either case,
      the returned position is relative to the beginning of `this` String.

      If `str` cannot be found, -1 is returned.

      All positions count Unicode code points.
    **/
    public function indexOf(str:String, ?startIndex:Int):Int {
      // NOTE(review): an empty `str` is not special-cased; its result depends on
      // how the target reads past the end of a String - confirm the intended contract.
      if (startIndex == null) {
        startIndex = 0;
      } else if (startIndex < 0) {
        // may stay negative, in which case every position qualifies (same as 0)
        startIndex = (this : UnicodeString).length + startIndex;
      }

      var unicodeOffset = 0;
      var nativeOffset = 0;
      var matchingOffset = 0; // native offset of the next unit of `str` to match
      var result = -1; // code point index where the current candidate started
      var resultNativeOffset = 0; // native offset where the current candidate started
      while (nativeOffset < this.length) {
        var c = StringTools.utf16CodePointAt(this, nativeOffset);

        if (unicodeOffset >= startIndex) {
          var c2 = StringTools.utf16CodePointAt(str, matchingOffset);
          if (c == c2) {
            if (matchingOffset == 0) {
              result = unicodeOffset;
              resultNativeOffset = nativeOffset;
            }
            matchingOffset++;
            if (c2 >= StringTools.MIN_SURROGATE_CODE_POINT) {
              matchingOffset++;
            }
            if (matchingOffset == str.length) {
              return result;
            }
          } else if (matchingOffset != 0) {
            // The candidate failed part-way. Rewind to where it started so the
            // advance below resumes exactly one code point after that start;
            // resuming at the mismatching character instead would skip
            // occurrences that begin inside the failed candidate.
            nativeOffset = resultNativeOffset;
            unicodeOffset = result;
            c = StringTools.utf16CodePointAt(this, nativeOffset);
            result = -1;
            matchingOffset = 0;
          }
        }

        nativeOffset++;
        if (c >= StringTools.MIN_SURROGATE_CODE_POINT) {
          nativeOffset++;
        }
        unicodeOffset++;
      }
      return -1;
    }

    /**
      Returns the position of the rightmost occurrence of `str` within `this`
      String.

      If `startIndex` is given, the search is performed within the substring
      of `this` String from 0 to `startIndex + str.length`. Otherwise the search
      is performed within `this` String. In either case, the returned position
      is relative to the beginning of `this` String.

      If `str` cannot be found, -1 is returned.

      All positions count Unicode code points.
    **/
    public function lastIndexOf(str:String, ?startIndex:Int):Int {
      // NOTE(review): an empty `str` is not special-cased; its result depends on
      // how the target reads past the end of a String - confirm the intended contract.
      if (startIndex == null) {
        // the native length is never smaller than the code point count
        startIndex = this.length;
      } else if (startIndex < 0) {
        startIndex = 0;
      }

      var unicodeOffset = 0;
      var nativeOffset = 0;
      var result = -1; // start of the rightmost complete occurrence found so far
      var lastIndex = -1; // code point index where the current candidate started
      var lastNativeOffset = 0; // native offset where the current candidate started
      var matchingOffset = 0; // native offset of the next unit of `str` to match
      var strUnicodeLength = (str : UnicodeString).length;
      while (nativeOffset < this.length && unicodeOffset < startIndex + strUnicodeLength) {
        var c = StringTools.utf16CodePointAt(this, nativeOffset);
        var c2 = StringTools.utf16CodePointAt(str, matchingOffset);
        var restart = false;
        if (c == c2) {
          if (matchingOffset == 0) {
            lastIndex = unicodeOffset;
            lastNativeOffset = nativeOffset;
          }
          matchingOffset++;
          if (c2 >= StringTools.MIN_SURROGATE_CODE_POINT) {
            matchingOffset++;
          }
          if (matchingOffset == str.length) {
            // complete occurrence: remember it and keep looking further right
            result = lastIndex;
            restart = true;
          }
        } else if (matchingOffset != 0) {
          restart = true;
        }
        if (restart) {
          // Whether the candidate completed or failed, rewind to where it
          // started so the advance below resumes exactly one code point after
          // that start. This finds overlapping occurrences and occurrences
          // that begin inside a failed candidate.
          nativeOffset = lastNativeOffset;
          unicodeOffset = lastIndex;
          c = StringTools.utf16CodePointAt(this, nativeOffset);
          lastIndex = -1;
          matchingOffset = 0;
        }

        nativeOffset++;
        if (c >= StringTools.MIN_SURROGATE_CODE_POINT) {
          nativeOffset++;
        }
        unicodeOffset++;
      }
      return result;
    }

    /**
      Returns `len` characters of `this` String, starting at position `pos`.

      If `len` is omitted, all characters from position `pos` to the end of
      `this` String are included.

      If `pos` is negative, its value is calculated from the end of `this`
      String by `this.length + pos`. If this yields a negative value, 0 is
      used instead.

      If the calculated position + `len` exceeds `this.length`, the characters
      from that position to the end of `this` String are returned.

      If `len` is negative, the result is unspecified.

      `pos` and `len` count Unicode code points.
    **/
    public function substr(pos:Int, ?len:Int):String {
      if (pos < 0) {
        pos = (this : UnicodeString).length + pos;
        if (pos < 0) {
          pos = 0;
        }
      }
      if (len != null) {
        if (len < 0) {
          len = (this : UnicodeString).length + len;
        }
        if (len <= 0) {
          return "";
        }
      }
      var unicodeOffset = 0;
      var nativeOffset = 0;
      var fromOffset = -1; // native offset of the first selected code point
      var subLength = 0; // number of code points selected so far
      while (nativeOffset < this.length) {
        var c = StringTools.utf16CodePointAt(this, nativeOffset);
        if (unicodeOffset >= pos) {
          if (fromOffset < 0) {
            if (len == null) {
              return this.substr(nativeOffset);
            }
            fromOffset = nativeOffset;
          }
          subLength++;
          if (subLength >= len) {
            // include the second native unit when the last code point needs two
            var lastOffset = (c < StringTools.MIN_SURROGATE_CODE_POINT ? nativeOffset : nativeOffset + 1);
            return this.substr(fromOffset, lastOffset - fromOffset + 1);
          }
        }
        nativeOffset += (c >= StringTools.MIN_SURROGATE_CODE_POINT ? 2 : 1);
        unicodeOffset++;
      }
      return (fromOffset < 0 ? "" : this.substr(fromOffset));
    }

    /**
      Returns the part of `this` String from `startIndex` to but not including `endIndex`.

      If `startIndex` or `endIndex` are negative, 0 is used instead.

      If `startIndex` exceeds `endIndex`, they are swapped.

      If the (possibly swapped) `endIndex` is omitted or exceeds
      `this.length`, `this.length` is used instead.

      If the (possibly swapped) `startIndex` exceeds `this.length`, the empty
      String `""` is returned.

      `startIndex` and `endIndex` count Unicode code points.
    **/
    public function substring(startIndex:Int, ?endIndex:Int):String {
      if (startIndex < 0) {
        startIndex = 0;
      }
      if (endIndex != null) {
        if (endIndex < 0) {
          endIndex = 0;
        }
        if (startIndex == endIndex) {
          return "";
        }
        if (startIndex > endIndex) {
          var tmp = startIndex;
          startIndex = endIndex;
          endIndex = tmp;
        }
      }
      var unicodeOffset = 0;
      var nativeOffset = 0;
      var fromOffset = -1; // native offset of the first selected code point
      var subLength = 0; // number of code points selected so far
      while (nativeOffset < this.length) {
        var c = StringTools.utf16CodePointAt(this, nativeOffset);
        if (startIndex <= unicodeOffset) {
          if (fromOffset < 0) {
            if (endIndex == null) {
              return this.substr(nativeOffset);
            }
            fromOffset = nativeOffset;
          }
          subLength++;
          if (subLength >= endIndex - startIndex) {
            // include the second native unit when the last code point needs two
            var lastOffset = (c < StringTools.MIN_SURROGATE_CODE_POINT ? nativeOffset : nativeOffset + 1);
            return this.substr(fromOffset, lastOffset - fromOffset + 1);
          }
        }
        nativeOffset += (c >= StringTools.MIN_SURROGATE_CODE_POINT ? 2 : 1);
        unicodeOffset++;
      }
      return (fromOffset < 0 ? "" : this.substr(fromOffset));
    }

    // Getter of `length`: counts the elements produced by the code point iterator.
    function get_length():Int {
      var l = 0;
      for (c in new StringIteratorUnicode(this)) {
        l++;
      }
      return l;
    }
    #end
    #end

    // Body-less `@:op` declarations: they expose the corresponding operators of
    // the underlying `String` type for `UnicodeString` operands.
    @:op(A < B) static function lt(a:UnicodeString, b:UnicodeString):Bool;
    @:op(A <= B) static function lte(a:UnicodeString, b:UnicodeString):Bool;
    @:op(A > B) static function gt(a:UnicodeString, b:UnicodeString):Bool;
    @:op(A >= B) static function gte(a:UnicodeString, b:UnicodeString):Bool;
    @:op(A == B) static function eq(a:UnicodeString, b:UnicodeString):Bool;
    @:op(A != B) static function neq(a:UnicodeString, b:UnicodeString):Bool;
    @:op(A + B) static function add(a:UnicodeString, b:UnicodeString):UnicodeString;
    @:op(A += B) static function assignAdd(a:UnicodeString, b:UnicodeString):UnicodeString;
    @:op(A + B) @:commutative static function add(a:UnicodeString, b:String):UnicodeString;
    @:op(A += B) @:commutative static function assignAdd(a:UnicodeString, b:String):UnicodeString;
  }