uTF8.ml 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. (*
  2. * UTF-8 - UTF-8 encoded Unicode string
  3. * Copyright 2002, 2003 (C) Yamagata Yoriyuki.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2.1 of the License, or (at your option) any later version,
  9. * with the special exception on linking described in file LICENSE.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  19. *)
  20. open UCharExt
  21. type t = string
  22. type index = int
  23. let look s i =
  24. let n' =
  25. let n = Char.code s.[i] in
  26. if n < 0x80 then n else
  27. if n <= 0xdf then
  28. (n - 0xc0) lsl 6 lor (0x7f land (Char.code s.[i + 1]))
  29. else if n <= 0xef then
  30. let n' = n - 0xe0 in
  31. let m0 = Char.code s.[i + 2] in
  32. let m = Char.code (String.unsafe_get s (i + 1)) in
  33. let n' = n' lsl 6 lor (0x7f land m) in
  34. n' lsl 6 lor (0x7f land m0)
  35. else if n <= 0xf7 then
  36. let n' = n - 0xf0 in
  37. let m0 = Char.code s.[i + 3] in
  38. let m = Char.code (String.unsafe_get s (i + 1)) in
  39. let n' = n' lsl 6 lor (0x7f land m) in
  40. let m = Char.code (String.unsafe_get s (i + 2)) in
  41. let n' = n' lsl 6 lor (0x7f land m) in
  42. n' lsl 6 lor (0x7f land m0)
  43. else if n <= 0xfb then
  44. let n' = n - 0xf8 in
  45. let m0 = Char.code s.[i + 4] in
  46. let m = Char.code (String.unsafe_get s (i + 1)) in
  47. let n' = n' lsl 6 lor (0x7f land m) in
  48. let m = Char.code (String.unsafe_get s (i + 2)) in
  49. let n' = n' lsl 6 lor (0x7f land m) in
  50. let m = Char.code (String.unsafe_get s (i + 3)) in
  51. let n' = n' lsl 6 lor (0x7f land m) in
  52. n' lsl 6 lor (0x7f land m0)
  53. else if n <= 0xfd then
  54. let n' = n - 0xfc in
  55. let m0 = Char.code s.[i + 5] in
  56. let m = Char.code (String.unsafe_get s (i + 1)) in
  57. let n' = n' lsl 6 lor (0x7f land m) in
  58. let m = Char.code (String.unsafe_get s (i + 2)) in
  59. let n' = n' lsl 6 lor (0x7f land m) in
  60. let m = Char.code (String.unsafe_get s (i + 3)) in
  61. let n' = n' lsl 6 lor (0x7f land m) in
  62. let m = Char.code (String.unsafe_get s (i + 4)) in
  63. let n' = n' lsl 6 lor (0x7f land m) in
  64. n' lsl 6 lor (0x7f land m0)
  65. else invalid_arg "UTF8.look"
  66. in
  67. Obj.magic n'
  68. let rec search_head s i =
  69. if i >= String.length s then i else
  70. let n = Char.code (String.unsafe_get s i) in
  71. if n < 0x80 || n >= 0xc2 then i else
  72. search_head s (i + 1)
  73. let next s i =
  74. let n = Char.code s.[i] in
  75. if n < 0x80 then i + 1 else
  76. if n < 0xc0 then search_head s (i + 1) else
  77. if n <= 0xdf then i + 2
  78. else if n <= 0xef then i + 3
  79. else if n <= 0xf7 then i + 4
  80. else if n <= 0xfb then i + 5
  81. else if n <= 0xfd then i + 6
  82. else invalid_arg "UTF8.next"
  83. let rec search_head_backward s i =
  84. if i < 0 then -1 else
  85. let n = Char.code s.[i] in
  86. if n < 0x80 || n >= 0xc2 then i else
  87. search_head_backward s (i - 1)
  88. let prev s i = search_head_backward s (i - 1)
  89. let move s i n =
  90. if n >= 0 then
  91. let rec loop i n = if n <= 0 then i else loop (next s i) (n - 1) in
  92. loop i n
  93. else
  94. let rec loop i n = if n >= 0 then i else loop (prev s i) (n + 1) in
  95. loop i n
  96. let rec nth_aux s i n =
  97. if n = 0 then i else
  98. nth_aux s (next s i) (n - 1)
  99. let nth s n = nth_aux s 0 n
  100. let last s = search_head_backward s (String.length s - 1)
  101. let out_of_range s i = i < 0 || i >= String.length s
  102. let compare_index _ i j = i - j
  103. let get s n = look s (nth s n)
  104. let add_uchar buf u =
  105. let masq = 0b111111 in
  106. let k = int_of_uchar u in
  107. if k < 0 || k >= 0x4000000 then begin
  108. Buffer.add_char buf (Char.chr (0xfc + (k lsr 30)));
  109. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 24) land masq)));
  110. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 18) land masq)));
  111. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
  112. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
  113. Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
  114. end else if k <= 0x7f then
  115. Buffer.add_char buf (Char.unsafe_chr k)
  116. else if k <= 0x7ff then begin
  117. Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (k lsr 6)));
  118. Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)))
  119. end else if k <= 0xffff then begin
  120. Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (k lsr 12)));
  121. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
  122. Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
  123. end else if k <= 0x1fffff then begin
  124. Buffer.add_char buf (Char.unsafe_chr (0xf0 + (k lsr 18)));
  125. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
  126. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
  127. Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
  128. end else begin
  129. Buffer.add_char buf (Char.unsafe_chr (0xf8 + (k lsr 24)));
  130. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 18) land masq)));
  131. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 12) land masq)));
  132. Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((k lsr 6) land masq)));
  133. Buffer.add_char buf (Char.unsafe_chr (0x80 lor (k land masq)));
  134. end
  135. let init len f =
  136. let buf = Buffer.create len in
  137. for c = 0 to len - 1 do add_uchar buf (f c) done;
  138. Buffer.contents buf
  139. let rec length_aux s c i =
  140. if i >= String.length s then c else
  141. let n = Char.code (String.unsafe_get s i) in
  142. let k =
  143. if n < 0x80 then 1 else
  144. if n < 0xc0 then invalid_arg "UTF8.length" else
  145. if n < 0xe0 then 2 else
  146. if n < 0xf0 then 3 else
  147. if n < 0xf8 then 4 else
  148. if n < 0xfc then 5 else
  149. if n < 0xfe then 6 else
  150. invalid_arg "UTF8.length" in
  151. length_aux s (c + 1) (i + k)
  152. let length s = length_aux s 0 0
  153. let rec iter_aux proc s i =
  154. if i >= String.length s then () else
  155. let u = look s i in
  156. proc u;
  157. iter_aux proc s (next s i)
  158. let iter proc s = iter_aux proc s 0
  159. let compare s1 s2 = Pervasives.compare s1 s2
  160. exception Malformed_code
  161. let validate s =
  162. let rec trail c i a =
  163. if c = 0 then a else
  164. if i >= String.length s then raise Malformed_code else
  165. let n = Char.code (String.unsafe_get s i) in
  166. if n < 0x80 || n >= 0xc0 then raise Malformed_code else
  167. trail (c - 1) (i + 1) (a lsl 6 lor (n - 0x80)) in
  168. let rec main i =
  169. if i >= String.length s then () else
  170. let n = Char.code (String.unsafe_get s i) in
  171. if n < 0x80 then main (i + 1) else
  172. if n < 0xc2 then raise Malformed_code else
  173. if n <= 0xdf then
  174. if trail 1 (i + 1) (n - 0xc0) < 0x80 then raise Malformed_code else
  175. main (i + 2)
  176. else if n <= 0xef then
  177. if trail 2 (i + 1) (n - 0xe0) < 0x800 then raise Malformed_code else
  178. main (i + 3)
  179. else if n <= 0xf7 then
  180. if trail 3 (i + 1) (n - 0xf0) < 0x10000 then raise Malformed_code else
  181. main (i + 4)
  182. else if n <= 0xfb then
  183. if trail 4 (i + 1) (n - 0xf8) < 0x200000 then raise Malformed_code else
  184. main (i + 5)
  185. else if n <= 0xfd then
  186. let n = trail 5 (i + 1) (n - 0xfc) in
  187. if n lsr 16 < 0x400 then raise Malformed_code else
  188. main (i + 6)
  189. else raise Malformed_code in
  190. main 0
  191. module Buf =
  192. struct
  193. include Buffer
  194. type buf = t
  195. let add_char = add_uchar
  196. end