uTF8.mli 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. (*
  2. * UTF-8 - UTF-8 encoded Unicode string
  3. * Copyright 2002, 2003 (C) Yamagata Yoriyuki.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2.1 of the License, or (at your option) any later version,
  9. * with the special exception on linking described in file LICENSE.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  19. *)
  20. (** UTF-8 encoded Unicode strings.
  21. This module handles UTF-8 encoded Unicode strings.
  22. *)
  23. open UChar
  24. (** UTF-8 encoded Unicode strings. The type is an ordinary [string]. *)
  25. type t = string
  26. exception Malformed_code (* Raised by [validate] when its argument is not valid UTF-8. *)
  27. (** [validate s]
  28. succeeds if [s] is valid UTF-8; otherwise it raises [Malformed_code].
  29. The other functions assume that strings are valid UTF-8, so it is prudent
  30. to validate strings from untrusted origins before using them. *)
  31. val validate : t -> unit
  32. (* All functions below assume that their string arguments are valid UTF-8.
  33. * If they are not, the result is unspecified. *)
  34. (** [get s n] returns the [n]-th Unicode character of [s].
  35. The call takes O(n) time. *)
  36. val get : t -> int -> uchar
  37. (** [init len f]
  38. returns a new string containing [len] Unicode characters.
  39. The [i]-th Unicode character is initialized to [f i]. *)
  40. val init : int -> (int -> uchar) -> t
  41. (** [length s] returns the number of Unicode characters contained in [s]. *)
  42. val length : t -> int
  43. (** A position in a string, represented by the number of bytes from the head.
  44. The position of the first character is [0]. *)
  45. type index = int
  46. (** [nth s n] returns the position of the [n]-th Unicode character of [s].
  47. The call takes O(n) time. *)
  48. val nth : t -> int -> index
  49. (** [last s] returns the position of the head of the last Unicode character of [s]. *)
  50. val last : t -> index
  51. (** [look s i]
  52. returns the Unicode character at the position [i] in the string [s]. *)
  53. val look : t -> index -> uchar
  54. (** [out_of_range s i] tests whether [i] is a position inside of [s].
  55. NOTE(review): the name suggests the result is [true] when [i] is NOT inside [s]; this text does not state the polarity -- confirm against the implementation. *)
  56. val out_of_range : t -> index -> bool
  57. (** [compare_index s i1 i2] returns
  58. a value < 0 if [i1] is located before [i2],
  59. 0 if [i1] and [i2] point to the same location,
  60. a value > 0 if [i1] is located after [i2]. *)
  61. val compare_index : t -> index -> index -> int
  62. (** [next s i]
  63. returns the position of the head of the Unicode character
  64. located immediately after [i].
  65. If [i] is inside of [s], the function always succeeds.
  66. If [i] is inside of [s] and there is no Unicode character after [i],
  67. a position outside of [s] is returned.
  68. If [i] is not inside of [s], the behaviour is unspecified. *)
  69. val next : t -> index -> index
  70. (** [prev s i]
  71. returns the position of the head of the Unicode character
  72. located immediately before [i].
  73. If [i] is inside of [s], the function always succeeds.
  74. If [i] is inside of [s] and there is no Unicode character before [i],
  75. a position outside of [s] is returned.
  76. If [i] is not inside of [s], the behaviour is unspecified. *)
  77. val prev : t -> index -> index
  78. (** [move s i n]
  79. returns the position of the [n]-th Unicode character after [i] if n >= 0,
  80. or of the [n]-th Unicode character before [i] if n < 0.
  81. If there is no such character, the result is unspecified. *)
  82. val move : t -> index -> int -> index
  83. (** [iter f s]
  84. applies [f] to every Unicode character in [s].
  85. The order of application is the same as the order
  86. of the Unicode characters in [s]. *)
  87. val iter : (uchar -> unit) -> t -> unit
  88. (** Comparison of strings by the lexicographic order of their code points.
  89. [compare s1 s2] returns
  90. a positive integer if [s1] > [s2],
  91. 0 if [s1] = [s2],
  92. a negative integer if [s1] < [s2]. *)
  93. val compare : t -> t -> int
  94. val add_uchar : Buffer.t -> uchar -> unit (* NOTE(review): undocumented in this interface; from the signature this presumably appends the UTF-8 encoding of the character to the buffer -- confirm against the implementation. *)
  95. (** Buffer module for UTF-8 strings. *)
  96. module Buf : sig
  97. (** Buffers for UTF-8 strings. *)
  98. type buf
  99. (** [create n] creates a buffer with an initial size of [n] bytes. *)
  100. val create : int -> buf
  101. (* The remaining functions are similar to those of Buffer in the stdlib. *)
  102. (** [contents buf] returns the contents of the buffer. *)
  103. val contents : buf -> t
  104. (** [clear buf] empties the buffer,
  105. but retains the internal storage which was holding the contents. *)
  106. val clear : buf -> unit
  107. (** [reset buf] empties the buffer and de-allocates the internal storage. *)
  108. val reset : buf -> unit
  109. (** [add_char buf u] adds the Unicode character [u] to the buffer. *)
  110. val add_char : buf -> uchar -> unit
  111. (** [add_string buf s] adds the UTF-8 string [s] to the buffer. *)
  112. val add_string : buf -> t -> unit
  113. (** [add_buffer b1 b2] adds the contents of [b2] to [b1].
  114. The contents of [b2] are not changed. *)
  115. val add_buffer : buf -> buf -> unit
  116. end