Multibyte.php 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. <?php
  2. /**
  3. * Lithium: the most rad php framework
  4. *
  5. * @copyright Copyright 2013, Union of RAD (http://union-of-rad.org)
  6. * @license http://opensource.org/licenses/bsd-license.php The BSD License
  7. */
  8. namespace lithium\g11n;
  9. use lithium\core\Libraries;
  10. /**
  11. * The `Multibyte` class helps operating with UTF-8 encoded strings. Here
  12. * multibyte is synonymous to UTF-8 which is probably the most widespread
  13. * multibyte encoding in recent web application development.
  14. *
  15. * Over time - as the importance of multibyte encoding support grew - a variety
  16. * of extensions appeared. While each achieves its goal somewhat differently
  17. * and might be preferred over the other, they still all do that one thing.
  18. *
  19. * What can a framework provide, those extensions aren't? It can provide
  20. * abstractions that allow portable code. While this might not be a requirement
  21. * for application code, it's a definite must for the framework's core code.
  22. *
  23. * As previously mentioned extensions appeared in a semi-evolutionary way. This
  24. * leaves us with the situation where extensions are heterogeneously spread out
  25. * over environments. There certainly is no clear winner and we're left with
  26. * the situation of "supporting them all".
  27. *
  28. * Technically this class does very little in terms of abstraction. Its main
  29. * purpose is to allow adapting to changing environments: virtually creating
  30. * something you can rely on, something that's always there while it actually
  31. * is there only in one way or the other. And - yes - some convenience methods
  32. * are also on board.
  33. */
  34. class Multibyte extends \lithium\core\Adaptable {
  35. /**
  36. * Contains adapter configurations for `Multibyte` adapters.
  37. *
  38. * @var array
  39. */
  40. protected static $_configurations = array();
  41. /**
  42. * `Libraries::locate()`-compatible path to adapters for this class.
  43. *
  44. * @see lithium\core\Libraries::locate()
  45. * @var string Dot-delimited path.
  46. */
  47. protected static $_adapters = 'adapter.g11n.multibyte';
  48. /**
  49. * Checks if a given string is UTF-8 encoded and is valid UTF-8.
  50. *
  51. * In _quick_ mode it will check only for non ASCII characters being used
  52. * indicating any multibyte encoding. Don't use quick mode for integrity
  53. * validation of UTF-8 encoded strings.
  54. *
  55. * @link http://www.w3.org/International/questions/qa-forms-utf-8.en
  56. * @param string $string The string to analyze.
  57. * @param array $options Allows to toggle mode via the `'quick'` option, defaults to `false`.
  58. * @return boolean Returns `true` if the string is UTF-8.
  59. */
  60. public static function is($string, array $options = array()) {
  61. $defaults = array('quick' => false);
  62. $options += $defaults;
  63. if ($options['quick']) {
  64. $regex = '/[^\x09\x0A\x0D\x20-\x7E]/m';
  65. } else {
  66. $regex = '/\A(';
  67. $regex .= '[\x09\x0A\x0D\x20-\x7E]'; // ASCII
  68. $regex .= '|[\xC2-\xDF][\x80-\xBF]'; // non-overlong 2-byte
  69. $regex .= '|\xE0[\xA0-\xBF][\x80-\xBF]'; // excluding overlongs
  70. $regex .= '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'; // straight 3-byte
  71. $regex .= '|\xED[\x80-\x9F][\x80-\xBF]'; // excluding surrogates
  72. $regex .= '|\xF0[\x90-\xBF][\x80-\xBF]{2}'; // planes 1-3
  73. $regex .= '|[\xF1-\xF3][\x80-\xBF]{3}'; // planes 4-15
  74. $regex .= '|\xF4[\x80-\x8F][\x80-\xBF]{2}'; // plane 16
  75. $regex .= ')*\z/m';
  76. }
  77. return (boolean) preg_match($regex, $string);
  78. }
  79. /**
  80. * Gets the string length. Multibyte enabled version of `strlen()`.
  81. *
  82. * @link http://php.net/manual/en/function.strlen.php
  83. * @param string $string The string being measured for length.
  84. * @param array $options Allows for selecting the adapter to use via the
  85. * `name` options. Will use the `'default'` adapter by default.
  86. * @return integer The length of the string on success.
  87. */
  88. public static function strlen($string, array $options = array()) {
  89. $defaults = array('name' => 'default');
  90. $options += $defaults;
  91. return static::adapter($options['name'])->strlen($string);
  92. }
  93. /**
  94. * Finds the position of the _first_ occurrence of a string within a string.
  95. * Multibyte enabled version of `strpos()`.
  96. *
  97. * Not all adapters must support interpreting - thus applying - passed
  98. * numeric values as ordinal values of a character.
  99. *
  100. * @link http://php.net/manual/en/function.strpos.php
  101. * @param string $haystack The string being checked.
  102. * @param string $needle The string to find in the haystack.
  103. * @param integer $offset If specified, search will start this number of
  104. * characters counted from the beginning of the string. The
  105. * offset cannot be negative.
  106. * @param array $options Allows for selecting the adapter to use via the
  107. * `name` options. Will use the `'default'` adapter by default.
  108. * @return integer Returns the numeric position of the first occurrence of
  109. * the needle in the haystack string. If needle is not found,
  110. * it returns `false`.
  111. */
  112. public static function strpos($haystack, $needle, $offset = 0, array $options = array()) {
  113. $defaults = array('name' => 'default');
  114. $options += $defaults;
  115. return static::adapter($options['name'])->strpos($haystack, $needle, $offset);
  116. }
  117. /**
  118. * Finds the position of the _last_ occurrence of a string within a string.
  119. * Multibyte enabled version of `strrpos()`.
  120. *
  121. * Not all adapters must support interpreting - thus applying - passed
  122. * numeric values as ordinal values of a character. The `Iconv` adapter
  123. * doesn't support an offset as `strpos()` does - this constitutes the
  124. * lowest common denominator here.
  125. *
  126. * @link http://php.net/manual/en/function.strrpos.php
  127. * @param string $haystack The string being checked.
  128. * @param string $needle The string to find in the haystack.
  129. * @param array $options Allows for selecting the adapter to use via the
  130. * `name` options. Will use the `'default'` adapter by default.
  131. * @return integer Returns the numeric position of the last occurrence of
  132. * the needle in the haystack string. If needle is not found,
  133. * it returns `false`.
  134. */
  135. public static function strrpos($haystack, $needle, array $options = array()) {
  136. $defaults = array('name' => 'default');
  137. $options += $defaults;
  138. return static::adapter($options['name'])->strrpos($haystack, $needle);
  139. }
  140. /**
  141. * Returns the portion of string specified by the start and length parameters.
  142. * Multibyte enabled version of `substr()`.
  143. *
  144. * @link http://php.net/manual/en/function.substr.php
  145. * @param string $string The string to extract the substring from.
  146. * @param integer $start Position of first character in string (offset).
  147. * @param integer $length Maximum numbers of characters to use from string.
  148. * @param array $options Allows for selecting the adapter to use via the
  149. * `name` options. Will use the `'default'` adapter by default.
  150. * @return string The substring extracted from given string.
  151. */
  152. public static function substr($string, $start, $length = null, array $options = array()) {
  153. $defaults = array('name' => 'default');
  154. $options += $defaults;
  155. return static::adapter($options['name'])->substr($string, $start, $length);
  156. }
  157. }
  158. ?>