pcre2_script_run.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. /*************************************************
  2. * Perl-Compatible Regular Expressions *
  3. *************************************************/
  4. /* PCRE is a library of functions to support regular expressions whose syntax
  5. and semantics are as close as possible to those of the Perl 5 language.
  6. Written by Philip Hazel
  7. Original API code Copyright (c) 1997-2012 University of Cambridge
  8. New API code Copyright (c) 2016-2021 University of Cambridge
  9. -----------------------------------------------------------------------------
  10. Redistribution and use in source and binary forms, with or without
  11. modification, are permitted provided that the following conditions are met:
  12. * Redistributions of source code must retain the above copyright notice,
  13. this list of conditions and the following disclaimer.
  14. * Redistributions in binary form must reproduce the above copyright
  15. notice, this list of conditions and the following disclaimer in the
  16. documentation and/or other materials provided with the distribution.
  17. * Neither the name of the University of Cambridge nor the names of its
  18. contributors may be used to endorse or promote products derived from
  19. this software without specific prior written permission.
  20. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21. AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  24. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27. INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28. CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29. ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30. POSSIBILITY OF SUCH DAMAGE.
  31. -----------------------------------------------------------------------------
  32. */
  33. /* This module contains the function for checking a script run. */
  34. #ifdef HAVE_CONFIG_H
  35. #include "config.h"
  36. #endif
  37. #include "pcre2_internal.h"
  38. /*************************************************
  39. * Check script run *
  40. *************************************************/
  41. /* A script run is conceptually a sequence of characters all in the same
  42. Unicode script. However, it isn't quite that simple. There are special rules
  43. for scripts that are commonly used together, and also special rules for digits.
  44. This function implements the appropriate checks, which is possible only when
  45. PCRE2 is compiled with Unicode support. The function returns TRUE if there is
  46. no Unicode support; however, it should never be called in that circumstance
  47. because an error is given by pcre2_compile() if a script run is called for in a
  48. version of PCRE2 compiled without Unicode support.
  49. Arguments:
  50. ptr points to the first character
  51. endptr points after the last character
  52. utf TRUE if in UTF mode
  53. Returns: TRUE if this is a valid script run
  54. */
  /* States of the script-run checker. Each one records what has been learned so
  far about the script(s) that the remaining characters must be compatible
  with. */

  enum {
    SCRIPT_UNSET = 0,      /* No requirement has been established yet */
    SCRIPT_MAP,            /* A bitmap holds the set of acceptable scripts */
    SCRIPT_HANPENDING,     /* Only Han characters have been seen so far */
    SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
    SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
    SCRIPT_HANHANGUL       /* Expect Han or Hangul */
  };
  /* Sizes, in 32-bit words, of the two kinds of script bitmap. UCD_MAPSIZE is
  sized from ucp_Unknown and so covers only the scripts numbered below it (those
  that appear in script extension lists); FULL_MAPSIZE is sized from
  ucp_Script_Count and has a bit for every script. */

  #define UCD_MAPSIZE   ((ucp_Unknown / 32) + 1)
  #define FULL_MAPSIZE  ((ucp_Script_Count / 32) + 1)
  65. BOOL
  66. PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
  67. {
  68. #ifdef SUPPORT_UNICODE
  69. uint32_t require_state = SCRIPT_UNSET;
  70. uint32_t require_map[FULL_MAPSIZE];
  71. uint32_t map[FULL_MAPSIZE];
  72. uint32_t require_digitset = 0;
  73. uint32_t c;
  74. #if PCRE2_CODE_UNIT_WIDTH == 32
  75. (void)utf; /* Avoid compiler warning */
  76. #endif
  77. /* Any string containing fewer than 2 characters is a valid script run. */
  78. if (ptr >= endptr) return TRUE;
  79. GETCHARINCTEST(c, ptr);
  80. if (ptr >= endptr) return TRUE;
  81. /* Initialize the require map. This is a full-size bitmap that has a bit for
  82. every script, as opposed to the maps in ucd_script_sets, which only have bits
  83. for scripts less than ucp_Unknown - those that appear in script extension
  84. lists. */
  85. for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
  86. /* Scan strings of two or more characters, checking the Unicode characteristics
  87. of each code point. There is special code for scripts that can be combined with
  88. characters from the Han Chinese script. This may be used in conjunction with
  89. four other scripts in these combinations:
  90. . Han with Hiragana and Katakana is allowed (for Japanese).
  91. . Han with Bopomofo is allowed (for Taiwanese Mandarin).
  92. . Han with Hangul is allowed (for Korean).
  93. If the first significant character's script is one of the four, the required
  94. script type is immediately known. However, if the first significant
  95. character's script is Han, we have to keep checking for a non-Han character.
  96. Hence the SCRIPT_HANPENDING state. */
  97. for (;;)
  98. {
  99. const ucd_record *ucd = GET_UCD(c);
  100. uint32_t script = ucd->script;
  101. /* If the script is Unknown, the string is not a valid script run. Such
  102. characters can only form script runs of length one (see test above). */
  103. if (script == ucp_Unknown) return FALSE;
  104. /* A character without any script extensions whose script is Inherited or
  105. Common is always accepted with any script. If there are extensions, the
  106. following processing happens for all scripts. */
  107. if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
  108. {
  109. BOOL OK;
  110. /* Set up a full-sized map for this character that can include bits for all
  111. scripts. Copy the scriptx map for this character (which covers those
  112. scripts that appear in script extension lists), set the remaining values to
  113. zero, and then, except for Common or Inherited, add this script's bit to
  114. the map. */
  115. memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
  116. memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
  117. if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
  118. /* Handle the different checking states */
  119. switch(require_state)
  120. {
  121. /* First significant character - it might follow Common or Inherited
  122. characters that do not have any script extensions. */
  123. case SCRIPT_UNSET:
  124. switch(script)
  125. {
  126. case ucp_Han:
  127. require_state = SCRIPT_HANPENDING;
  128. break;
  129. case ucp_Hiragana:
  130. case ucp_Katakana:
  131. require_state = SCRIPT_HANHIRAKATA;
  132. break;
  133. case ucp_Bopomofo:
  134. require_state = SCRIPT_HANBOPOMOFO;
  135. break;
  136. case ucp_Hangul:
  137. require_state = SCRIPT_HANHANGUL;
  138. break;
  139. default:
  140. memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
  141. require_state = SCRIPT_MAP;
  142. break;
  143. }
  144. break;
  145. /* The first significant character was Han. An inspection of the Unicode
  146. 11.0.0 files shows that there are the following types of Script Extension
  147. list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
  148. scripts:
  149. . Bopomofo + Han
  150. . Han + Hiragana + Katakana
  151. . Hiragana + Katakana
  152. . Bopopmofo + Hangul + Han + Hiragana + Katakana
  153. The following code tries to make sense of this. */
  154. #define FOUND_BOPOMOFO 1
  155. #define FOUND_HIRAGANA 2
  156. #define FOUND_KATAKANA 4
  157. #define FOUND_HANGUL 8
  158. case SCRIPT_HANPENDING:
  159. if (script != ucp_Han) /* Another Han does nothing */
  160. {
  161. uint32_t chspecial = 0;
  162. if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
  163. if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
  164. if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
  165. if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
  166. if (chspecial == 0) return FALSE; /* Not allowed with Han */
  167. if (chspecial == FOUND_BOPOMOFO)
  168. require_state = SCRIPT_HANBOPOMOFO;
  169. else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
  170. require_state = SCRIPT_HANHIRAKATA;
  171. /* Otherwise this character must be allowed with all of them, so remain
  172. in the pending state. */
  173. }
  174. break;
  175. /* Previously encountered one of the "with Han" scripts. Check that
  176. this character is appropriate. */
  177. case SCRIPT_HANHIRAKATA:
  178. if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
  179. MAPBIT(map, ucp_Katakana) == 0) return FALSE;
  180. break;
  181. case SCRIPT_HANBOPOMOFO:
  182. if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
  183. break;
  184. case SCRIPT_HANHANGUL:
  185. if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
  186. break;
  187. /* Previously encountered one or more characters that are allowed with a
  188. list of scripts. */
  189. case SCRIPT_MAP:
  190. OK = FALSE;
  191. for (int i = 0; i < FULL_MAPSIZE; i++)
  192. {
  193. if ((require_map[i] & map[i]) != 0)
  194. {
  195. OK = TRUE;
  196. break;
  197. }
  198. }
  199. if (!OK) return FALSE;
  200. /* The rest of the string must be in this script, but we have to
  201. allow for the Han complications. */
  202. switch(script)
  203. {
  204. case ucp_Han:
  205. require_state = SCRIPT_HANPENDING;
  206. break;
  207. case ucp_Hiragana:
  208. case ucp_Katakana:
  209. require_state = SCRIPT_HANHIRAKATA;
  210. break;
  211. case ucp_Bopomofo:
  212. require_state = SCRIPT_HANBOPOMOFO;
  213. break;
  214. case ucp_Hangul:
  215. require_state = SCRIPT_HANHANGUL;
  216. break;
  217. /* Compute the intersection of the required list of scripts and the
  218. allowed scripts for this character. */
  219. default:
  220. for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
  221. break;
  222. }
  223. break;
  224. }
  225. } /* End checking character's script and extensions. */
  226. /* The character is in an acceptable script. We must now ensure that all
  227. decimal digits in the string come from the same set. Some scripts (e.g.
  228. Common, Arabic) have more than one set of decimal digits. This code does
  229. not allow mixing sets, even within the same script. The vector called
  230. PRIV(ucd_digit_sets)[] contains, in its first element, the number of
  231. following elements, and then, in ascending order, the code points of the
  232. '9' characters in every set of 10 digits. Each set is identified by the
  233. offset in the vector of its '9' character. An initial check of the first
  234. value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
  235. if (ucd->chartype == ucp_Nd)
  236. {
  237. uint32_t digitset;
  238. if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
  239. {
  240. int mid;
  241. int bot = 1;
  242. int top = PRIV(ucd_digit_sets)[0];
  243. for (;;)
  244. {
  245. if (top <= bot + 1) /* <= rather than == is paranoia */
  246. {
  247. digitset = top;
  248. break;
  249. }
  250. mid = (top + bot) / 2;
  251. if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
  252. }
  253. }
  254. /* A required value of 0 means "unset". */
  255. if (require_digitset == 0) require_digitset = digitset;
  256. else if (digitset != require_digitset) return FALSE;
  257. } /* End digit handling */
  258. /* If we haven't yet got to the end, pick up the next character. */
  259. if (ptr >= endptr) return TRUE;
  260. GETCHARINCTEST(c, ptr);
  261. } /* End checking loop */
  262. #else /* NOT SUPPORT_UNICODE */
  263. (void)ptr;
  264. (void)endptr;
  265. (void)utf;
  266. return TRUE;
  267. #endif /* SUPPORT_UNICODE */
  268. }
  269. /* End of pcre2_script_run.c */