xml.c 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. /**
  2. * @file
  3. * @brief @ref xml_escape
  4. * @ingroup common_utils
  5. */
  6. #include <cgraph/gv_ctype.h>
  7. #include <common/types.h>
  8. #include <common/utils.h>
  9. #include <inttypes.h>
  10. #include <stdbool.h>
  11. #include <stdint.h>
  12. #include <stdio.h>
  13. #include <stdlib.h>
  14. #include <util/exit.h>
  15. #include <util/unreachable.h>
  16. /* return true if *s points to &[A-Za-z]+; (e.g. &Ccedil; )
  17. * or &#[0-9]*; (e.g. &#38; )
  18. * or &#x[0-9a-fA-F]*; (e.g. &#x6C34; )
  19. */
  20. static bool xml_isentity(const char *s) {
  21. s++; /* already known to be '&' */
  22. if (*s == ';') { // '&;' is not a valid entity
  23. return false;
  24. }
  25. if (*s == '#') {
  26. s++;
  27. if (*s == 'x' || *s == 'X') {
  28. s++;
  29. while (gv_isxdigit(*s))
  30. s++;
  31. } else {
  32. while (gv_isdigit(*s))
  33. s++;
  34. }
  35. } else {
  36. while (gv_isalpha(*s))
  37. s++;
  38. }
  39. if (*s == ';')
  40. return true;
  41. return false;
  42. }
  43. /** XML-escape a character
  44. *
  45. * \param previous The source character preceding the current one or '\0' if
  46. * there was no prior character.
  47. * \param[in, out] current Pointer to the current position in a source string
  48. * being escaped. The pointer is updated based on how many characters are
  49. * consumed.
  50. * \param flags Options for configuring behavior.
  51. * \param cb User function for emitting escaped data. This is expected to take a
  52. * caller-defined state type as the first parameter and the string to emit as
  53. * the second, and then return an opaque value that is passed back to the
  54. * caller.
  55. * \param state Data to pass as the first parameter when calling `cb`.
  56. * \return The return value of a call to `cb`.
  57. */
  58. static int xml_core(char previous, const char **current, xml_flags_t flags,
  59. int (*cb)(void *state, const char *s), void *state) {
  60. const char *s = *current;
  61. char c = *s;
  62. // we will consume at least one character, so note that now
  63. ++*current;
  64. // escape '&' only if not part of a legal entity sequence
  65. if (c == '&' && (flags.raw || !xml_isentity(s)))
  66. return cb(state, "&amp;");
  67. // '<' '>' are safe to substitute even if string is already XML encoded since
  68. // XML strings won’t contain '<' or '>'
  69. if (c == '<')
  70. return cb(state, "&lt;");
  71. if (c == '>')
  72. return cb(state, "&gt;");
  73. // '-' cannot be used in XML comment strings
  74. if (c == '-' && flags.dash)
  75. return cb(state, "&#45;");
  76. if (c == ' ' && previous == ' ' && flags.nbsp)
  77. // substitute 2nd and subsequent spaces with required_spaces
  78. return cb(state, "&#160;"); // Inkscape does not recognize &nbsp;
  79. if (c == '"')
  80. return cb(state, "&quot;");
  81. if (c == '\'')
  82. return cb(state, "&#39;");
  83. if (c == '\n' && flags.raw)
  84. return cb(state, "&#10;");
  85. if (c == '\r' && flags.raw)
  86. return cb(state, "&#13;");
  87. unsigned char uc = (unsigned char)c;
  88. if (uc > 0x7f && flags.utf8) {
  89. // replicating a table from https://en.wikipedia.org/wiki/UTF-8:
  90. //
  91. // ┌────────────────┬───────────────┬────────┬────────┬────────┬────────┐
  92. // │First code point│Last code point│Byte 1 │Byte 2 │Byte 3 │Byte 4 │
  93. // ├────────────────┼───────────────┼────────┼────────┼────────┼────────┤
  94. // │ U+0000│ U+007F│0xxxxxxx│ │ │ │
  95. // │ U+0080│ U+07FF│110xxxxx│10xxxxxx│ │ │
  96. // │ U+0800│ U+FFFF│1110xxxx│10xxxxxx│10xxxxxx│ │
  97. // │ U+10000│ U+10FFFF│11110xxx│10xxxxxx│10xxxxxx│10xxxxxx│
  98. // └────────────────┴───────────────┴────────┴────────┴────────┴────────┘
  99. //
  100. // from which we can calculate the byte length of the current character
  101. size_t length = (uc >> 5) == 6 ? 2
  102. : (uc >> 4) == 14 ? 3
  103. : (uc >> 3) == 30 ? 4
  104. : 0;
  105. // was the length malformed or is the follow on sequence truncated?
  106. bool is_invalid = length == 0;
  107. for (size_t l = 1; !is_invalid && length > l; ++l)
  108. is_invalid |= s[l] == '\0';
  109. // TODO: a better strategy than aborting on malformed data
  110. if (is_invalid) {
  111. fprintf(stderr, "Error during conversion to \"UTF-8\". Quiting.\n");
  112. graphviz_exit(EXIT_FAILURE);
  113. }
  114. // Decode the character. Refer again to the above table to understand this
  115. // algorithm.
  116. uint32_t utf8_char = 0;
  117. switch (length) {
  118. case 2: {
  119. uint32_t low = ((uint32_t)s[1]) & ((1 << 6) - 1);
  120. uint32_t high = ((uint32_t)s[0]) & ((1 << 5) - 1);
  121. utf8_char = low | (high << 6);
  122. break;
  123. }
  124. case 3: {
  125. uint32_t low = ((uint32_t)s[2]) & ((1 << 6) - 1);
  126. uint32_t mid = ((uint32_t)s[1]) & ((1 << 6) - 1);
  127. uint32_t high = ((uint32_t)s[0]) & ((1 << 4) - 1);
  128. utf8_char = low | (mid << 6) | (high << 12);
  129. break;
  130. }
  131. case 4: {
  132. uint32_t low = ((uint32_t)s[3]) & ((1 << 6) - 1);
  133. uint32_t mid1 = ((uint32_t)s[2]) & ((1 << 6) - 1);
  134. uint32_t mid2 = ((uint32_t)s[1]) & ((1 << 6) - 1);
  135. uint32_t high = ((uint32_t)s[0]) & ((1 << 3) - 1);
  136. utf8_char = low | (mid1 << 6) | (mid2 << 12) | (high << 18);
  137. break;
  138. }
  139. default:
  140. UNREACHABLE();
  141. }
  142. // setup a buffer that will fit the largest escape we need to print
  143. char buffer[sizeof("&#xFFFFFFFF;")];
  144. // emit the escape sequence itself
  145. snprintf(buffer, sizeof(buffer), "&#x%" PRIx32 ";", utf8_char);
  146. // note how many extra characters we consumed
  147. *current += length - 1;
  148. return cb(state, buffer);
  149. }
  150. // otherwise, output the character as-is
  151. char buffer[2] = {c, '\0'};
  152. return cb(state, buffer);
  153. }
  154. int xml_escape(const char *s, xml_flags_t flags,
  155. int (*cb)(void *state, const char *s), void *state) {
  156. char previous = '\0';
  157. int rc = 0;
  158. while (*s != '\0') {
  159. char p = *s;
  160. rc = xml_core(previous, &s, flags, cb, state);
  161. if (rc < 0)
  162. return rc;
  163. previous = p;
  164. }
  165. return rc;
  166. }
  167. #ifdef TEST_XML
  168. // compile the below test stub with:
  169. //
  170. // ${CC} -std=c99 -DTEST_XML -Ilib -Ilib/gvc -Ilib/pathplan -Ilib/cgraph
  171. // -Ilib/cdt lib/common/xml.c
  172. #include <getopt.h>
  173. static int put(void *stream, const char *s) { return fputs(s, stream); }
  174. // stub for testing above functionality
  175. int main(int argc, char **argv) {
  176. xml_flags_t flags = {0};
  177. while (true) {
  178. static const struct option opts[] = {
  179. {"dash", no_argument, 0, 'd'},
  180. {"nbsp", no_argument, 0, 'n'},
  181. {"raw", no_argument, 0, 'r'},
  182. {"utf8", no_argument, 0, 'u'},
  183. {0, 0, 0, 0},
  184. };
  185. int index;
  186. int c = getopt_long(argc, argv, "dnru", opts, &index);
  187. if (c == -1)
  188. break;
  189. switch (c) {
  190. case 'd':
  191. flags.dash = 1;
  192. break;
  193. case 'n':
  194. flags.nbsp = 1;
  195. break;
  196. case 'r':
  197. flags.raw = 1;
  198. break;
  199. case 'u':
  200. flags.utf8 = 1;
  201. break;
  202. default:
  203. fprintf(stderr, "unexpected error\n");
  204. graphviz_exit(EXIT_FAILURE);
  205. }
  206. }
  207. // escape all input we received
  208. for (int i = optind; i < argc; ++i) {
  209. int r = xml_escape(argv[i], flags, put, stdout);
  210. if (r < 0)
  211. graphviz_exit(EXIT_FAILURE);
  212. }
  213. graphviz_exit(EXIT_SUCCESS);
  214. }
  215. #endif