htmllex.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117
  1. /// @file
  2. /// @ingroup common_utils
  3. /*************************************************************************
  4. * Copyright (c) 2011 AT&T Intellectual Property
  5. * All rights reserved. This program and the accompanying materials
  6. * are made available under the terms of the Eclipse Public License v1.0
  7. * which accompanies this distribution, and is available at
  8. * https://www.eclipse.org/legal/epl-v10.html
  9. *
  10. * Contributors: Details at https://graphviz.org
  11. *************************************************************************/
  12. #include <assert.h>
  13. #include <common/render.h>
  14. #include <common/htmltable.h>
  15. #include "htmlparse.h"
  16. #include <common/htmllex.h>
  17. #include <cdt/cdt.h>
  18. #include <cgraph/gv_ctype.h>
  19. #include <cgraph/strview.h>
  20. #include <cgraph/tokenize.h>
  21. #include <limits.h>
  22. #include <stdbool.h>
  23. #include <stddef.h>
  24. #include <stdint.h>
  25. #include <util/alloc.h>
  26. #include <util/startswith.h>
  27. #include <util/strcasecmp.h>
  28. #include <util/unused.h>
  29. #ifdef HAVE_EXPAT
  30. #ifdef _WIN32
  31. // ensure that the expat functions get the correct storage class
  32. // declarations also on MinGW
  33. #define XML_USE_MSC_EXTENSIONS 1
  34. #endif
  35. #include <expat.h>
  36. #endif
  37. #ifndef XML_STATUS_ERROR
  38. #define XML_STATUS_ERROR 0
  39. #endif
  40. static unsigned long htmllineno_ctx(htmllexstate_t *ctx);
  41. /* error_context:
  42. * Print the last 2 "token"s seen.
  43. */
  44. static void error_context(htmllexstate_t *ctx)
  45. {
  46. agerr(AGPREV, "... %.*s%.*s ...\n", (int)ctx->prevtok.size,
  47. ctx->prevtok.data, (int)ctx->currtok.size, ctx->currtok.data);
  48. }
  49. /* htmlerror:
  50. * yyerror - called by yacc output
  51. */
  52. void htmlerror(htmlscan_t *scanner, const char *msg)
  53. {
  54. htmllexstate_t *ctx = &scanner->lexer;
  55. if (ctx->error)
  56. return;
  57. ctx->error = 1;
  58. agerrorf("%s in line %lu \n", msg, htmllineno(scanner));
  59. error_context(&scanner->lexer);
  60. }
  61. #ifdef HAVE_EXPAT
  62. /* lexerror:
  63. * called by lexer when unknown <..> is found.
  64. */
  65. static void lexerror(htmllexstate_t *ctx, const char *name)
  66. {
  67. ctx->tok = T_error;
  68. ctx->error = 1;
  69. agerrorf("Unknown HTML element <%s> on line %lu \n", name, htmllineno_ctx(ctx));
  70. }
  /* Signature shared by all attribute handlers: receives the object being
   * filled in and the attribute value; the result is OR-ed into the lexer's
   * warning flag by doAttrs, so 0 means "accepted". */
  typedef int (*attrFn) (void *, char *);
  /* Comparison callback type in the shape bsearch expects. */
  typedef int (*bcmpfn) (const void *, const void *);

  /* Mechanism for automatically processing attributes */
  typedef struct {
    const char *name; /* attribute name; always a string literal, never written */
    attrFn action;    /* action to perform if name matches */
  } attr_item;

  /* size of one table entry, used for element counts and bsearch */
  #define ISIZE (sizeof(attr_item))
  79. /* icmp:
  80. * Compare an attr_item. Used in bsearch
  81. */
  82. static int icmp(const void *name, const void *item) {
  83. const attr_item *j = item;
  84. return strcasecmp(name, j->name);
  85. }
  86. static int bgcolorfn(htmldata_t * p, char *v)
  87. {
  88. p->bgcolor = strdup(v);
  89. return 0;
  90. }
  91. static int pencolorfn(htmldata_t * p, char *v)
  92. {
  93. p->pencolor = strdup(v);
  94. return 0;
  95. }
  96. static int hreffn(htmldata_t * p, char *v)
  97. {
  98. p->href = strdup(v);
  99. return 0;
  100. }
  101. static int sidesfn(htmldata_t * p, char *v)
  102. {
  103. unsigned short flags = 0;
  104. char c;
  105. while ((c = *v++)) {
  106. switch (gv_tolower(c)) {
  107. case 'l' :
  108. flags |= BORDER_LEFT;
  109. break;
  110. case 't' :
  111. flags |= BORDER_TOP;
  112. break;
  113. case 'r' :
  114. flags |= BORDER_RIGHT;
  115. break;
  116. case 'b' :
  117. flags |= BORDER_BOTTOM;
  118. break;
  119. default :
  120. agwarningf("Unrecognized character '%c' (%d) in sides attribute\n", c, c);
  121. break;
  122. }
  123. }
  124. if (flags != BORDER_MASK)
  125. p->flags |= flags;
  126. return 0;
  127. }
  128. static int titlefn(htmldata_t * p, char *v)
  129. {
  130. p->title = strdup(v);
  131. return 0;
  132. }
  133. static int portfn(htmldata_t * p, char *v)
  134. {
  135. p->port = strdup(v);
  136. return 0;
  137. }
  138. #define DELIM " ,"
  139. static int stylefn(htmldata_t * p, char *v)
  140. {
  141. int rv = 0;
  142. for (tok_t t = tok(v, DELIM); !tok_end(&t); tok_next(&t)) {
  143. strview_t tk = tok_get(&t);
  144. if (strview_case_str_eq(tk, "ROUNDED")) p->style.rounded = true;
  145. else if (strview_case_str_eq(tk, "RADIAL")) p->style.radial = true;
  146. else if (strview_case_str_eq(tk,"SOLID")) {
  147. p->style.dotted = false;
  148. p->style.dashed = false;
  149. } else if (strview_case_str_eq(tk,"INVISIBLE") ||
  150. strview_case_str_eq(tk,"INVIS")) p->style.invisible = true;
  151. else if (strview_case_str_eq(tk,"DOTTED")) p->style.dotted = true;
  152. else if (strview_case_str_eq(tk,"DASHED")) p->style.dashed = true;
  153. else {
  154. agwarningf("Illegal value %.*s for STYLE - ignored\n", (int)tk.size,
  155. tk.data);
  156. rv = 1;
  157. }
  158. }
  159. return rv;
  160. }
  161. static int targetfn(htmldata_t * p, char *v)
  162. {
  163. p->target = strdup(v);
  164. return 0;
  165. }
  166. static int idfn(htmldata_t * p, char *v)
  167. {
  168. p->id = strdup(v);
  169. return 0;
  170. }
  /* doInt:
   * Scan v for a base-10 integral value and check that it lies in
   * [min, max]. On success the value is stored in *ul and 0 is returned.
   * On failure a warning naming the attribute (s) is issued, *ul is left
   * untouched and 1 is returned.
   *
   * Only the absence of any leading digits is treated as "improper";
   * characters following a valid number are ignored, as before.
   * An out-of-range input makes strtol saturate at LONG_MIN/LONG_MAX, which
   * the range checks below then reject whenever the limit lies outside
   * [min, max].
   *
   * The warnings are newline-terminated like every other warning emitted
   * from this file.
   */
  static int doInt(const char *v, const char *s, int min, int max, long *ul)
  {
    char *ep;
    const long b = strtol(v, &ep, 10);

    if (ep == v) { /* no digits consumed at all */
      agwarningf("Improper %s value %s - ignored\n", s, v);
      return 1;
    }
    if (b > max) {
      agwarningf("%s value %s > %d - too large - ignored\n", s, v, max);
      return 1;
    }
    if (b < min) {
      agwarningf("%s value %s < %d - too small - ignored\n", s, v, min);
      return 1;
    }

    *ul = b;
    return 0;
  }
  195. static int gradientanglefn(htmldata_t * p, char *v)
  196. {
  197. long u;
  198. if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
  199. return 1;
  200. p->gradientangle = (unsigned short) u;
  201. return 0;
  202. }
  203. static int borderfn(htmldata_t * p, char *v)
  204. {
  205. long u;
  206. if (doInt(v, "BORDER", 0, UCHAR_MAX, &u))
  207. return 1;
  208. p->border = (unsigned char) u;
  209. p->flags |= BORDER_SET;
  210. return 0;
  211. }
  212. static int cellpaddingfn(htmldata_t * p, char *v)
  213. {
  214. long u;
  215. if (doInt(v, "CELLPADDING", 0, UCHAR_MAX, &u))
  216. return 1;
  217. p->pad = (unsigned char) u;
  218. p->flags |= PAD_SET;
  219. return 0;
  220. }
  221. static int cellspacingfn(htmldata_t * p, char *v)
  222. {
  223. long u;
  224. if (doInt(v, "CELLSPACING", SCHAR_MIN, SCHAR_MAX, &u))
  225. return 1;
  226. p->space = (signed char) u;
  227. p->flags |= SPACE_SET;
  228. return 0;
  229. }
  230. static int cellborderfn(htmltbl_t * p, char *v)
  231. {
  232. long u;
  233. if (doInt(v, "CELLBORDER", 0, INT8_MAX, &u))
  234. return 1;
  235. p->cellborder = (int8_t)u;
  236. return 0;
  237. }
  238. static int columnsfn(htmltbl_t * p, char *v)
  239. {
  240. if (*v != '*') {
  241. agwarningf("Unknown value %s for COLUMNS - ignored\n", v);
  242. return 1;
  243. }
  244. p->vrule = true;
  245. return 0;
  246. }
  247. static int rowsfn(htmltbl_t * p, char *v)
  248. {
  249. if (*v != '*') {
  250. agwarningf("Unknown value %s for ROWS - ignored\n", v);
  251. return 1;
  252. }
  253. p->hrule = true;
  254. return 0;
  255. }
  256. static int fixedsizefn(htmldata_t * p, char *v)
  257. {
  258. int rv = 0;
  259. if (!strcasecmp(v, "TRUE"))
  260. p->flags |= FIXED_FLAG;
  261. else if (strcasecmp(v, "FALSE")) {
  262. agwarningf("Illegal value %s for FIXEDSIZE - ignored\n", v);
  263. rv = 1;
  264. }
  265. return rv;
  266. }
  267. static int valignfn(htmldata_t * p, char *v)
  268. {
  269. int rv = 0;
  270. if (!strcasecmp(v, "BOTTOM"))
  271. p->flags |= VALIGN_BOTTOM;
  272. else if (!strcasecmp(v, "TOP"))
  273. p->flags |= VALIGN_TOP;
  274. else if (strcasecmp(v, "MIDDLE")) {
  275. agwarningf("Illegal value %s for VALIGN - ignored\n", v);
  276. rv = 1;
  277. }
  278. return rv;
  279. }
  280. static int halignfn(htmldata_t * p, char *v)
  281. {
  282. int rv = 0;
  283. if (!strcasecmp(v, "LEFT"))
  284. p->flags |= HALIGN_LEFT;
  285. else if (!strcasecmp(v, "RIGHT"))
  286. p->flags |= HALIGN_RIGHT;
  287. else if (strcasecmp(v, "CENTER")) {
  288. agwarningf("Illegal value %s for ALIGN - ignored\n", v);
  289. rv = 1;
  290. }
  291. return rv;
  292. }
  293. static int cell_halignfn(htmldata_t * p, char *v)
  294. {
  295. int rv = 0;
  296. if (!strcasecmp(v, "LEFT"))
  297. p->flags |= HALIGN_LEFT;
  298. else if (!strcasecmp(v, "RIGHT"))
  299. p->flags |= HALIGN_RIGHT;
  300. else if (!strcasecmp(v, "TEXT"))
  301. p->flags |= HALIGN_TEXT;
  302. else if (strcasecmp(v, "CENTER"))
  303. rv = 1;
  304. if (rv)
  305. agwarningf("Illegal value %s for ALIGN in TD - ignored\n", v);
  306. return rv;
  307. }
  308. static int balignfn(htmldata_t * p, char *v)
  309. {
  310. int rv = 0;
  311. if (!strcasecmp(v, "LEFT"))
  312. p->flags |= BALIGN_LEFT;
  313. else if (!strcasecmp(v, "RIGHT"))
  314. p->flags |= BALIGN_RIGHT;
  315. else if (strcasecmp(v, "CENTER"))
  316. rv = 1;
  317. if (rv)
  318. agwarningf("Illegal value %s for BALIGN in TD - ignored\n", v);
  319. return rv;
  320. }
  321. static int heightfn(htmldata_t * p, char *v)
  322. {
  323. long u;
  324. if (doInt(v, "HEIGHT", 0, USHRT_MAX, &u))
  325. return 1;
  326. p->height = (unsigned short) u;
  327. return 0;
  328. }
  329. static int widthfn(htmldata_t * p, char *v)
  330. {
  331. long u;
  332. if (doInt(v, "WIDTH", 0, USHRT_MAX, &u))
  333. return 1;
  334. p->width = (unsigned short) u;
  335. return 0;
  336. }
  337. static int rowspanfn(htmlcell_t * p, char *v)
  338. {
  339. long u;
  340. if (doInt(v, "ROWSPAN", 0, UINT16_MAX, &u))
  341. return 1;
  342. if (u == 0) {
  343. agwarningf("ROWSPAN value cannot be 0 - ignored\n");
  344. return 1;
  345. }
  346. p->rowspan = (uint16_t)u;
  347. return 0;
  348. }
  349. static int colspanfn(htmlcell_t * p, char *v)
  350. {
  351. long u;
  352. if (doInt(v, "COLSPAN", 0, UINT16_MAX, &u))
  353. return 1;
  354. if (u == 0) {
  355. agwarningf("COLSPAN value cannot be 0 - ignored\n");
  356. return 1;
  357. }
  358. p->colspan = (uint16_t)u;
  359. return 0;
  360. }
  361. static int fontcolorfn(textfont_t * p, char *v)
  362. {
  363. p->color = v;
  364. return 0;
  365. }
  366. static int facefn(textfont_t * p, char *v)
  367. {
  368. p->name = v;
  369. return 0;
  370. }
  371. static int ptsizefn(textfont_t * p, char *v)
  372. {
  373. long u;
  374. if (doInt(v, "POINT-SIZE", 0, UCHAR_MAX, &u))
  375. return 1;
  376. p->size = (double) u;
  377. return 0;
  378. }
  379. static int srcfn(htmlimg_t * p, char *v)
  380. {
  381. p->src = strdup(v);
  382. return 0;
  383. }
  384. static int scalefn(htmlimg_t * p, char *v)
  385. {
  386. p->scale = strdup(v);
  387. return 0;
  388. }
  /* ALIGN (for <BR>): translate the value (case-insensitive) into a
   * justification character written to *p: RIGHT -> 'r', LEFT -> 'l',
   * CENTER -> 'n'. Anything else is reported, *p is left unchanged and 1 is
   * returned. */
  static int alignfn(int *p, char *v)
  {
    if (strcasecmp(v, "RIGHT") == 0) {
      *p = 'r';
      return 0;
    }
    if (strcasecmp(v, "LEFT") == 0) {
      *p = 'l';
      return 0;
    }
    if (strcasecmp(v, "CENTER") == 0) {
      *p = 'n';
      return 0;
    }

    agwarningf("Illegal value %s for ALIGN - ignored\n", v);
    return 1;
  }
  404. /* Tables used in binary search; MUST be alphabetized */
  405. static attr_item tbl_items[] = {
  406. {"align", (attrFn) halignfn},
  407. {"bgcolor", (attrFn) bgcolorfn},
  408. {"border", (attrFn) borderfn},
  409. {"cellborder", (attrFn) cellborderfn},
  410. {"cellpadding", (attrFn) cellpaddingfn},
  411. {"cellspacing", (attrFn) cellspacingfn},
  412. {"color", (attrFn) pencolorfn},
  413. {"columns", (attrFn) columnsfn},
  414. {"fixedsize", (attrFn) fixedsizefn},
  415. {"gradientangle", (attrFn) gradientanglefn},
  416. {"height", (attrFn) heightfn},
  417. {"href", (attrFn) hreffn},
  418. {"id", (attrFn) idfn},
  419. {"port", (attrFn) portfn},
  420. {"rows", (attrFn) rowsfn},
  421. {"sides", (attrFn) sidesfn},
  422. {"style", (attrFn) stylefn},
  423. {"target", (attrFn) targetfn},
  424. {"title", (attrFn) titlefn},
  425. {"tooltip", (attrFn) titlefn},
  426. {"valign", (attrFn) valignfn},
  427. {"width", (attrFn) widthfn},
  428. };
  429. static attr_item cell_items[] = {
  430. {"align", (attrFn) cell_halignfn},
  431. {"balign", (attrFn) balignfn},
  432. {"bgcolor", (attrFn) bgcolorfn},
  433. {"border", (attrFn) borderfn},
  434. {"cellpadding", (attrFn) cellpaddingfn},
  435. {"cellspacing", (attrFn) cellspacingfn},
  436. {"color", (attrFn) pencolorfn},
  437. {"colspan", (attrFn) colspanfn},
  438. {"fixedsize", (attrFn) fixedsizefn},
  439. {"gradientangle", (attrFn) gradientanglefn},
  440. {"height", (attrFn) heightfn},
  441. {"href", (attrFn) hreffn},
  442. {"id", (attrFn) idfn},
  443. {"port", (attrFn) portfn},
  444. {"rowspan", (attrFn) rowspanfn},
  445. {"sides", (attrFn) sidesfn},
  446. {"style", (attrFn) stylefn},
  447. {"target", (attrFn) targetfn},
  448. {"title", (attrFn) titlefn},
  449. {"tooltip", (attrFn) titlefn},
  450. {"valign", (attrFn) valignfn},
  451. {"width", (attrFn) widthfn},
  452. };
  453. static attr_item font_items[] = {
  454. {"color", (attrFn) fontcolorfn},
  455. {"face", (attrFn) facefn},
  456. {"point-size", (attrFn) ptsizefn},
  457. };
  458. static attr_item img_items[] = {
  459. {"scale", (attrFn) scalefn},
  460. {"src", (attrFn) srcfn},
  461. };
  462. static attr_item br_items[] = {
  463. {"align", (attrFn) alignfn},
  464. };
  465. /* doAttrs:
  466. * General function for processing list of name/value attributes.
  467. * Do binary search on items table. If match found, invoke action
  468. * passing it tp and attribute value.
  469. * Table size is given by nel
  470. * Name/value pairs are in array atts, which is null terminated.
  471. * s is the name of the HTML element being processed.
  472. */
  473. static void doAttrs(htmllexstate_t *ctx, void *tp, attr_item *items, size_t nel, char **atts,
  474. char *s) {
  475. char *name;
  476. char *val;
  477. attr_item *ip;
  478. while ((name = *atts++) != NULL) {
  479. val = *atts++;
  480. ip = bsearch(name, items, nel, ISIZE, icmp);
  481. if (ip)
  482. ctx->warn |= ip->action(tp, val);
  483. else {
  484. agwarningf("Illegal attribute %s in %s - ignored\n", name,
  485. s);
  486. ctx->warn = 1;
  487. }
  488. }
  489. }
  490. static void mkBR(htmllexstate_t *ctx, char **atts)
  491. {
  492. ctx->htmllval->i = UNSET_ALIGN;
  493. doAttrs(ctx, &ctx->htmllval->i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
  494. }
  495. static htmlimg_t *mkImg(htmllexstate_t *ctx, char **atts)
  496. {
  497. htmlimg_t *img = gv_alloc(sizeof(htmlimg_t));
  498. doAttrs(ctx, img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
  499. return img;
  500. }
  501. static textfont_t *mkFont(htmllexstate_t *ctx, char **atts, unsigned char flags) {
  502. textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
  503. tf.size = -1.0; /* unassigned */
  504. enum { FLAGS_MAX = (1 << GV_TEXTFONT_FLAGS_WIDTH) - 1 };
  505. assert(flags <= FLAGS_MAX);
  506. tf.flags = (unsigned char)(flags & FLAGS_MAX);
  507. if (atts)
  508. doAttrs(ctx, &tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
  509. return dtinsert(ctx->gvc->textfont_dt, &tf);
  510. }
  511. static htmlcell_t *mkCell(htmllexstate_t *ctx, char **atts)
  512. {
  513. htmlcell_t *cell = gv_alloc(sizeof(htmlcell_t));
  514. cell->colspan = 1;
  515. cell->rowspan = 1;
  516. doAttrs(ctx, cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
  517. return cell;
  518. }
  519. static htmltbl_t *mkTbl(htmllexstate_t *ctx, char **atts)
  520. {
  521. htmltbl_t *tbl = gv_alloc(sizeof(htmltbl_t));
  522. tbl->row_count = SIZE_MAX; // flag that table is a raw, parsed table
  523. tbl->cellborder = -1; // unset cell border attribute
  524. doAttrs(ctx, tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
  525. return tbl;
  526. }
  527. static void startElement(void *user, const char *name, char **atts)
  528. {
  529. htmllexstate_t *ctx = user;
  530. if (strcasecmp(name, "TABLE") == 0) {
  531. ctx->htmllval->tbl = mkTbl(ctx, atts);
  532. ctx->inCell = 0;
  533. ctx->tok = T_table;
  534. } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
  535. ctx->inCell = 0;
  536. ctx->tok = T_row;
  537. } else if (strcasecmp(name, "TD") == 0) {
  538. ctx->inCell = 1;
  539. ctx->htmllval->cell = mkCell(ctx, atts);
  540. ctx->tok = T_cell;
  541. } else if (strcasecmp(name, "FONT") == 0) {
  542. ctx->htmllval->font = mkFont(ctx, atts, 0);
  543. ctx->tok = T_font;
  544. } else if (strcasecmp(name, "B") == 0) {
  545. ctx->htmllval->font = mkFont(ctx, 0, HTML_BF);
  546. ctx->tok = T_bold;
  547. } else if (strcasecmp(name, "S") == 0) {
  548. ctx->htmllval->font = mkFont(ctx, 0, HTML_S);
  549. ctx->tok = T_s;
  550. } else if (strcasecmp(name, "U") == 0) {
  551. ctx->htmllval->font = mkFont(ctx, 0, HTML_UL);
  552. ctx->tok = T_underline;
  553. } else if (strcasecmp(name, "O") == 0) {
  554. ctx->htmllval->font = mkFont(ctx, 0, HTML_OL);
  555. ctx->tok = T_overline;
  556. } else if (strcasecmp(name, "I") == 0) {
  557. ctx->htmllval->font = mkFont(ctx, 0, HTML_IF);
  558. ctx->tok = T_italic;
  559. } else if (strcasecmp(name, "SUP") == 0) {
  560. ctx->htmllval->font = mkFont(ctx, 0, HTML_SUP);
  561. ctx->tok = T_sup;
  562. } else if (strcasecmp(name, "SUB") == 0) {
  563. ctx->htmllval->font = mkFont(ctx, 0, HTML_SUB);
  564. ctx->tok = T_sub;
  565. } else if (strcasecmp(name, "BR") == 0) {
  566. mkBR(ctx, atts);
  567. ctx->tok = T_br;
  568. } else if (strcasecmp(name, "HR") == 0) {
  569. ctx->tok = T_hr;
  570. } else if (strcasecmp(name, "VR") == 0) {
  571. ctx->tok = T_vr;
  572. } else if (strcasecmp(name, "IMG") == 0) {
  573. ctx->htmllval->img = mkImg(ctx, atts);
  574. ctx->tok = T_img;
  575. } else if (strcasecmp(name, "HTML") == 0) {
  576. ctx->tok = T_html;
  577. } else {
  578. lexerror(ctx, name);
  579. }
  580. }
  581. static void endElement(void *user, const char *name)
  582. {
  583. htmllexstate_t *ctx = user;
  584. if (strcasecmp(name, "TABLE") == 0) {
  585. ctx->tok = T_end_table;
  586. ctx->inCell = 1;
  587. } else if (strcasecmp(name, "TR") == 0 || strcasecmp(name, "TH") == 0) {
  588. ctx->tok = T_end_row;
  589. } else if (strcasecmp(name, "TD") == 0) {
  590. ctx->tok = T_end_cell;
  591. ctx->inCell = 0;
  592. } else if (strcasecmp(name, "HTML") == 0) {
  593. ctx->tok = T_end_html;
  594. } else if (strcasecmp(name, "FONT") == 0) {
  595. ctx->tok = T_end_font;
  596. } else if (strcasecmp(name, "B") == 0) {
  597. ctx->tok = T_n_bold;
  598. } else if (strcasecmp(name, "U") == 0) {
  599. ctx->tok = T_n_underline;
  600. } else if (strcasecmp(name, "O") == 0) {
  601. ctx->tok = T_n_overline;
  602. } else if (strcasecmp(name, "I") == 0) {
  603. ctx->tok = T_n_italic;
  604. } else if (strcasecmp(name, "SUP") == 0) {
  605. ctx->tok = T_n_sup;
  606. } else if (strcasecmp(name, "SUB") == 0) {
  607. ctx->tok = T_n_sub;
  608. } else if (strcasecmp(name, "S") == 0) {
  609. ctx->tok = T_n_s;
  610. } else if (strcasecmp(name, "BR") == 0) {
  611. if (ctx->tok == T_br)
  612. ctx->tok = T_BR;
  613. else
  614. ctx->tok = T_end_br;
  615. } else if (strcasecmp(name, "HR") == 0) {
  616. if (ctx->tok == T_hr)
  617. ctx->tok = T_HR;
  618. else
  619. ctx->tok = T_end_hr;
  620. } else if (strcasecmp(name, "VR") == 0) {
  621. if (ctx->tok == T_vr)
  622. ctx->tok = T_VR;
  623. else
  624. ctx->tok = T_end_vr;
  625. } else if (strcasecmp(name, "IMG") == 0) {
  626. if (ctx->tok == T_img)
  627. ctx->tok = T_IMG;
  628. else
  629. ctx->tok = T_end_img;
  630. } else {
  631. lexerror(ctx, name);
  632. }
  633. }
  634. /* characterData:
  635. * Generate T_string token. Do this only when immediately in
  636. * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
  637. * Strip out formatting characters but keep spaces.
  638. * Distinguish between all whitespace vs. strings with non-whitespace
  639. * characters.
  640. */
  641. static void characterData(void *user, const char *s, int length)
  642. {
  643. htmllexstate_t *ctx = user;
  644. int i, cnt = 0;
  645. unsigned char c;
  646. if (ctx->inCell) {
  647. for (i = length; i; i--) {
  648. c = *s++;
  649. if (c >= ' ') {
  650. cnt++;
  651. agxbputc(ctx->xb, (char)c);
  652. }
  653. }
  654. if (cnt) ctx->tok = T_string;
  655. }
  656. }
  657. #endif
  658. int initHTMLlexer(htmlscan_t *scanner, char *src, agxbuf * xb, htmlenv_t *env)
  659. {
  660. #ifdef HAVE_EXPAT
  661. htmllexstate_t *ctx = &scanner->lexer;
  662. ctx->xb = xb;
  663. ctx->lb = (agxbuf){0};
  664. ctx->ptr = src;
  665. ctx->mode = 0;
  666. ctx->warn = 0;
  667. ctx->error = 0;
  668. ctx->currtok = (strview_t){0};
  669. ctx->prevtok = (strview_t){0};
  670. ctx->inCell = 1;
  671. ctx->parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
  672. ctx->gvc = GD_gvc(env->g);
  673. XML_SetUserData(ctx->parser, ctx);
  674. XML_SetElementHandler(ctx->parser,
  675. (XML_StartElementHandler) startElement,
  676. endElement);
  677. XML_SetCharacterDataHandler(ctx->parser, characterData);
  678. return 0;
  679. #else
  680. static int first;
  681. if (!first) {
  682. agwarningf(
  683. "Not built with libexpat. Table formatting is not available.\n");
  684. first++;
  685. }
  686. return 1;
  687. #endif
  688. }
  689. int clearHTMLlexer(htmlscan_t *scanner)
  690. {
  691. #ifdef HAVE_EXPAT
  692. htmllexstate_t *ctx = &scanner->lexer;
  693. int rv = ctx->error ? 3 : ctx->warn;
  694. XML_ParserFree(ctx->parser);
  695. agxbfree (&ctx->lb);
  696. return rv;
  697. #else
  698. return 1;
  699. #endif
  700. }
  701. /// \p agxbput, but assume that source and destination may overlap
  702. static UNUSED void agxbput_move(agxbuf *dst, const char *src) {
  703. // we cannot call `agxbput` itself because it calls `memcpy`, thereby
  704. // implicitly assuming that source and destination do not overlap
  705. char *src_copy = gv_strdup(src);
  706. agxbput(dst, src_copy);
  707. free(src_copy);
  708. }
  709. #ifdef HAVE_EXPAT
  710. /* eatComment:
  711. * Given first character after open comment, eat characters
  712. * up to comment close, returning pointer to closing > if it exists,
  713. * or null character otherwise.
  714. * We rely on HTML strings having matched nested <>.
  715. */
  716. static char *eatComment(htmllexstate_t *ctx, char *p)
  717. {
  718. int depth = 1;
  719. char *s = p;
  720. char c;
  721. while (depth && (c = *s++)) {
  722. if (c == '<')
  723. depth++;
  724. else if (c == '>')
  725. depth--;
  726. }
  727. s--; /* move back to '\0' or '>' */
  728. if (*s) {
  729. char *t = s - 2;
  730. if (t < p || !startswith(t, "--")) {
  731. agwarningf("Unclosed comment\n");
  732. ctx->warn = 1;
  733. }
  734. }
  735. return s;
  736. }
  737. /* findNext:
  738. * Return next XML unit. This is either <..>, an HTML
  739. * comment <!-- ... -->, or characters up to next <.
  740. */
  741. static char *findNext(htmllexstate_t *ctx, char *s, agxbuf* xb)
  742. {
  743. char* t = s + 1;
  744. char c;
  745. if (*s == '<') {
  746. if (startswith(t, "!--"))
  747. t = eatComment(ctx, t + 3);
  748. else
  749. while (*t && *t != '>')
  750. t++;
  751. if (*t != '>') {
  752. agwarningf("Label closed before end of HTML element\n");
  753. ctx->warn = 1;
  754. } else
  755. t++;
  756. } else {
  757. t = s;
  758. while ((c = *t) && c != '<') {
  759. if (c == '&' && *(t+1) != '#') {
  760. t = scanEntity(t + 1, xb);
  761. }
  762. else {
  763. agxbputc(xb, c);
  764. t++;
  765. }
  766. }
  767. }
  768. return t;
  769. }
  770. /** guard a trailing right square bracket (]) from misinterpretation
  771. *
  772. * When parsing in incremental mode, expat tries to recognize malformed CDATA
  773. * section terminators. See XML_TOK_TRAILING_RSQB in the expat source. As a
  774. * result, when seeing text that ends in a ']' expat buffers this internally and
  775. * returns truncated text for the current parse. The ']' is flushed as part of
  776. * the next parsing call when expat learns it is not a CDATA section terminator.
  777. *
  778. * To prevent this situation from occurring, this function turns any trailing
  779. * ']' into its XML escape sequence. This causes expat to realize immediately it
  780. * is not part of a CDATA section terminator and flush it in the first parsing
  781. * call. This has no effect on the final output, because expat handles the
  782. * translation back from this escape sequence to ']'.
  783. *
  784. * @param xb Buffer containing content to protect
  785. */
  786. static void protect_rsqb(agxbuf *xb) {
  787. // if the buffer is empty, we have nothing to do
  788. if (agxblen(xb) == 0) {
  789. return;
  790. }
  791. // check the last character and if it is not ], we have nothing to do
  792. char *data = agxbuse(xb);
  793. size_t size = strlen(data);
  794. assert(size > 0);
  795. if (data[size - 1] != ']') {
  796. agxbput_move(xb, data);
  797. return;
  798. }
  799. // truncate ] and write back the remaining prefix
  800. data[size - 1] = '\0';
  801. agxbput_move(xb, data);
  802. // write an XML-escaped version of ] as a replacement
  803. agxbput(xb, "&#93;");
  804. }
  805. #endif
  806. unsigned long htmllineno(htmlscan_t *scanner) {
  807. return htmllineno_ctx(&scanner->lexer);
  808. }
  809. static unsigned long htmllineno_ctx(htmllexstate_t *ctx) {
  810. #ifdef HAVE_EXPAT
  811. return XML_GetCurrentLineNumber(ctx->parser);
  812. #else
  813. return 0;
  814. #endif
  815. }
  816. #ifdef DEBUG
  817. static void printTok(htmllexstate_t *ctx, int tok)
  818. {
  819. char *s;
  820. switch (tok) {
  821. case T_end_br:
  822. s = "T_end_br";
  823. break;
  824. case T_end_img:
  825. s = "T_end_img";
  826. break;
  827. case T_row:
  828. s = "T_row";
  829. break;
  830. case T_end_row:
  831. s = "T_end_row";
  832. break;
  833. case T_html:
  834. s = "T_html";
  835. break;
  836. case T_end_html:
  837. s = "T_end_html";
  838. break;
  839. case T_end_table:
  840. s = "T_end_table";
  841. break;
  842. case T_end_cell:
  843. s = "T_end_cell";
  844. break;
  845. case T_end_font:
  846. s = "T_end_font";
  847. break;
  848. case T_string:
  849. s = "T_string";
  850. break;
  851. case T_error:
  852. s = "T_error";
  853. break;
  854. case T_n_italic:
  855. s = "T_n_italic";
  856. break;
  857. case T_n_bold:
  858. s = "T_n_bold";
  859. break;
  860. case T_n_underline:
  861. s = "T_n_underline";
  862. break;
  863. case T_n_overline:
  864. s = "T_n_overline";
  865. break;
  866. case T_n_sup:
  867. s = "T_n_sup";
  868. break;
  869. case T_n_sub:
  870. s = "T_n_sub";
  871. break;
  872. case T_n_s:
  873. s = "T_n_s";
  874. break;
  875. case T_HR:
  876. s = "T_HR";
  877. break;
  878. case T_hr:
  879. s = "T_hr";
  880. break;
  881. case T_end_hr:
  882. s = "T_end_hr";
  883. break;
  884. case T_VR:
  885. s = "T_VR";
  886. break;
  887. case T_vr:
  888. s = "T_vr";
  889. break;
  890. case T_end_vr:
  891. s = "T_end_vr";
  892. break;
  893. case T_BR:
  894. s = "T_BR";
  895. break;
  896. case T_br:
  897. s = "T_br";
  898. break;
  899. case T_IMG:
  900. s = "T_IMG";
  901. break;
  902. case T_img:
  903. s = "T_img";
  904. break;
  905. case T_table:
  906. s = "T_table";
  907. break;
  908. case T_cell:
  909. s = "T_cell";
  910. break;
  911. case T_font:
  912. s = "T_font";
  913. break;
  914. case T_italic:
  915. s = "T_italic";
  916. break;
  917. case T_bold:
  918. s = "T_bold";
  919. break;
  920. case T_underline:
  921. s = "T_underline";
  922. break;
  923. case T_overline:
  924. s = "T_overline";
  925. break;
  926. case T_sup:
  927. s = "T_sup";
  928. break;
  929. case T_sub:
  930. s = "T_sub";
  931. break;
  932. case T_s:
  933. s = "T_s";
  934. break;
  935. default:
  936. s = "<unknown>";
  937. }
  938. if (tok == T_string) {
  939. const char *token_text = agxbuse(ctx->xb);
  940. fprintf(stderr, "%s \"%s\"\n", s, token_text);
  941. agxbput_move(ctx->xb, token_text);
  942. } else
  943. fprintf(stderr, "%s\n", s);
  944. }
  945. #endif
  946. int htmllex(union HTMLSTYPE *htmllval, htmlscan_t *scanner)
  947. {
  948. #ifdef HAVE_EXPAT
  949. static char *begin_html = "<HTML>";
  950. static char *end_html = "</HTML>";
  951. char *s;
  952. char *endp = 0;
  953. size_t len, llen;
  954. int rv;
  955. htmllexstate_t *ctx = &scanner->lexer;
  956. ctx->htmllval = htmllval;
  957. ctx->tok = 0;
  958. do {
  959. if (ctx->mode == 2)
  960. return EOF;
  961. if (ctx->mode == 0) {
  962. ctx->mode = 1;
  963. s = begin_html;
  964. len = strlen(s);
  965. endp = 0;
  966. } else {
  967. s = ctx->ptr;
  968. if (*s == '\0') {
  969. ctx->mode = 2;
  970. s = end_html;
  971. len = strlen(s);
  972. } else {
  973. endp = findNext(ctx, s,&ctx->lb);
  974. len = (size_t)(endp - s);
  975. }
  976. }
  977. protect_rsqb(&ctx->lb);
  978. ctx->prevtok = ctx->currtok;
  979. ctx->currtok = (strview_t){.data = s, .size = len};
  980. if ((llen = agxblen(&ctx->lb))) {
  981. assert(llen <= (size_t)INT_MAX && "XML token too long for expat API");
  982. rv = XML_Parse(ctx->parser, agxbuse(&ctx->lb), (int)llen, 0);
  983. } else {
  984. assert(len <= (size_t)INT_MAX && "XML token too long for expat API");
  985. rv = XML_Parse(ctx->parser, s, (int)len, len ? 0 : 1);
  986. }
  987. if (rv == XML_STATUS_ERROR) {
  988. if (!ctx->error) {
  989. agerrorf("%s in line %lu \n",
  990. XML_ErrorString(XML_GetErrorCode(ctx->parser)), htmllineno(scanner));
  991. error_context(ctx);
  992. ctx->error = 1;
  993. ctx->tok = T_error;
  994. }
  995. }
  996. if (endp)
  997. ctx->ptr = endp;
  998. } while (ctx->tok == 0);
  999. #ifdef DEBUG
  1000. printTok (ctx, ctx->tok);
  1001. #endif
  1002. return ctx->tok;
  1003. #else
  1004. return EOF;
  1005. #endif
  1006. }