fortune_html_parser.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. # -*- coding: utf-8
  2. import re
  3. from HTMLParser import HTMLParser
  4. from difflib import unified_diff
  5. class FortuneHTMLParser(HTMLParser):
  6. body = []
  7. valid = '<!doctype html><html><head><title>Fortunes</title></head><body><table><tr><th>id</th><th>message</th></tr><tr><td>11</td><td>&lt;script&gt;alert(&quot;This should not be displayed in a browser alert box.&quot;);&lt;/script&gt;</td></tr><tr><td>4</td><td>A bad random number generator: 1, 1, 1, 1, 1, 4.33e+67, 1, 1, 1</td></tr><tr><td>5</td><td>A computer program does what you tell it to do, not what you want it to do.</td></tr><tr><td>2</td><td>A computer scientist is someone who fixes things that aren&apos;t broken.</td></tr><tr><td>8</td><td>A list is only as strong as its weakest link. — Donald Knuth</td></tr><tr><td>0</td><td>Additional fortune added at request time.</td></tr><tr><td>3</td><td>After enough decimal places, nobody gives a damn.</td></tr><tr><td>7</td><td>Any program that runs right is obsolete.</td></tr><tr><td>10</td><td>Computers make very fast, very accurate mistakes.</td></tr><tr><td>6</td><td>Emacs is a nice operating system, but I prefer UNIX. — Tom Christaensen</td></tr><tr><td>9</td><td>Feature: A bug with seniority.</td></tr><tr><td>1</td><td>fortune: No such file or directory</td></tr><tr><td>12</td><td>フレームワークのベンチマーク</td></tr></table></body></html>'
  8. # Is called when a doctype or other such tag is read in.
  9. # For our purposes, we assume this is only going to be
  10. # "DOCTYPE html", so we will surround it with "<!" and ">".
  11. def handle_decl(self, decl):
  12. # The spec says that for HTML this is case insensitive,
  13. # and since we did not specify xml compliance (where
  14. # incorrect casing would throw a syntax error), we must
  15. # allow all casings. We will lower for our normalization.
  16. self.body.append("<!{d}>".format(d=decl.lower()))
  17. # This is called when an HTML character is parsed (i.e.
  18. # &quot;). There are a number of issues to be resolved
  19. # here. For instance, some tests choose to leave the
  20. # "+" character as-is, which should be fine as far as
  21. # character escaping goes, but others choose to use the
  22. # character reference of "&#43;", which is also fine.
  23. # Therefore, this method looks for all possible character
  24. # references and normalizes them so that we can
  25. # validate the input against a single valid spec string.
  26. # Another example problem: "&quot;" is valid, but so is
  27. # "&#34;"
  28. def handle_charref(self, name):
  29. # "&#34;" is a valid escaping, but we are normalizing
  30. # it so that our final parse can just be checked for
  31. # equality.
  32. if name == "34" or name == "034" or name == "x22":
  33. # Append our normalized entity reference to our body.
  34. self.body.append("&quot;")
  35. # "&#39;" is a valid escaping of "-", but it is not
  36. # required, so we normalize for equality checking.
  37. if name == "39" or name == "039" or name == "x27":
  38. self.body.append("&apos;")
  39. # Again, "&#43;" is a valid escaping of the "+", but
  40. # it is not required, so we need to normalize for out
  41. # final parse and equality check.
  42. if name == "43" or name == "043" or name == "x2B":
  43. self.body.append("+")
  44. # Again, "&#62;" is a valid escaping of ">", but we
  45. # need to normalize to "&gt;" for equality checking.
  46. if name == "62" or name == "062" or name == "x3E":
  47. self.body.append("&gt;")
  48. # Again, "&#60;" is a valid escaping of "<", but we
  49. # need to normalize to "&lt;" for equality checking.
  50. if name == "60" or name == "060" or name == "x3C":
  51. self.body.append("&lt;")
  52. # Not sure why some are escaping '/'
  53. if name == "47" or name == "047" or name == "x2F":
  54. self.body.append("/")
  55. def handle_entityref(self, name):
  56. # Again, "&mdash;" is a valid escaping of "—", but we
  57. # need to normalize to "—" for equality checking.
  58. if name == "mdash":
  59. self.body.append("—")
  60. else:
  61. self.body.append("&{n};".format(n=name))
  62. # This is called every time a tag is opened. We append
  63. # each one wrapped in "<" and ">".
  64. def handle_starttag(self, tag, attrs):
  65. self.body.append("<{t}>".format(t=tag))
  66. # This is called whenever data is presented inside of a
  67. # start and end tag. Generally, this will only ever be
  68. # the contents inside of "<td>" and "</td>", but there
  69. # are also the "<title>" and "</title>" tags.
  70. def handle_data (self, data):
  71. if data.strip() != '':
  72. # After a LOT of debate, these are now considered
  73. # valid in data. The reason for this approach is
  74. # because a few tests use tools which determine
  75. # at compile time whether or not a string needs
  76. # a given type of html escaping, and our fortune
  77. # test has apostrophes and quotes in html data
  78. # rather than as an html attribute etc.
  79. # example:
  80. # <td>A computer scientist is someone who fixes things that aren't broken.</td>
  81. # Semanticly, that apostrophe does not NEED to
  82. # be escaped. The same is currently true for our
  83. # quotes.
  84. # In fact, in data (read: between two html tags)
  85. # even the '>' need not be replaced as long as
  86. # the '<' are all escaped.
  87. # We replace them with their escapings here in
  88. # order to have a noramlized string for equality
  89. # comparison at the end.
  90. data = data.replace('\'', '&apos;')
  91. data = data.replace('"', '&quot;')
  92. data = data.replace('>', '&gt;')
  93. self.body.append("{d}".format(d=data))
  94. # This is called every time a tag is closed. We append
  95. # each one wrapped in "</" and ">".
  96. def handle_endtag(self, tag):
  97. self.body.append("</{t}>".format(t=tag))
  98. # Returns whether the HTML input parsed by this parser
  99. # is valid against our known "fortune" spec.
  100. # The parsed data in 'body' is joined on empty strings
  101. # and checked for equality against our spec.
  102. def isValidFortune(self, out):
  103. body = ''.join(self.body)
  104. diff = self.valid == body
  105. if not diff:
  106. out.write("Fortune invalid. Diff following:\n")
  107. diff_str = ''.join(unified_diff(self.valid.split(' '), body.split(' '), fromfile='Valid', tofile='Response', n=5))
  108. #diff_str = re.sub(r'(?<![ +]) (?![ +])', '', diff_str)
  109. diff_str = re.sub(r' ', ' ', diff_str)
  110. out.write(diff_str)
  111. return diff