fortune_html_parser.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
# -*- coding: utf-8 -*-
  2. from HTMLParser import HTMLParser
  3. from difflib import unified_diff
  4. class FortuneHTMLParser(HTMLParser):
  5. body = []
  6. valid_fortune = '''<!doctype html><html>
  7. <head><title>Fortunes</title></head>
  8. <body><table>
  9. <tr><th>id</th><th>message</th></tr>
  10. <tr><td>11</td><td>&lt;script&gt;alert(&quot;This should not be displayed in a browser alert box.&quot;);&lt;/script&gt;</td></tr>
  11. <tr><td>4</td><td>A bad random number generator: 1, 1, 1, 1, 1, 4.33e+67, 1, 1, 1</td></tr>
  12. <tr><td>5</td><td>A computer program does what you tell it to do, not what you want it to do.</td></tr>
  13. <tr><td>2</td><td>A computer scientist is someone who fixes things that aren&apos;t broken.</td></tr>
  14. <tr><td>8</td><td>A list is only as strong as its weakest link. — Donald Knuth</td></tr>
  15. <tr><td>0</td><td>Additional fortune added at request time.</td></tr>
  16. <tr><td>3</td><td>After enough decimal places, nobody gives a damn.</td></tr>
  17. <tr><td>7</td><td>Any program that runs right is obsolete.</td></tr>
  18. <tr><td>10</td><td>Computers make very fast, very accurate mistakes.</td></tr>
  19. <tr><td>6</td><td>Emacs is a nice operating system, but I prefer UNIX. — Tom Christaensen</td></tr>
  20. <tr><td>9</td><td>Feature: A bug with seniority.</td></tr>
  21. <tr><td>1</td><td>fortune: No such file or directory</td></tr>
  22. <tr><td>12</td><td>フレームワークのベンチマーク</td></tr>
  23. </table></body></html>'''
  24. def handle_decl(self, decl):
  25. '''
  26. Is called when a doctype or other such tag is read in.
  27. For our purposes, we assume this is only going to be
  28. "DOCTYPE html", so we will surround it with "<!" and ">".
  29. '''
  30. # The spec says that for HTML this is case insensitive,
  31. # and since we did not specify xml compliance (where
  32. # incorrect casing would throw a syntax error), we must
  33. # allow all casings. We will lower for our normalization.
  34. self.body.append("<!{d}>".format(d=decl.lower()))
  35. def handle_charref(self, name):
  36. '''
  37. This is called when an HTML character is parsed (i.e.
  38. &quot;). There are a number of issues to be resolved
  39. here. For instance, some tests choose to leave the
  40. "+" character as-is, which should be fine as far as
  41. character escaping goes, but others choose to use the
  42. character reference of "&#43;", which is also fine.
  43. Therefore, this method looks for all possible character
  44. references and normalizes them so that we can
  45. validate the input against a single valid spec string.
  46. Another example problem: "&quot;" is valid, but so is
  47. "&#34;"
  48. '''
  49. val = name.lower()
  50. # "&#34;" is a valid escaping, but we are normalizing
  51. # it so that our final parse can just be checked for
  52. # equality.
  53. if val == "34" or val == "034" or val == "x22":
  54. # Append our normalized entity reference to our body.
  55. self.body.append("&quot;")
  56. # "&#39;" is a valid escaping of "-", but it is not
  57. # required, so we normalize for equality checking.
  58. if val == "39" or val == "039" or val == "x27":
  59. self.body.append("&apos;")
  60. # Again, "&#43;" is a valid escaping of the "+", but
  61. # it is not required, so we need to normalize for out
  62. # final parse and equality check.
  63. if val == "43" or val == "043" or val == "x2b":
  64. self.body.append("+")
  65. # Again, "&#62;" is a valid escaping of ">", but we
  66. # need to normalize to "&gt;" for equality checking.
  67. if val == "62" or val == "062" or val == "x3e":
  68. self.body.append("&gt;")
  69. # Again, "&#60;" is a valid escaping of "<", but we
  70. # need to normalize to "&lt;" for equality checking.
  71. if val == "60" or val == "060" or val == "x3c":
  72. self.body.append("&lt;")
  73. # Not sure why some are escaping '/'
  74. if val == "47" or val == "047" or val == "x2f":
  75. self.body.append("/")
  76. # "&#40;" is a valid escaping of "(", but
  77. # it is not required, so we need to normalize for out
  78. # final parse and equality check.
  79. if val == "40" or val == "040" or val == "x28":
  80. self.body.append("(")
  81. # "&#41;" is a valid escaping of ")", but
  82. # it is not required, so we need to normalize for out
  83. # final parse and equality check.
  84. if val == "41" or val == "041" or val == "x29":
  85. self.body.append(")")
  86. def handle_entityref(self, name):
  87. '''
  88. Again, "&mdash;" is a valid escaping of "—", but we
  89. need to normalize to "—" for equality checking.
  90. '''
  91. if name == "mdash":
  92. self.body.append("—")
  93. else:
  94. self.body.append("&{n};".format(n=name))
  95. def handle_starttag(self, tag, attrs):
  96. '''
  97. This is called every time a tag is opened. We append
  98. each one wrapped in "<" and ">".
  99. '''
  100. self.body.append("<{t}>".format(t=tag))
  101. # Append a newline after the <table> and <html>
  102. if tag.lower() == 'table' or tag.lower() == 'html':
  103. self.body.append("\n")
  104. def handle_data(self, data):
  105. '''
  106. This is called whenever data is presented inside of a
  107. start and end tag. Generally, this will only ever be
  108. the contents inside of "<td>" and "</td>", but there
  109. are also the "<title>" and "</title>" tags.
  110. '''
  111. if data.strip() != '':
  112. # After a LOT of debate, these are now considered
  113. # valid in data. The reason for this approach is
  114. # because a few tests use tools which determine
  115. # at compile time whether or not a string needs
  116. # a given type of html escaping, and our fortune
  117. # test has apostrophes and quotes in html data
  118. # rather than as an html attribute etc.
  119. # example:
  120. # <td>A computer scientist is someone who fixes things that aren't broken.</td>
  121. # Semanticly, that apostrophe does not NEED to
  122. # be escaped. The same is currently true for our
  123. # quotes.
  124. # In fact, in data (read: between two html tags)
  125. # even the '>' need not be replaced as long as
  126. # the '<' are all escaped.
  127. # We replace them with their escapings here in
  128. # order to have a noramlized string for equality
  129. # comparison at the end.
  130. data = data.replace('\'', '&apos;')
  131. data = data.replace('"', '&quot;')
  132. data = data.replace('>', '&gt;')
  133. self.body.append("{d}".format(d=data))
  134. def handle_endtag(self, tag):
  135. '''
  136. This is called every time a tag is closed. We append
  137. each one wrapped in "</" and ">".
  138. '''
  139. self.body.append("</{t}>".format(t=tag))
  140. # Append a newline after each </tr> and </head>
  141. if tag.lower() == 'tr' or tag.lower() == 'head':
  142. self.body.append("\n")
  143. def isValidFortune(self, out):
  144. '''
  145. Returns whether the HTML input parsed by this parser
  146. is valid against our known "fortune" spec.
  147. The parsed data in 'body' is joined on empty strings
  148. and checked for equality against our spec.
  149. '''
  150. body = ''.join(self.body)
  151. same = self.valid_fortune == body
  152. diff_lines = []
  153. if not same:
  154. output = "Oh no! I compared {!s}\n\n\nto.....{!s}\n".format(
  155. self.valid_fortune, body)
  156. output += "Fortune invalid. Diff following:\n"
  157. headers_left = 3
  158. for line in unified_diff(
  159. self.valid_fortune.split('\n'),
  160. body.split('\n'),
  161. fromfile='Valid',
  162. tofile='Response',
  163. n=0):
  164. diff_lines.append(line)
  165. output += line
  166. headers_left -= 1
  167. if headers_left <= 0:
  168. output += "\n"
  169. print(output)
  170. out.write(output)
  171. return (same, diff_lines)