fortune_html_parser.py 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. # -*- coding: utf-8
  2. from HTMLParser import HTMLParser
  3. from difflib import unified_diff
  4. class FortuneHTMLParser(HTMLParser):
  5. def __init__(self):
  6. HTMLParser.__init__(self)
  7. self.body = []
  8. valid_fortune = '''<!doctype html><html>
  9. <head><title>Fortunes</title></head>
  10. <body><table>
  11. <tr><th>id</th><th>message</th></tr>
  12. <tr><td>11</td><td>&lt;script&gt;alert(&quot;This should not be displayed in a browser alert box.&quot;);&lt;/script&gt;</td></tr>
  13. <tr><td>4</td><td>A bad random number generator: 1, 1, 1, 1, 1, 4.33e+67, 1, 1, 1</td></tr>
  14. <tr><td>5</td><td>A computer program does what you tell it to do, not what you want it to do.</td></tr>
  15. <tr><td>2</td><td>A computer scientist is someone who fixes things that aren&apos;t broken.</td></tr>
  16. <tr><td>8</td><td>A list is only as strong as its weakest link. — Donald Knuth</td></tr>
  17. <tr><td>0</td><td>Additional fortune added at request time.</td></tr>
  18. <tr><td>3</td><td>After enough decimal places, nobody gives a damn.</td></tr>
  19. <tr><td>7</td><td>Any program that runs right is obsolete.</td></tr>
  20. <tr><td>10</td><td>Computers make very fast, very accurate mistakes.</td></tr>
  21. <tr><td>6</td><td>Emacs is a nice operating system, but I prefer UNIX. — Tom Christaensen</td></tr>
  22. <tr><td>9</td><td>Feature: A bug with seniority.</td></tr>
  23. <tr><td>1</td><td>fortune: No such file or directory</td></tr>
  24. <tr><td>12</td><td>フレームワークのベンチマーク</td></tr>
  25. </table></body></html>'''
  26. def handle_decl(self, decl):
  27. '''
  28. Is called when a doctype or other such tag is read in.
  29. For our purposes, we assume this is only going to be
  30. "DOCTYPE html", so we will surround it with "<!" and ">".
  31. '''
  32. # The spec says that for HTML this is case insensitive,
  33. # and since we did not specify xml compliance (where
  34. # incorrect casing would throw a syntax error), we must
  35. # allow all casings. We will lower for our normalization.
  36. self.body.append("<!{d}>".format(d=decl.lower()))
  37. def handle_charref(self, name):
  38. '''
  39. This is called when an HTML character is parsed (i.e.
  40. &quot;). There are a number of issues to be resolved
  41. here. For instance, some tests choose to leave the
  42. "+" character as-is, which should be fine as far as
  43. character escaping goes, but others choose to use the
  44. character reference of "&#43;", which is also fine.
  45. Therefore, this method looks for all possible character
  46. references and normalizes them so that we can
  47. validate the input against a single valid spec string.
  48. Another example problem: "&quot;" is valid, but so is
  49. "&#34;"
  50. '''
  51. val = name.lower()
  52. # "&#34;" is a valid escaping, but we are normalizing
  53. # it so that our final parse can just be checked for
  54. # equality.
  55. if val == "34" or val == "034" or val == "x22":
  56. # Append our normalized entity reference to our body.
  57. self.body.append("&quot;")
  58. # "&#39;" is a valid escaping of "-", but it is not
  59. # required, so we normalize for equality checking.
  60. if val == "39" or val == "039" or val == "x27":
  61. self.body.append("&apos;")
  62. # Again, "&#43;" is a valid escaping of the "+", but
  63. # it is not required, so we need to normalize for out
  64. # final parse and equality check.
  65. if val == "43" or val == "043" or val == "x2b":
  66. self.body.append("+")
  67. # Again, "&#62;" is a valid escaping of ">", but we
  68. # need to normalize to "&gt;" for equality checking.
  69. if val == "62" or val == "062" or val == "x3e":
  70. self.body.append("&gt;")
  71. # Again, "&#60;" is a valid escaping of "<", but we
  72. # need to normalize to "&lt;" for equality checking.
  73. if val == "60" or val == "060" or val == "x3c":
  74. self.body.append("&lt;")
  75. # Not sure why some are escaping '/'
  76. if val == "47" or val == "047" or val == "x2f":
  77. self.body.append("/")
  78. # "&#40;" is a valid escaping of "(", but
  79. # it is not required, so we need to normalize for out
  80. # final parse and equality check.
  81. if val == "40" or val == "040" or val == "x28":
  82. self.body.append("(")
  83. # "&#41;" is a valid escaping of ")", but
  84. # it is not required, so we need to normalize for out
  85. # final parse and equality check.
  86. if val == "41" or val == "041" or val == "x29":
  87. self.body.append(")")
  88. def handle_entityref(self, name):
  89. '''
  90. Again, "&mdash;" is a valid escaping of "—", but we
  91. need to normalize to "—" for equality checking.
  92. '''
  93. if name == "mdash":
  94. self.body.append("—")
  95. else:
  96. self.body.append("&{n};".format(n=name))
  97. def handle_starttag(self, tag, attrs):
  98. '''
  99. This is called every time a tag is opened. We append
  100. each one wrapped in "<" and ">".
  101. '''
  102. self.body.append("<{t}>".format(t=tag))
  103. # Append a newline after the <table> and <html>
  104. if tag.lower() == 'table' or tag.lower() == 'html':
  105. self.body.append("\n")
  106. def handle_data(self, data):
  107. '''
  108. This is called whenever data is presented inside of a
  109. start and end tag. Generally, this will only ever be
  110. the contents inside of "<td>" and "</td>", but there
  111. are also the "<title>" and "</title>" tags.
  112. '''
  113. if data.strip() != '':
  114. # After a LOT of debate, these are now considered
  115. # valid in data. The reason for this approach is
  116. # because a few tests use tools which determine
  117. # at compile time whether or not a string needs
  118. # a given type of html escaping, and our fortune
  119. # test has apostrophes and quotes in html data
  120. # rather than as an html attribute etc.
  121. # example:
  122. # <td>A computer scientist is someone who fixes things that aren't broken.</td>
  123. # Semanticly, that apostrophe does not NEED to
  124. # be escaped. The same is currently true for our
  125. # quotes.
  126. # In fact, in data (read: between two html tags)
  127. # even the '>' need not be replaced as long as
  128. # the '<' are all escaped.
  129. # We replace them with their escapings here in
  130. # order to have a noramlized string for equality
  131. # comparison at the end.
  132. data = data.replace('\'', '&apos;')
  133. data = data.replace('"', '&quot;')
  134. data = data.replace('>', '&gt;')
  135. self.body.append("{d}".format(d=data))
  136. def handle_endtag(self, tag):
  137. '''
  138. This is called every time a tag is closed. We append
  139. each one wrapped in "</" and ">".
  140. '''
  141. self.body.append("</{t}>".format(t=tag))
  142. # Append a newline after each </tr> and </head>
  143. if tag.lower() == 'tr' or tag.lower() == 'head':
  144. self.body.append("\n")
  145. def isValidFortune(self, out):
  146. '''
  147. Returns whether the HTML input parsed by this parser
  148. is valid against our known "fortune" spec.
  149. The parsed data in 'body' is joined on empty strings
  150. and checked for equality against our spec.
  151. '''
  152. body = ''.join(self.body)
  153. same = self.valid_fortune == body
  154. diff_lines = []
  155. if not same:
  156. output = "Oh no! I compared {!s}\n\n\nto.....{!s}\n".format(
  157. self.valid_fortune, body)
  158. output += "Fortune invalid. Diff following:\n"
  159. headers_left = 3
  160. for line in unified_diff(
  161. self.valid_fortune.split('\n'),
  162. body.split('\n'),
  163. fromfile='Valid',
  164. tofile='Response',
  165. n=0):
  166. diff_lines.append(line)
  167. output += line
  168. headers_left -= 1
  169. if headers_left <= 0:
  170. output += "\n"
  171. print(output)
  172. out.write(output)
  173. return (same, diff_lines)