fortune_html_parser.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. # -*- coding: utf-8
  2. import os
  3. from HTMLParser import HTMLParser
  4. from difflib import unified_diff
  5. from toolset.utils.output_helper import log
  6. class FortuneHTMLParser(HTMLParser):
  7. def __init__(self):
  8. HTMLParser.__init__(self)
  9. self.body = []
  10. valid_fortune = '''<!doctype html><html>
  11. <head><title>Fortunes</title></head>
  12. <body><table>
  13. <tr><th>id</th><th>message</th></tr>
  14. <tr><td>11</td><td>&lt;script&gt;alert(&quot;This should not be displayed in a browser alert box.&quot;);&lt;/script&gt;</td></tr>
  15. <tr><td>4</td><td>A bad random number generator: 1, 1, 1, 1, 1, 4.33e+67, 1, 1, 1</td></tr>
  16. <tr><td>5</td><td>A computer program does what you tell it to do, not what you want it to do.</td></tr>
  17. <tr><td>2</td><td>A computer scientist is someone who fixes things that aren&apos;t broken.</td></tr>
  18. <tr><td>8</td><td>A list is only as strong as its weakest link. — Donald Knuth</td></tr>
  19. <tr><td>0</td><td>Additional fortune added at request time.</td></tr>
  20. <tr><td>3</td><td>After enough decimal places, nobody gives a damn.</td></tr>
  21. <tr><td>7</td><td>Any program that runs right is obsolete.</td></tr>
  22. <tr><td>10</td><td>Computers make very fast, very accurate mistakes.</td></tr>
  23. <tr><td>6</td><td>Emacs is a nice operating system, but I prefer UNIX. — Tom Christaensen</td></tr>
  24. <tr><td>9</td><td>Feature: A bug with seniority.</td></tr>
  25. <tr><td>1</td><td>fortune: No such file or directory</td></tr>
  26. <tr><td>12</td><td>フレームワークのベンチマーク</td></tr>
  27. </table></body></html>'''
  28. def handle_decl(self, decl):
  29. '''
  30. Is called when a doctype or other such tag is read in.
  31. For our purposes, we assume this is only going to be
  32. "DOCTYPE html", so we will surround it with "<!" and ">".
  33. '''
  34. # The spec says that for HTML this is case insensitive,
  35. # and since we did not specify xml compliance (where
  36. # incorrect casing would throw a syntax error), we must
  37. # allow all casings. We will lower for our normalization.
  38. self.body.append("<!{d}>".format(d=decl.lower()))
  39. def handle_charref(self, name):
  40. '''
  41. This is called when an HTML character is parsed (i.e.
  42. &quot;). There are a number of issues to be resolved
  43. here. For instance, some tests choose to leave the
  44. "+" character as-is, which should be fine as far as
  45. character escaping goes, but others choose to use the
  46. character reference of "&#43;", which is also fine.
  47. Therefore, this method looks for all possible character
  48. references and normalizes them so that we can
  49. validate the input against a single valid spec string.
  50. Another example problem: "&quot;" is valid, but so is
  51. "&#34;"
  52. '''
  53. val = name.lower()
  54. # "&#34;" is a valid escaping, but we are normalizing
  55. # it so that our final parse can just be checked for
  56. # equality.
  57. if val == "34" or val == "034" or val == "x22":
  58. # Append our normalized entity reference to our body.
  59. self.body.append("&quot;")
  60. # "&#39;" is a valid escaping of "-", but it is not
  61. # required, so we normalize for equality checking.
  62. if val == "39" or val == "039" or val == "x27":
  63. self.body.append("&apos;")
  64. # Again, "&#43;" is a valid escaping of the "+", but
  65. # it is not required, so we need to normalize for out
  66. # final parse and equality check.
  67. if val == "43" or val == "043" or val == "x2b":
  68. self.body.append("+")
  69. # Again, "&#62;" is a valid escaping of ">", but we
  70. # need to normalize to "&gt;" for equality checking.
  71. if val == "62" or val == "062" or val == "x3e":
  72. self.body.append("&gt;")
  73. # Again, "&#60;" is a valid escaping of "<", but we
  74. # need to normalize to "&lt;" for equality checking.
  75. if val == "60" or val == "060" or val == "x3c":
  76. self.body.append("&lt;")
  77. # Not sure why some are escaping '/'
  78. if val == "47" or val == "047" or val == "x2f":
  79. self.body.append("/")
  80. # "&#40;" is a valid escaping of "(", but
  81. # it is not required, so we need to normalize for out
  82. # final parse and equality check.
  83. if val == "40" or val == "040" or val == "x28":
  84. self.body.append("(")
  85. # "&#41;" is a valid escaping of ")", but
  86. # it is not required, so we need to normalize for out
  87. # final parse and equality check.
  88. if val == "41" or val == "041" or val == "x29":
  89. self.body.append(")")
  90. def handle_entityref(self, name):
  91. '''
  92. Again, "&mdash;" is a valid escaping of "—", but we
  93. need to normalize to "—" for equality checking.
  94. '''
  95. if name == "mdash":
  96. self.body.append("—")
  97. else:
  98. self.body.append("&{n};".format(n=name))
  99. def handle_starttag(self, tag, attrs):
  100. '''
  101. This is called every time a tag is opened. We append
  102. each one wrapped in "<" and ">".
  103. '''
  104. self.body.append("<{t}>".format(t=tag))
  105. # Append a newline after the <table> and <html>
  106. if tag.lower() == 'table' or tag.lower() == 'html':
  107. self.body.append(os.linesep)
  108. def handle_data(self, data):
  109. '''
  110. This is called whenever data is presented inside of a
  111. start and end tag. Generally, this will only ever be
  112. the contents inside of "<td>" and "</td>", but there
  113. are also the "<title>" and "</title>" tags.
  114. '''
  115. if data.strip() != '':
  116. # After a LOT of debate, these are now considered
  117. # valid in data. The reason for this approach is
  118. # because a few tests use tools which determine
  119. # at compile time whether or not a string needs
  120. # a given type of html escaping, and our fortune
  121. # test has apostrophes and quotes in html data
  122. # rather than as an html attribute etc.
  123. # example:
  124. # <td>A computer scientist is someone who fixes things that aren't broken.</td>
  125. # Semanticly, that apostrophe does not NEED to
  126. # be escaped. The same is currently true for our
  127. # quotes.
  128. # In fact, in data (read: between two html tags)
  129. # even the '>' need not be replaced as long as
  130. # the '<' are all escaped.
  131. # We replace them with their escapings here in
  132. # order to have a noramlized string for equality
  133. # comparison at the end.
  134. data = data.replace('\'', '&apos;')
  135. data = data.replace('"', '&quot;')
  136. data = data.replace('>', '&gt;')
  137. self.body.append("{d}".format(d=data))
  138. def handle_endtag(self, tag):
  139. '''
  140. This is called every time a tag is closed. We append
  141. each one wrapped in "</" and ">".
  142. '''
  143. self.body.append("</{t}>".format(t=tag))
  144. # Append a newline after each </tr> and </head>
  145. if tag.lower() == 'tr' or tag.lower() == 'head':
  146. self.body.append(os.linesep)
  147. def isValidFortune(self, name, out):
  148. '''
  149. Returns whether the HTML input parsed by this parser
  150. is valid against our known "fortune" spec.
  151. The parsed data in 'body' is joined on empty strings
  152. and checked for equality against our spec.
  153. '''
  154. body = ''.join(self.body)
  155. same = self.valid_fortune == body
  156. diff_lines = []
  157. if not same:
  158. output = "Oh no! I compared {!s}".format(self.valid_fortune)
  159. output += os.linesep + os.linesep + "to" + os.linesep + os.linesep + body + os.linesep
  160. output += "Fortune invalid. Diff following:" + os.linesep
  161. headers_left = 3
  162. for line in unified_diff(
  163. self.valid_fortune.split(os.linesep),
  164. body.split(os.linesep),
  165. fromfile='Valid',
  166. tofile='Response',
  167. n=0):
  168. diff_lines.append(line)
  169. output += line
  170. headers_left -= 1
  171. if headers_left <= 0:
  172. output += os.linesep
  173. log(output, prefix="%s: " % name)
  174. return (same, diff_lines)