1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- # -*- coding: utf-8
- from HTMLParser import HTMLParser
class FortuneHTMLParser(HTMLParser):
    """Normalize a "fortunes" HTML page and compare it to the expected page.

    Feed the response text to an instance with ``feed()``. Every parser
    callback appends a normalized fragment to ``self.body``;
    ``isValidFortune()`` then joins the fragments and compares the result
    to ``valid``.

    Different implementations escape the same characters in different but
    equally acceptable ways (``&quot;`` vs ``&#34;``, a literal ``+`` vs
    ``&#43;``, ...), so the callbacks rewrite each accepted spelling to the
    single spelling used in ``valid``.
    """

    # The expected document in normalized form: tags without attributes, no
    # whitespace-only text, no closing "</html>", and the special characters
    # written as &lt; &gt; &quot; &apos; with a literal em dash.
    valid = (
        '<!DOCTYPE html><html><head><title>Fortunes</title></head><body>'
        '<table><tr><th>id</th><th>message</th></tr>'
        '<tr><td>11</td><td>&lt;script&gt;alert(&quot;This should not be displayed in a browser alert box.&quot;);&lt;/script&gt;</td></tr>'
        '<tr><td>4</td><td>A bad random number generator: 1, 1, 1, 1, 1, 4.33e+67, 1, 1, 1</td></tr>'
        '<tr><td>5</td><td>A computer program does what you tell it to do, not what you want it to do.</td></tr>'
        '<tr><td>2</td><td>A computer scientist is someone who fixes things that aren&apos;t broken.</td></tr>'
        '<tr><td>8</td><td>A list is only as strong as its weakest link. — Donald Knuth</td></tr>'
        '<tr><td>0</td><td>Additional fortune added at request time.</td></tr>'
        '<tr><td>3</td><td>After enough decimal places, nobody gives a damn.</td></tr>'
        '<tr><td>7</td><td>Any program that runs right is obsolete.</td></tr>'
        '<tr><td>10</td><td>Computers make very fast, very accurate mistakes.</td></tr>'
        '<tr><td>6</td><td>Emacs is a nice operating system, but I prefer UNIX. — Tom Christaensen</td></tr>'
        '<tr><td>9</td><td>Feature: A bug with seniority.</td></tr>'
        '<tr><td>1</td><td>fortune: No such file or directory</td></tr>'
        '<tr><td>12</td><td>フレームワークのベンチマーク</td></tr>'
        '</table></body>'
    )

    # Numeric character references, keyed by code point, and the normalized
    # text each one is rewritten to. Keying on the code point (rather than
    # on the spelling) accepts any decimal or hexadecimal form of the same
    # character: "34", "034", "x22", "X22", ...
    _CHARREF_NORMALIZED = {
        0x22: "&quot;",  # double quote
        0x27: "&apos;",  # apostrophe
        0x2B: "+",       # plus sign; escaping it is allowed but not required
        0x2F: "/",       # slash; some implementations escape it
        0x3C: "&lt;",    # less-than
        0x3E: "&gt;",    # greater-than
        0x2014: "—",     # em dash, same result as the "&mdash;" entity
    }

    def __init__(self):
        """Create a parser with its own, empty list of parsed fragments."""
        # Explicit base-class call (not super()) so this works with the
        # Python 2 HTMLParser module as well as Python 3's html.parser.
        HTMLParser.__init__(self)
        # Python 3's parser converts references to plain text by default,
        # which would bypass handle_charref/handle_entityref. On Python 2
        # this is just an unused attribute.
        self.convert_charrefs = False
        # Per-instance list: a class-level list would be shared by every
        # parser, so a second validation would start with stale fragments.
        self.body = []

    def handle_decl(self, decl):
        """Record a declaration such as ``DOCTYPE html``.

        The parser passes the text between ``<!`` and ``>``, so those
        delimiters are put back around it.
        """
        self.body.append("<!{d}>".format(d=decl))

    def handle_charref(self, name):
        """Record a numeric character reference in normalized form.

        ``name`` is the text between ``&#`` and ``;`` (for example ``34``
        or ``x22``). References to the characters listed in
        ``_CHARREF_NORMALIZED`` are rewritten to the spelling used in
        ``valid``; any other reference is kept verbatim as ``&#name;``.
        """
        try:
            if name[:1] in ("x", "X"):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name, 10)
        except ValueError:
            # Not a number at all: fall through to the verbatim branch.
            codepoint = None

        normalized = self._CHARREF_NORMALIZED.get(codepoint)
        if normalized is None:
            # Keep unknown references instead of dropping them, so an
            # unexpected character cannot vanish from the comparison.
            normalized = "&#{n};".format(n=name)
        self.body.append(normalized)

    def handle_entityref(self, name):
        """Record a named entity reference.

        ``&mdash;`` is rewritten to the literal em dash used in ``valid``;
        every other entity is kept as ``&name;``.
        """
        if name == "mdash":
            self.body.append("—")
        else:
            self.body.append("&{n};".format(n=name))

    def handle_starttag(self, tag, attrs):
        """Record an opening tag as ``<tag>``; attributes are ignored."""
        self.body.append("<{t}>".format(t=tag))

    def handle_data(self, data):
        """Record text content, skipping whitespace-only runs.

        Not every implementation escapes apostrophes, so a literal ``'`` is
        accepted and rewritten to ``&apos;`` to match ``valid``.
        """
        if data.strip():
            self.body.append(data.replace("'", "&apos;"))

    def handle_endtag(self, tag):
        """Record a closing tag as ``</tag>``, except for ``</html>``.

        A closing ``</html>`` is optional and some implementations omit it,
        so it is left out of the normalized form.
        """
        if tag != "html":
            self.body.append("</{t}>".format(t=tag))

    def isValidFortune(self):
        """Return True if the fragments parsed so far, joined, equal ``valid``."""
        return self.valid == ''.join(self.body)
|