generic_html.py

__package__ = 'archivebox.parsers'

import re

from typing import IO, Iterable, Optional
from datetime import datetime, timezone
from html.parser import HTMLParser
from urllib.parse import urljoin

from ..index.schema import Link
from archivebox.misc.util import (
    htmldecode,
    enforce_types,
    find_all_urls,
)


class HrefParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for attr, value in attrs:
                if attr == "href":
                    self.urls.append(value)
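
# Minimal illustration of HrefParser (a sketch, separate from the parsing flow below):
#   >>> p = HrefParser()
#   >>> p.feed('<li><a href="http://example.com/">example title</a></li>')
#   >>> p.urls
#   ['http://example.com/']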


@enforce_types
def parse_generic_html_export(html_file: IO[str], root_url: Optional[str]=None, **_kwargs) -> Iterable[Link]:
    """Parse Generic HTML for href tags and use only the url (support for title coming later)"""

    html_file.seek(0)
    for line in html_file:
        parser = HrefParser()
        # example line:
        # <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
        parser.feed(line)
        for url in parser.urls:
            if root_url:
                url_is_absolute = url.lower().startswith(('http://', 'https://'))
                # url = https://abc.com                      => True
                # url = /page.php?next=https://example.com   => False

                if not url_is_absolute:   # resolve it by joining it with root_url
                    relative_path = url
                    url = urljoin(root_url, relative_path)   # https://example.com/somepage.html + /home.html
                    # => https://example.com/home.html

                    # special case to handle bug around // handling, crucial for urls that contain sub-urls
                    # e.g. https://web.archive.org/web/https://example.com
                    if did_urljoin_misbehave(root_url, relative_path, url):
                        url = fix_urljoin_bug(url)

            for archivable_url in find_all_urls(url):
                yield Link(
                    url=htmldecode(archivable_url),
                    timestamp=str(datetime.now(timezone.utc).timestamp()),
                    title=None,
                    tags=None,
                    sources=[html_file.name],
                )


KEY = 'html'
NAME = 'Generic HTML'
PARSER = parse_generic_html_export
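
# Example usage (a sketch; 'bookmarks.html' stands in for any HTML export file):
#   with open('bookmarks.html') as f:
#       for link in parse_generic_html_export(f, root_url='https://example.com/'):
#           print(link.url)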


#### WORKAROUND CODE FOR https://github.com/python/cpython/issues/96015 ####

def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
    """
    Handle the urljoin edge case bug where multiple slashes get turned into a single slash:
    - https://github.com/python/cpython/issues/96015
    - https://github.com/ArchiveBox/ArchiveBox/issues/1411

    This workaround only fixes the most common case of a sub-URL inside an outer URL, e.g.:
        https://web.archive.org/web/https://example.com/some/inner/url

    But there are other valid URLs containing // that are not fixed by this workaround, e.g.:
        https://example.com/drives/C//some/file
    """

    # if relative_path is actually an absolute url, cut off its own scheme so we check the path component only
    relative_path = relative_path.lower()
    if relative_path.startswith(('http://', 'https://')):
        relative_path = relative_path.split('://', 1)[-1]

    # TODO: properly fix all double // getting stripped by urljoin, not just ://
    original_path_had_suburl = '://' in relative_path
    original_root_had_suburl = '://' in root_url[8:]    # skip the outer scheme (len('https://') == 8) so only a *second* scheme counts
    final_joined_has_suburl = '://' in final_url[8:]    # same here: only look for a sub-URL beyond the outer scheme

    urljoin_broke_suburls = (
        (original_root_had_suburl or original_path_had_suburl)
        and not final_joined_has_suburl
    )
    return urljoin_broke_suburls
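
# What the misbehavior looks like on CPython versions affected by the issue above (illustrative):
#   >>> urljoin('https://web.archive.org/web/https://example.com/some/page', 'abc.html')
#   'https://web.archive.org/web/https:/example.com/some/abc.html'
#   (the inner '//' after 'https:' is collapsed to a single '/', breaking the sub-URL)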


def fix_urljoin_bug(url: str, nesting_limit: int=5) -> str:
    """
    Recursively replace broken suburls .../http:/... with .../http://...

    Basically equivalent to this for 99.9% of cases:
        url = url.replace('/http:/', '/http://')
        url = url.replace('/https:/', '/https://')
    except this also handles:
        other schemes besides http/https     (e.g. https://example.com/link/git+ssh://github.com/example)
        other preceding separators besides / (e.g. https://example.com/login/?next=https://example.com/home)
        fixing multiple suburls recursively
    """
    input_url = url
    for _ in range(nesting_limit):
        url = re.sub(
            r'(?P<root>.+?)'                             # https://web.archive.org/web
            + r'(?P<separator>[-=/_&+%$#@!*\(\\])'       # /
            + r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'  # http:/
            + r'(?P<suburl>[^/\\]+)',                    # example.com
            r'\1\2\3://\4',
            input_url,
            flags=re.IGNORECASE | re.UNICODE,            # must be keyword flags=; re.sub's 4th positional arg is count
        )
        if url == input_url:
            break   # nothing left to replace, all suburls are fixed
        input_url = url

    return url


# sanity checks to make sure the workaround code works as expected and doesn't introduce *more* bugs
assert did_urljoin_misbehave('https://web.archive.org/web/https://example.com', 'abc.html', 'https://web.archive.org/web/https:/example.com/abc.html') == True
assert did_urljoin_misbehave('http://example.com', 'https://web.archive.org/web/http://example.com/abc.html', 'https://web.archive.org/web/http:/example.com/abc.html') == True
assert fix_urljoin_bug('https:/example.com') == 'https:/example.com'   # should not modify the original url's scheme, only sub-urls
assert fix_urljoin_bug('https://web.archive.org/web/https:/example.com/abc.html') == 'https://web.archive.org/web/https://example.com/abc.html'
assert fix_urljoin_bug('http://example.com/link/git+ssh:/github.com/example?next=ftp:/example.com') == 'http://example.com/link/git+ssh://github.com/example?next=ftp://example.com'
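# one extra illustrative check: a plain relative join with no sub-URL anywhere should never be flagged
assert did_urljoin_misbehave('https://example.com', '/home.html', 'https://example.com/home.html') == False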