
links.py

  1. """
  2. In ArchiveBox, a Link represents a single entry that we track in the
  3. json index. All links pass through all archiver functions and the latest,
  4. most up-to-date canonical output for each is stored in "latest".
  5. Link {
  6. timestamp: str, (how we uniquely id links) _ _ _ _ ___
  7. url: str, | \ / \ |\| ' |
  8. base_url: str, |_/ \_/ | | |
  9. domain: str, _ _ _ _ _ _
  10. tags: str, |_) /| |\| | / `
  11. type: str, | /"| | | | \_,
  12. title: str, ,-'"`-.
  13. sources: [str], /// / @ @ \ \\\\
  14. latest: { \ :=| ,._,. |=: /
  15. ..., || ,\ \_../ /. ||
  16. pdf: 'output.pdf', ||','`-._))'`.`||
  17. wget: 'example.com/1234/index.html' `-' (/ `-'
  18. },
  19. history: {
  20. ...
  21. pdf: [
  22. {timestamp: 15444234325, status: 'skipped', result='output.pdf'},
  23. ...
  24. ],
  25. wget: [
  26. {timestamp: 11534435345, status: 'succeded', result='donuts.com/eat/them.html'}
  27. ]
  28. },
  29. }
  30. """
from html import unescape
from collections import OrderedDict

from util import (
    merge_links,
    wget_output_path,
    check_link_structure,
    check_links_structure,
)
def validate_links(links):
    check_links_structure(links)
    links = archivable_links(links)   # remove chrome://, about:, mailto: etc.
    links = uniquefied_links(links)   # merge/dedupe duplicate timestamps & urls
    links = sorted_links(links)       # deterministically sort the links by timestamp, url

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        check_link_structure(link)

        # normalize the title and fill in any missing "latest" outputs
        link['title'] = unescape(link['title']) if link['title'] else None
        link['latest'] = link.get('latest') or {}
        latest = link['latest']

        if not latest.get('wget'):
            latest['wget'] = wget_output_path(link)
        if not latest.get('pdf'):
            latest['pdf'] = None
        if not latest.get('screenshot'):
            latest['screenshot'] = None
        if not latest.get('dom'):
            latest['dom'] = None
        if not latest.get('favicon'):
            latest['favicon'] = None
        if not latest.get('title'):
            latest['title'] = link['title']

    return list(links)
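
# Illustrative usage sketch (not part of the original module; it assumes the
# util helpers above are importable and that the input dict carries whatever
# other fields check_link_structure requires): after validate_links(), titles
# are HTML-unescaped and every link has a 'latest' dict with all
# archive-method keys present, defaulting to None where no output exists yet.
#
#   >>> links = validate_links([{
#   ...     'url': 'https://example.com/page/',
#   ...     'timestamp': '1544423432',
#   ...     'title': 'Example &amp; Co',
#   ... }])
#   >>> links[0]['title']
#   'Example & Co'
#   >>> sorted(links[0]['latest'].keys())
#   ['dom', 'favicon', 'pdf', 'screenshot', 'title', 'wget']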
def archivable_links(links):
    """remove chrome://, about:// or other schemed links that can't be archived"""
    return (
        link
        for link in links
        if any(link['url'].lower().startswith(s) for s in ('http://', 'https://', 'ftp://'))
    )
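
# Illustrative example (not part of the original module): only http(s) and
# ftp URLs survive the filter; browser-internal and mail schemes are dropped.
#
#   >>> [l['url'] for l in archivable_links([
#   ...     {'url': 'https://example.com'},
#   ...     {'url': 'chrome://settings'},
#   ...     {'url': 'mailto:someone@example.com'},
#   ... ])]
#   ['https://example.com']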
def uniquefied_links(sorted_links):
    """
    merge/dedupe links that point at the same url (after fuzzy matching),
    then resolve timestamp collisions so every link has a unique timestamp
    """
    unique_urls = OrderedDict()

    lower = lambda url: url.lower().strip()
    without_www = lambda url: url.replace('://www.', '://', 1)
    # strip a trailing slash, or the slash directly before a querystring
    without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')

    for link in sorted_links:
        fuzzy_url = without_www(without_trailing_slash(lower(link['url'])))
        if fuzzy_url in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[fuzzy_url], link)
        unique_urls[fuzzy_url] = link

    unique_timestamps = OrderedDict()
    for link in unique_urls.values():
        link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
        unique_timestamps[link['timestamp']] = link

    return unique_timestamps.values()
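
# Illustrative example (not part of the original module): the fuzzy-url key
# collapses entries that differ only by case, a 'www.' prefix, or a trailing
# slash, so these two urls map to the same key and get merged by merge_links:
#
#   'https://www.Example.com/page/'  ->  'https://example.com/page'
#   'https://example.com/page'      ->  'https://example.com/page'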
def sorted_links(links):
    # newest-first: sort by the integer part of the timestamp, then by url
    sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
    return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links, timestamp=None):
    if not timestamp:
        yield from links
        return

    for link in links:
        try:
            if float(link['timestamp']) <= float(timestamp):
                yield link
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')
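
# Illustrative example (not part of the original module): since the index is
# sorted newest-first by sorted_links, resuming from the timestamp where a
# previous run stopped yields only the links that were not yet processed:
#
#   >>> remaining = links_after_timestamp(links, '1544423432')
#   >>> all(float(link['timestamp']) <= 1544423432 for link in remaining)
#   True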
def lowest_uniq_timestamp(used_timestamps, timestamp):
    """resolve duplicate timestamps by appending a decimal: 1234, 1234, 1234 -> 1234, 1234.0, 1234.1"""
    timestamp = timestamp.split('.')[0]
    nonce = 0

    # first try 152323423 before 152323423.0
    if timestamp not in used_timestamps:
        return timestamp

    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)

    return new_timestamp
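
# Illustrative example (not part of the original module; 'some_link' and
# 'another_link' are hypothetical placeholders): the first link to claim a
# timestamp keeps the bare value; later collisions get '.0', '.1', ... suffixes.
#
#   >>> used = {'1544423432': some_link}
#   >>> lowest_uniq_timestamp(used, '1544423432')
#   '1544423432.0'
#   >>> used['1544423432.0'] = another_link
#   >>> lowest_uniq_timestamp(used, '1544423432')
#   '1544423432.1'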