# archivebox/extractors/__init__.py

__package__ = 'archivebox.extractors'

import os
import sys

from pathlib import Path
from typing import Callable, Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet

from ..config import (
    SAVE_ALLOWLIST_PTN,
    SAVE_DENYLIST_PTN,
)
from ..core.settings import ERROR_LOG
from ..index.schema import ArchiveResult, Link
from ..index.sql import write_link_to_sql_index
from ..index import (
    load_link_details,
    write_link_details,
)
from ..util import enforce_types
from ..logging_util import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_link_archiving_started,
    log_link_archiving_finished,
    log_archive_method_started,
    log_archive_method_finished,
)
from ..search import write_search_index

from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .singlefile import should_save_singlefile, save_singlefile
from .readability import should_save_readability, save_readability
from .mercury import should_save_mercury, save_mercury
from .htmltotext import should_save_htmltotext, save_htmltotext
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers


ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
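# Illustrative sketch only (not shipped in this module): a custom extractor would follow the
# same (name, should_save_fn, save_fn) shape aliased above. The `should_save_example` helper
# and the 'example.txt' output file are hypothetical names used purely for illustration.
#
#   def should_save_example(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
#       out_dir = Path(out_dir or link.link_dir)
#       # re-run when forced, otherwise skip if the output already exists
#       return bool(overwrite) or not (out_dir / 'example.txt').exists()
#
#   EXAMPLE_METHOD: ArchiveMethodEntry = ('example', should_save_example, save_example)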
def get_default_archive_methods() -> List[ArchiveMethodEntry]:
    return [
        ('favicon', should_save_favicon, save_favicon),
        ('headers', should_save_headers, save_headers),
        ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('wget', should_save_wget, save_wget),
        # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
        ('title', should_save_title, save_title),
        ('readability', should_save_readability, save_readability),
        ('mercury', should_save_mercury, save_mercury),
        ('htmltotext', should_save_htmltotext, save_htmltotext),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
    ('readability', 1),
    ('mercury', 2),
    ('htmltotext', 3),
    ('singlefile', 4),
    ('dom', 5),
    ('wget', 6)
]
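# note: a lower number in the list above appears to mean a higher-priority text source when
# choosing which extractor's output to use for full-text search indexing (readability output
# preferred over raw wget output); this reading is inferred from usage, not stated here.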
@enforce_types
def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
    DEFAULT_METHODS = get_default_archive_methods()
    allowed_methods = {
        m for pat, methods in
        SAVE_ALLOWLIST_PTN.items()
        if pat.search(link.url)
        for m in methods
    } or { m[0] for m in DEFAULT_METHODS }
    denied_methods = {
        m for pat, methods in
        SAVE_DENYLIST_PTN.items()
        if pat.search(link.url)
        for m in methods
    }
    allowed_methods -= denied_methods

    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
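# Rough illustration of the allow/deny interaction above (the patterns and method sets are
# made-up values, not real config): given
#   SAVE_ALLOWLIST_PTN = {re.compile(r'youtube\.com'): {'media', 'title'}}
#   SAVE_DENYLIST_PTN  = {re.compile(r'\.pdf$'): {'media'}}
# a youtube.com URL runs only the 'media' and 'title' extractors, a URL matching no allowlist
# pattern falls back to every default method, and any URL matching the denylist pattern has
# 'media' stripped from whatever set it was otherwise allowed.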
@enforce_types
def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
    ARCHIVE_METHODS = get_default_archive_methods()
    return [x[0] for x in ARCHIVE_METHODS if x[0] not in to_ignore]
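# e.g. ignore_methods(['title', 'media']) -> ['favicon', 'headers', 'singlefile', ...]
# i.e. it returns the *names* of the methods to keep, not the full entries; callers typically
# pass the result straight back in as the methods= argument of archive_link()/archive_links().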
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
    """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""

    # TODO: Remove when the input is changed to be a snapshot. Suboptimal approach.
    from core.models import Snapshot, ArchiveResult
    try:
        snapshot = Snapshot.objects.get(url=link.url)  # TODO: This will be unnecessary once everything is a snapshot
    except Snapshot.DoesNotExist:
        snapshot = write_link_to_sql_index(link)

    active_methods = get_archive_methods_for_link(link)

    if methods:
        active_methods = [
            method for method in active_methods
            if method[0] in methods
        ]

    out_dir = out_dir or Path(link.link_dir)
    try:
        is_new = not Path(out_dir).exists()
        if is_new:
            os.makedirs(out_dir)

        link = load_link_details(link, out_dir=out_dir)
        write_link_details(link, out_dir=out_dir, skip_sql_index=False)
        log_link_archiving_started(link, out_dir, is_new)
        link = link.overwrite(updated=datetime.now(timezone.utc))
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
        start_ts = datetime.now(timezone.utc)

        for method_name, should_run, method_function in active_methods:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, out_dir, overwrite):
                    log_archive_method_started(method_name)

                    result = method_function(link=link, out_dir=out_dir)

                    link.history[method_name].append(result)

                    stats[result.status] += 1
                    log_archive_method_finished(result)
                    write_search_index(link=link, texts=result.index_texts)
                    ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                                 output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)

                    # bump the updated time on the main Snapshot here, this is critical
                    # to be able to cache summaries of the ArchiveResults for a given
                    # snapshot without having to load all the results from the DB each time.
                    # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
                    # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
                    snapshot.save()
                else:
                    # print('{black} X {}{reset}'.format(method_name, **ANSI))
                    stats['skipped'] += 1
            except Exception as e:
                # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
                # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
                # are fixed.
                """
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e
                """
                # Instead, use the kludgy workaround from
                # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
                with open(ERROR_LOG, "a", encoding='utf-8') as f:
                    command = ' '.join(sys.argv)
                    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                    f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
                        method_name,
                        link.url,
                        command,
                        ts
                    ) + "\n" + str(e) + "\n"))
                    #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

        # print(' ', stats)

        try:
            latest_title = link.history['title'][-1].output.strip()
            if latest_title and len(latest_title) >= len(link.title or ''):
                link = link.overwrite(title=latest_title)
        except Exception:
            pass

        write_link_details(link, out_dir=out_dir, skip_sql_index=False)

        log_link_archiving_finished(link, out_dir, is_new, stats, start_ts)

    except KeyboardInterrupt:
        try:
            write_link_details(link, out_dir=link.link_dir)
        except:
            pass
        raise

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link
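# Minimal usage sketch for archive_link() (assumes Django is already configured for an
# ArchiveBox data directory; the URL below is a placeholder, not a real snapshot):
#
#   from core.models import Snapshot
#   from archivebox.extractors import archive_link
#
#   snapshot = Snapshot.objects.get(url='https://example.com')
#   archive_link(snapshot.as_link(), overwrite=True, methods=['title', 'favicon'])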
@enforce_types
def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> List[Link]:

    if type(all_links) is QuerySet:
        num_links: int = all_links.count()
        get_link = lambda x: x.as_link()
        all_links = all_links.iterator()
    else:
        num_links: int = len(all_links)
        get_link = lambda x: x

    if num_links == 0:
        return []

    log_archiving_started(num_links)

    idx: int = 0
    try:
        for link in all_links:
            idx += 1
            to_archive = get_link(link)
            archive_link(to_archive, overwrite=overwrite, methods=methods, out_dir=Path(link.link_dir))
    except KeyboardInterrupt:
        log_archiving_paused(num_links, idx, link.timestamp)
        raise SystemExit(0)
    except BaseException:
        print()
        raise

    log_archiving_finished(num_links)
    return all_links
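# Minimal usage sketch for archive_links() (assumes Django is configured for the data
# directory; the queryset filter below is only an illustration):
#
#   from core.models import Snapshot
#   from archivebox.extractors import archive_links, ignore_methods
#
#   pending = Snapshot.objects.filter(title__isnull=True)
#   archive_links(pending, overwrite=False, methods=ignore_methods(['media']))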