__init__.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. __package__ = 'archivebox.extractors'
  2. import os
  3. from pathlib import Path
  4. from typing import Optional, List, Iterable, Union
  5. from django.db.models import QuerySet, Model
  6. from ..index import (
  7. load_snapshot_details,
  8. write_snapshot_details,
  9. )
  10. from ..util import enforce_types
  11. from ..logging_util import (
  12. log_archiving_started,
  13. log_archiving_paused,
  14. log_archiving_finished,
  15. log_snapshot_archiving_started,
  16. log_snapshot_archiving_finished,
  17. log_archive_method_started,
  18. log_archive_method_finished,
  19. )
  20. from ..search import write_search_index
  21. from .title import should_save_title, save_title
  22. from .favicon import should_save_favicon, save_favicon
  23. from .wget import should_save_wget, save_wget
  24. from .singlefile import should_save_singlefile, save_singlefile
  25. from .readability import should_save_readability, save_readability
  26. from .mercury import should_save_mercury, save_mercury
  27. from .pdf import should_save_pdf, save_pdf
  28. from .screenshot import should_save_screenshot, save_screenshot
  29. from .dom import should_save_dom, save_dom
  30. from .git import should_save_git, save_git
  31. from .media import should_save_media, save_media
  32. from .archive_org import should_save_archive_dot_org, save_archive_dot_org
  33. from .headers import should_save_headers, save_headers
  34. def get_default_archive_methods():
  35. return [
  36. ('title', should_save_title, save_title),
  37. ('favicon', should_save_favicon, save_favicon),
  38. ('wget', should_save_wget, save_wget),
  39. ('singlefile', should_save_singlefile, save_singlefile),
  40. ('pdf', should_save_pdf, save_pdf),
  41. ('screenshot', should_save_screenshot, save_screenshot),
  42. ('dom', should_save_dom, save_dom),
  43. ('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
  44. ('mercury', should_save_mercury, save_mercury),
  45. ('git', should_save_git, save_git),
  46. ('media', should_save_media, save_media),
  47. ('headers', should_save_headers, save_headers),
  48. ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
  49. ]
  50. ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
  51. @enforce_types
  52. def ignore_methods(to_ignore: List[str]):
  53. ARCHIVE_METHODS = get_default_archive_methods()
  54. methods = filter(lambda x: x[0] not in to_ignore, ARCHIVE_METHODS)
  55. methods = map(lambda x: x[0], methods)
  56. return list(methods)
  57. @enforce_types
  58. def archive_snapshot(snapshot: Model, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Model:
  59. """download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
  60. from core.models import ArchiveResult
  61. ARCHIVE_METHODS = get_default_archive_methods()
  62. if methods:
  63. ARCHIVE_METHODS = [
  64. method for method in ARCHIVE_METHODS
  65. if method[0] in methods
  66. ]
  67. out_dir = out_dir or Path(snapshot.snapshot_dir)
  68. try:
  69. is_new = not Path(out_dir).exists()
  70. if is_new:
  71. os.makedirs(out_dir)
  72. details = {"history": {}}
  73. write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
  74. else:
  75. details = snapshot.details #TODO: This can be retrieved from the sqlite database too.
  76. # If that makes more sense, it can be easily changed.
  77. log_snapshot_archiving_started(snapshot, out_dir, is_new)
  78. stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
  79. for method_name, should_run, method_function in ARCHIVE_METHODS:
  80. try:
  81. if method_name not in details["history"]:
  82. details["history"][method_name] = []
  83. if should_run(snapshot, out_dir, overwrite):
  84. log_archive_method_started(method_name)
  85. result = method_function(snapshot=snapshot, out_dir=out_dir)
  86. stats[result.status] += 1
  87. log_archive_method_finished(result)
  88. write_search_index(snapshot=snapshot, texts=result.index_texts)
  89. ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
  90. output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
  91. else:
  92. # print('{black} X {}{reset}'.format(method_name, **ANSI))
  93. stats['skipped'] += 1
  94. except Exception as e:
  95. raise Exception('Exception in archive_methods.save_{}(Snapshot(url={}))'.format(
  96. method_name,
  97. snapshot.url,
  98. )) from e
  99. # print(' ', stats)
  100. try:
  101. latest_title_archive_result = snapshot.archiveresult_set.filter(extractor="title")
  102. if latest_title_archive_result.count() > 0:
  103. latest_title = latest_title_archive_result.output.strip()
  104. if len(latest_title) >= len(snapshot.title or ''):
  105. snapshot.title = latest_title
  106. except Exception:
  107. pass
  108. write_snapshot_details(snapshot, out_dir=out_dir, skip_sql_index=False)
  109. log_snapshot_archiving_finished(snapshot, snapshot.snapshot_dir, is_new, stats)
  110. except KeyboardInterrupt:
  111. try:
  112. write_snapshot_details(snapshot, out_dir=snapshot.snapshot_dir)
  113. except:
  114. pass
  115. raise
  116. except Exception as err:
  117. print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
  118. raise
  119. return snapshot
  120. @enforce_types
  121. def archive_snapshots(all_snapshots: Union[QuerySet, List[Model]], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> QuerySet:
  122. all_snapshots = list(all_snapshots)
  123. num_snapshots: int = len(all_snapshots)
  124. if num_snapshots == 0:
  125. return []
  126. log_archiving_started(num_snapshots)
  127. idx: int = 0
  128. try:
  129. for snapshot in all_snapshots:
  130. idx += 1
  131. archive_snapshot(snapshot, overwrite=overwrite, methods=methods, out_dir=Path(snapshot.snapshot_dir))
  132. except KeyboardInterrupt:
  133. log_archiving_paused(num_snapshots, idx, snapshot.timestamp)
  134. raise SystemExit(0)
  135. except BaseException:
  136. print()
  137. raise
  138. log_archiving_finished(num_snapshots)
  139. return all_snapshots