# archivebox/index/__init__.py
  1. __package__ = 'archivebox.index'
  2. import os
  3. import shutil
  4. from pathlib import Path
  5. from itertools import chain
  6. from typing import List, Tuple, Dict, Optional, Iterable
  7. from collections import OrderedDict
  8. from contextlib import contextmanager
  9. from urllib.parse import urlparse
  10. from django.db.models import QuerySet, Q, Model
  11. from ..util import (
  12. scheme,
  13. enforce_types,
  14. ExtendedEncoder,
  15. )
  16. from ..config import (
  17. ARCHIVE_DIR_NAME,
  18. SQL_INDEX_FILENAME,
  19. JSON_INDEX_FILENAME,
  20. OUTPUT_DIR,
  21. TIMEOUT,
  22. URL_BLACKLIST_PTN,
  23. stderr,
  24. OUTPUT_PERMISSIONS
  25. )
  26. from ..logging_util import (
  27. TimedProgress,
  28. log_indexing_process_started,
  29. log_indexing_process_finished,
  30. log_indexing_started,
  31. log_indexing_finished,
  32. log_parsing_finished,
  33. log_deduping_finished,
  34. )
  35. from .schema import Link, ArchiveResult
  36. from .html import (
  37. write_html_snapshot_details,
  38. )
  39. from .json import (
  40. pyjson,
  41. load_json_snapshot,
  42. write_json_snapshot_details,
  43. )
  44. from .sql import (
  45. write_sql_main_index,
  46. write_sql_snapshot_details,
  47. )
  48. from ..search import search_backend_enabled, query_search_index
  49. ### Link filtering and checking
  50. @enforce_types
  51. def merge_snapshots(a: Model, b: Model) -> Model:
  52. """deterministially merge two snapshots, favoring longer field values over shorter,
  53. and "cleaner" values over worse ones.
  54. TODO: Check if this makes sense with the new setup
  55. """
  56. return a
  57. assert a.base_url == b.base_url, f'Cannot merge two links with different URLs ({a.base_url} != {b.base_url})'
  58. # longest url wins (because a fuzzy url will always be shorter)
  59. url = a.url if len(a.url) > len(b.url) else b.url
  60. # best title based on length and quality
  61. possible_titles = [
  62. title
  63. for title in (a.title, b.title)
  64. if title and title.strip() and '://' not in title
  65. ]
  66. title = None
  67. if len(possible_titles) == 2:
  68. title = max(possible_titles, key=lambda t: len(t))
  69. elif len(possible_titles) == 1:
  70. title = possible_titles[0]
  71. # earliest valid timestamp
  72. timestamp = (
  73. a.timestamp
  74. if float(a.timestamp or 0) < float(b.timestamp or 0) else
  75. b.timestamp
  76. )
  77. # all unique, truthy tags
  78. tags_set = (
  79. set(tag.strip() for tag in (a.tags or '').split(','))
  80. | set(tag.strip() for tag in (b.tags or '').split(','))
  81. )
  82. tags = ','.join(tags_set) or None
  83. # all unique source entries
  84. sources = list(set(a.sources + b.sources))
  85. # all unique history entries for the combined archive methods
  86. all_methods = set(list(a.history.keys()) + list(a.history.keys()))
  87. history = {
  88. method: (a.history.get(method) or []) + (b.history.get(method) or [])
  89. for method in all_methods
  90. }
  91. for method in all_methods:
  92. deduped_jsons = {
  93. pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
  94. for result in history[method]
  95. }
  96. history[method] = list(reversed(sorted(
  97. (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
  98. key=lambda result: result.start_ts,
  99. )))
  100. return Snapshot(
  101. url=url,
  102. timestamp=timestamp,
  103. title=title,
  104. tags=tags,
  105. #sources=sources,
  106. #history=history,
  107. )
  108. @enforce_types
  109. def validate_snapshots(snapshots: List[Model]) -> List[Model]:
  110. timer = TimedProgress(TIMEOUT * 4)
  111. try:
  112. snapshots = archivable_snapshots(snapshots) # remove chrome://, about:, mailto: etc.
  113. snapshots = sorted_snapshots(snapshots) # deterministically sort the links based on timestamp, url
  114. snapshots = fix_duplicate_snapshots(snapshots) # merge/dedupe duplicate timestamps & urls
  115. finally:
  116. timer.end()
  117. return list(snapshots)
  118. @enforce_types
  119. def archivable_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
  120. """remove chrome://, about:// or other schemed links that cant be archived"""
  121. for snapshot in snapshots:
  122. try:
  123. urlparse(snapshot.url)
  124. except ValueError:
  125. continue
  126. if scheme(snapshot.url) not in ('http', 'https', 'ftp'):
  127. continue
  128. if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(snapshot.url):
  129. continue
  130. yield snapshot
  131. @enforce_types
  132. def fix_duplicate_snapshots(sorted_snapshots: Iterable[Model]) -> Iterable[Model]:
  133. """
  134. ensures that all non-duplicate links have monotonically increasing timestamps
  135. TODO: Review how to do this with the new snapshots refactor
  136. """
  137. return sorted_snapshots
  138. unique_urls: OrderedDict[str, Link] = OrderedDict()
  139. for snapshot in sorted_snapshots:
  140. if snapshot.url in unique_urls:
  141. # merge with any other links that share the same url
  142. link = merge_links(unique_urls[link.url], link)
  143. unique_urls[link.url] = link
  144. return unique_urls.values()
  145. @enforce_types
  146. def sorted_snapshots(snapshots: Iterable[Model]) -> Iterable[Model]:
  147. sort_func = lambda snapshot: (snapshot.timestamp.split('.', 1)[0], snapshot.url)
  148. return sorted(snapshots, key=sort_func, reverse=True)
  149. @enforce_types
  150. def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
  151. if not resume:
  152. yield from links
  153. return
  154. for link in links:
  155. try:
  156. if float(link.timestamp) <= resume:
  157. yield link
  158. except (ValueError, TypeError):
  159. print('Resume value and all timestamp values must be valid numbers.')
  160. @enforce_types
  161. def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
  162. """resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
  163. timestamp = timestamp.split('.')[0]
  164. nonce = 0
  165. # first try 152323423 before 152323423.0
  166. if timestamp not in used_timestamps:
  167. return timestamp
  168. new_timestamp = '{}.{}'.format(timestamp, nonce)
  169. while new_timestamp in used_timestamps:
  170. nonce += 1
  171. new_timestamp = '{}.{}'.format(timestamp, nonce)
  172. return new_timestamp
  173. ### Main Links Index
  174. @contextmanager
  175. @enforce_types
  176. def timed_index_update(out_path: Path):
  177. log_indexing_started(out_path)
  178. timer = TimedProgress(TIMEOUT * 2, prefix=' ')
  179. try:
  180. yield
  181. finally:
  182. timer.end()
  183. assert out_path.exists(), f'Failed to write index file: {out_path}'
  184. log_indexing_finished(out_path)
  185. @enforce_types
  186. def write_main_index(snapshots: List[Model], out_dir: Path=OUTPUT_DIR) -> None:
  187. """Writes links to sqlite3 file for a given list of links"""
  188. log_indexing_process_started(len(snapshots))
  189. try:
  190. with timed_index_update(out_dir / SQL_INDEX_FILENAME):
  191. write_sql_main_index(snapshots, out_dir=out_dir)
  192. os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
  193. except (KeyboardInterrupt, SystemExit):
  194. stderr('[!] Warning: Still writing index to disk...', color='lightyellow')
  195. stderr(' Run archivebox init to fix any inconsistencies from an ungraceful exit.')
  196. with timed_index_update(out_dir / SQL_INDEX_FILENAME):
  197. write_sql_main_index(links, out_dir=out_dir)
  198. os.chmod(out_dir / SQL_INDEX_FILENAME, int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
  199. raise SystemExit(0)
  200. log_indexing_process_finished()
  201. @enforce_types
  202. def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
  203. """
  204. Returns all of the snapshots currently in index
  205. """
  206. from core.models import Snapshot
  207. try:
  208. return Snapshot.objects.all()
  209. except (KeyboardInterrupt, SystemExit):
  210. raise SystemExit(0)
  211. @enforce_types
  212. def load_main_index_meta(out_dir: Path=OUTPUT_DIR) -> Optional[dict]:
  213. index_path = out_dir / JSON_INDEX_FILENAME
  214. if index_path.exists():
  215. with open(index_path, 'r', encoding='utf-8') as f:
  216. meta_dict = pyjson.load(f)
  217. meta_dict.pop('links')
  218. return meta_dict
  219. return None
  220. @enforce_types
  221. def parse_snapshots_from_source(source_path: str, root_url: Optional[str]=None) -> List[Model]:
  222. from ..parsers import parse_snapshots
  223. new_links: List[Model] = []
  224. # parse and validate the import file
  225. raw_snapshots, parser_name = parse_snapshots(source_path, root_url=root_url)
  226. new_snapshots = validate_snapshots(raw_snapshots)
  227. if parser_name:
  228. num_parsed = len(raw_snapshots)
  229. log_parsing_finished(num_parsed, parser_name)
  230. return new_snapshots
  231. @enforce_types
  232. def filter_new_urls(snapshots: QuerySet,
  233. new_snapshots: List) -> List:
  234. """
  235. Returns a list of Snapshots corresponding to the urls that were not present in the index
  236. """
  237. urls = {snapshot.url: snapshot for snapshot in new_snapshots}
  238. filtered_snapshots = snapshots.filter(url__in=urls.keys())
  239. for found_snapshot in filtered_snapshots:
  240. urls.pop(found_snapshot.url)
  241. log_deduping_finished(len(urls.keys()))
  242. return list(urls.values())
  243. ### Link Details Index
  244. @enforce_types
  245. def write_snapshot_details(snapshot: List[Model], out_dir: Optional[str]=None, skip_sql_index: bool=False) -> None:
  246. out_dir = out_dir or snapshot.snapshot_dir
  247. write_json_snapshot_details(snapshot, out_dir=out_dir)
  248. write_html_snapshot_details(snapshot, out_dir=out_dir)
  249. if not skip_sql_index:
  250. write_sql_snapshot_details(snapshot)
  251. @enforce_types
  252. def load_snapshot_details(snapshot: Model, out_dir: Optional[str]=None) -> Model:
  253. """check for an existing link archive in the given directory,
  254. and load+merge it into the given link dict
  255. """
  256. out_dir = out_dir or Path(snapshot.snapshot_dir)
  257. existing_snapshot = load_json_snapshot(Path(out_dir))
  258. if existing_snapshot:
  259. return merge_snapshots(existing_snapshot, snapshot)
  260. return snapshot
# Maps each --filter-type value to a builder that turns one pattern string
# into a django Q object for filtering Snapshots (by url, or by tag name).
LINK_FILTERS = {
    'exact': lambda pattern: Q(url=pattern),
    'substring': lambda pattern: Q(url__icontains=pattern),
    'regex': lambda pattern: Q(url__iregex=pattern),
    # domain: match the pattern immediately after the scheme, for each archivable scheme
    'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
    'tag': lambda pattern: Q(tags__name=pattern),
}
  268. @enforce_types
  269. def q_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
  270. q_filter = Q()
  271. for pattern in filter_patterns:
  272. try:
  273. q_filter = q_filter | LINK_FILTERS[filter_type](pattern)
  274. except KeyError:
  275. stderr()
  276. stderr(
  277. f'[X] Got invalid pattern for --filter-type={filter_type}:',
  278. color='red',
  279. )
  280. stderr(f' {pattern}')
  281. raise SystemExit(2)
  282. return snapshots.filter(q_filter)
  283. def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='search') -> QuerySet:
  284. if not search_backend_enabled():
  285. stderr()
  286. stderr(
  287. '[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True',
  288. color='red',
  289. )
  290. raise SystemExit(2)
  291. from core.models import Snapshot
  292. qsearch = Snapshot.objects.none()
  293. for pattern in filter_patterns:
  294. try:
  295. qsearch |= query_search_index(pattern)
  296. except:
  297. raise SystemExit(2)
  298. return snapshots & qsearch
  299. @enforce_types
  300. def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type: str='exact') -> QuerySet:
  301. if filter_type != 'search':
  302. return q_filter(snapshots, filter_patterns, filter_type)
  303. else:
  304. return search_filter(snapshots, filter_patterns, filter_type)
  305. def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  306. """indexed links without checking archive status or data directory validity"""
  307. return {snapshot.snapshot_dir: snapshot for snapshot in snapshots}
  308. def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  309. """indexed links that are archived with a valid data directory"""
  310. return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_archived, snapshots)}
  311. def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  312. """indexed links that are unarchived with no data directory or an empty data directory"""
  313. return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_unarchived, snapshots)}
  314. def get_present_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  315. """dirs that actually exist in the archive/ folder"""
  316. from core.models import Snapshot
  317. all_folders = {}
  318. for entry in (out_dir / ARCHIVE_DIR_NAME).iterdir():
  319. if entry.is_dir():
  320. snapshot = None
  321. try:
  322. snapshot = load_json_snapshot(Path(entry.path))
  323. except Exception:
  324. pass
  325. all_folders[entry.name] = snapshot
  326. return all_folders
  327. def get_valid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  328. """dirs with a valid index matched to the main index and archived content"""
  329. return {snapshot.snapshot_dir: snapshot for snapshot in filter(is_valid, snapshots)}
  330. def get_invalid_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  331. """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
  332. duplicate = get_duplicate_folders(snapshots, out_dir=OUTPUT_DIR)
  333. orphaned = get_orphaned_folders(snapshots, out_dir=OUTPUT_DIR)
  334. corrupted = get_corrupted_folders(snapshots, out_dir=OUTPUT_DIR)
  335. unrecognized = get_unrecognized_folders(snapshots, out_dir=OUTPUT_DIR)
  336. return {**duplicate, **orphaned, **corrupted, **unrecognized}
def get_duplicate_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
    """dirs that conflict with other directories that have the same link URL or timestamp"""
    # occurrence counts per url/timestamp; a count > 1 marks a duplicate
    by_url = {}
    by_timestamp = {}
    duplicate_folders = {}

    # on-disk data dirs whose folder name doesn't match any indexed snapshot timestamp
    data_folders = (
        str(entry)
        for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir()
        if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists()
    )

    # walk both the indexed snapshots and the unmatched on-disk folders
    for path in chain(snapshots.iterator(), data_folders):
        snapshot = None
        if type(path) is not str:
            # a Snapshot model instance, not a folder path string
            path = path.snapshot_dir
        try:
            snapshot = load_json_snapshot(Path(path))
        except Exception:
            # unreadable/missing detail index: skip counting this folder
            pass

        if snapshot:
            # snapshot folder has same timestamp as different link folder
            by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1
            if by_timestamp[snapshot.timestamp] > 1:
                duplicate_folders[path] = snapshot

            # link folder has same url as different link folder
            by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1
            if by_url[snapshot.url] > 1:
                duplicate_folders[path] = snapshot
    return duplicate_folders
  365. def get_orphaned_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  366. """dirs that contain a valid index but aren't listed in the main index"""
  367. orphaned_folders = {}
  368. for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
  369. if entry.is_dir():
  370. snapshot = None
  371. try:
  372. snapshot = load_json_snapshot(entry)
  373. except Exception:
  374. pass
  375. if snapshot and not snapshots.filter(timestamp=entry.name).exists():
  376. # folder is a valid link data dir with index details, but it's not in the main index
  377. orphaned_folders[str(entry)] = snapshot
  378. return orphaned_folders
  379. def get_corrupted_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
  380. """dirs that don't contain a valid index and aren't listed in the main index"""
  381. corrupted = {}
  382. for snapshot in snapshots.iterator():
  383. if is_corrupt(snapshot):
  384. corrupted[snapshot.snapshot_dir] = snapshot
  385. return corrupted
def get_unrecognized_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Model]]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    unrecognized_folders: Dict[str, Optional[Model]] = {}

    for entry in (Path(out_dir) / ARCHIVE_DIR_NAME).iterdir():
        if entry.is_dir():
            index_exists = (entry / "index.json").exists()
            snapshot = None
            try:
                snapshot = load_json_snapshot(entry)
            except KeyError:
                # Try to fix index
                if index_exists:
                    # currently a no-op: repair logic below is not yet ported
                    pass
                    # TODO: Implement the `guess` bit for snapshots
                    # try:
                    #     # Last attempt to repair the detail index
                    #     link_guessed = parse_json_snapshot_details(str(entry), guess=True)
                    #     write_json_snapshot_details(link_guessed, out_dir=str(entry))
                    #     link = parse_json_link_details(str(entry))
                    # except Exception:
                    #     pass

            if index_exists and snapshot is None:
                # index exists but it's corrupted or unparseable
                unrecognized_folders[str(entry)] = snapshot
            elif not index_exists:
                # link details index doesn't exist and the folder isn't in the main index
                timestamp = entry.name
                if not snapshots.filter(timestamp=timestamp).exists():
                    unrecognized_folders[str(entry)] = snapshot

    return unrecognized_folders
  416. def is_valid(snapshot: Model) -> bool:
  417. dir_exists = Path(snapshot.snapshot_dir).exists()
  418. index_exists = (Path(snapshot.snapshot_dir) / "index.json").exists()
  419. if not dir_exists:
  420. # unarchived links are not included in the valid list
  421. return False
  422. if dir_exists and not index_exists:
  423. return False
  424. if dir_exists and index_exists:
  425. try:
  426. # TODO: review if the `guess` was necessary here
  427. parsed_snapshot = load_json_snapshot(snapshot.snapshot_dir)
  428. return snapshot.url == parsed_snapshot.url
  429. except Exception:
  430. pass
  431. return False
  432. def is_corrupt(snapshot: Model) -> bool:
  433. if not Path(snapshot.snapshot_dir).exists():
  434. # unarchived links are not considered corrupt
  435. return False
  436. if is_valid(snapshot):
  437. return False
  438. return True
  439. def is_archived(snapshot: Model) -> bool:
  440. return is_valid(snapshot) and snapshot.is_archived
  441. def is_unarchived(snapshot: Model) -> bool:
  442. if not Path(snapshot.snapshot_dir).exists():
  443. return True
  444. return not snapshot.is_archived
def fix_invalid_folder_locations(out_dir: Path=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
    """Move archive data dirs whose folder name doesn't match their snapshot timestamp.

    Returns (fixed, cant_fix): dirs relocated to <archive>/<timestamp>, and dirs
    that couldn't be moved because the destination already exists.
    """
    fixed = []
    cant_fix = []
    for entry in os.scandir(out_dir / ARCHIVE_DIR_NAME):
        if entry.is_dir(follow_symlinks=True):
            if (Path(entry.path) / 'index.json').exists():
                try:
                    snapshot = load_json_snapshot(Path(entry.path))
                except KeyError:
                    snapshot = None
                if not snapshot:
                    continue

                # folder name should equal the snapshot's timestamp
                if not entry.path.endswith(f'/{snapshot.timestamp}'):
                    dest = out_dir / ARCHIVE_DIR_NAME / snapshot.timestamp
                    if dest.exists():
                        cant_fix.append(entry.path)
                    else:
                        shutil.move(entry.path, dest)
                        # NOTE(review): dest is a Path while cant_fix holds strs and
                        # the annotated return type is List[str] -- confirm callers
                        fixed.append(dest)
                        timestamp = entry.path.rsplit('/', 1)[-1]
                        # NOTE(review): these assertions compare against the pre-move
                        # entry.path, and the detail index is written back to the OLD
                        # location after the move -- looks suspicious, verify intent
                        assert snapshot.snapshot_dir == entry.path
                        assert snapshot.timestamp == timestamp
                        write_json_snapshot_details(snapshot, out_dir=entry.path)
    return fixed, cant_fix