main.py
import os
import re
import shutil

from typing import Dict, List, Optional, Iterable
from itertools import chain

from .schema import Link
from .util import (
    enforce_types,
    TimedProgress,
    get_dir_size,
    human_readable_size,
)
from .index import (
    links_after_timestamp,
    load_main_index,
    import_new_links,
    write_main_index,
)
from .storage.json import (
    parse_json_main_index,
    parse_json_link_details,
    parse_json_links_details,
)
from .storage.sql import parse_sql_main_index, get_admins
from .storage.html import parse_html_main_index
from .archive_methods import archive_link
from .config import (
    stderr,
    ANSI,
    ONLY_NEW,
    OUTPUT_DIR,
    SOURCES_DIR,
    ARCHIVE_DIR,
    LOGS_DIR,
    CONFIG_FILE,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    SQL_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    check_dependencies,
    check_data_folder,
    setup_django,
    write_config_file,
)
from .logs import (
    log_archiving_started,
    log_archiving_paused,
    log_archiving_finished,
    log_removal_started,
    log_removal_finished,
    log_list_started,
    log_list_finished,
)
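
# Files and folders that may already exist in OUTPUT_DIR without init()
# treating it as a non-empty, unrecognized (non-ArchiveBox) directory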
ALLOWED_IN_OUTPUT_DIR = {
    '.DS_Store',
    '.venv',
    'venv',
    'virtualenv',
    '.virtualenv',
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    STATIC_DIR_NAME,
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
}

@enforce_types
def init():
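    """Initialize a new ArchiveBox collection in OUTPUT_DIR, or verify and update an existing one."""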
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    is_empty = not len(set(os.listdir(OUTPUT_DIR)) - ALLOWED_IN_OUTPUT_DIR)
    existing_index = os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))

    if is_empty and not existing_index:
        print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f' {OUTPUT_DIR}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    elif existing_index:
        print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
        print(f' {OUTPUT_DIR}')
        print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    else:
        stderr(
            ("{red}[X] This folder appears to already have files in it, but no index.json is present.{reset}\n\n"
             " You must run init in a completely empty directory, or an existing data folder.\n\n"
             " {lightred}Hint:{reset} To import an existing data folder make sure to cd into the folder first,\n"
             " then run 'archivebox init' to pick up where you left off.\n\n"
             " (Always make sure your data folder is backed up first before updating ArchiveBox)"
            ).format(OUTPUT_DIR, **ANSI)
        )
        raise SystemExit(1)
    if existing_index:
        print('\n{green}[*] Verifying archive folder structure...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))

    os.makedirs(SOURCES_DIR, exist_ok=True)
    print(f' √ {SOURCES_DIR}')
    os.makedirs(ARCHIVE_DIR, exist_ok=True)
    print(f' √ {ARCHIVE_DIR}')
    os.makedirs(LOGS_DIR, exist_ok=True)
    print(f' √ {LOGS_DIR}')
    write_config_file({}, out_dir=OUTPUT_DIR)
    print(f' √ {CONFIG_FILE}')

    if os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)):
        print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
    else:
        print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))

    setup_django(OUTPUT_DIR, check_db=False)
    from django.conf import settings
    assert settings.DATABASE_FILE == os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME)
    print(f' √ {settings.DATABASE_FILE}')
    print()

    from .storage.sql import apply_migrations
    for migration_line in apply_migrations(OUTPUT_DIR):
        print(f' {migration_line}')

    assert os.path.exists(settings.DATABASE_FILE)

    # from django.contrib.auth.models import User
    # if IS_TTY and not User.objects.filter(is_superuser=True).exists():
    #     print('{green}[+] Creating admin user account...{reset}'.format(**ANSI))
    #     call_command("createsuperuser", interactive=True)

    print()
    print('{green}[*] Collecting links from any existing index or archive folders...{reset}'.format(**ANSI))

    all_links = {}
    if existing_index:
        all_links = {
            link.url: link
            for link in load_main_index(out_dir=OUTPUT_DIR, warn=False)
        }
        print(' √ Loaded {} links from existing main index...'.format(len(all_links)))

    orphaned_json_links = {
        link.url: link
        for link in parse_json_main_index(OUTPUT_DIR)
        if link.url not in all_links
    }
    if orphaned_json_links:
        all_links.update(orphaned_json_links)
        print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

    orphaned_sql_links = {
        link.url: link
        for link in parse_sql_main_index(OUTPUT_DIR)
        if link.url not in all_links
    }
    if orphaned_sql_links:
        all_links.update(orphaned_sql_links)
        print(' {lightyellow}√ Added {} orphaned links from existing SQL index...{reset}'.format(len(orphaned_sql_links), **ANSI))

    orphaned_data_dir_links = {
        link.url: link
        for link in parse_json_links_details(OUTPUT_DIR)
    }
    orphan_new_links = {
        url: link
        for url, link in orphaned_data_dir_links.items()
        if url not in all_links
    }
    orphan_duplicates = {
        url: link
        for url, link in orphaned_data_dir_links.items()
        if url in all_links
    }
    if orphan_new_links:
        all_links.update(orphan_new_links)
        print(' {lightyellow}√ Added {} orphaned links from existing archive directories...{reset}'.format(len(orphan_new_links), **ANSI))
    if orphan_duplicates:
        print(' {lightyellow}! Skipped adding {} invalid link data directories that would have overwritten or corrupted existing data.{reset}'.format(len(orphan_duplicates), **ANSI))

    orphaned_data_dirs = {folder for folder in orphan_duplicates.keys()}
    invalid_folders = {
        folder: link
        for folder, link in get_invalid_folders(all_links.values(), out_dir=OUTPUT_DIR).items()
        if folder not in orphaned_data_dirs
    }
    if invalid_folders:
        print(' {lightyellow}! Skipped adding {} corrupted/unrecognized link data directories that could not be read.{reset}'.format(len(invalid_folders), **ANSI))

    if orphan_duplicates or invalid_folders:
        print(' For more information about the link data directories that were skipped, run:')
        print(' archivebox info')
        print(' archivebox list --status=invalid')
        print(' archivebox list --status=orphaned')
        print(' archivebox list --status=duplicate')

    write_main_index(list(all_links.values()), out_dir=OUTPUT_DIR)
    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
    if existing_index:
        print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
    else:
        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
    print()
    print(' To view your archive index, open:')
    print(' {}'.format(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME)))
    print()
    print(' To add new links, you can run:')
    print(" archivebox add 'https://example.com'")
    print()
    print(' For more usage and examples, run:')
    print(' archivebox help')

@enforce_types
def info():
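    """Print a summary of the collection: index sizes, link counts per index, admin users, and the status of link data directories."""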
    print('{green}[*] Scanning archive collection main index...{reset}'.format(**ANSI))
    print(f' {OUTPUT_DIR}/*')
    num_bytes, num_dirs, num_files = get_dir_size(OUTPUT_DIR, recursive=False, pattern='index.')
    size = human_readable_size(num_bytes)
    print(f' Size: {size} across {num_files} files')
    print()

    links = list(load_main_index(out_dir=OUTPUT_DIR))
    num_json_links = len(links)
    num_sql_links = sum(1 for link in parse_sql_main_index(out_dir=OUTPUT_DIR))
    num_html_links = sum(1 for url in parse_html_main_index(out_dir=OUTPUT_DIR))
    num_link_details = sum(1 for link in parse_json_links_details(out_dir=OUTPUT_DIR))
    users = get_admins().values_list('username', flat=True)
    print(f' > JSON Main Index: {num_json_links} links'.ljust(36), f'(found in {JSON_INDEX_FILENAME})')
    print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')
    print(f' > HTML Main Index: {num_html_links} links'.ljust(36), f'(found in {HTML_INDEX_FILENAME})')
    print(f' > JSON Link Details: {num_link_details} links'.ljust(36), f'(found in {ARCHIVE_DIR_NAME}/*/index.json)')
    print(f' > Admin: {len(users)} users {", ".join(users)}'.ljust(36), f'(found in {SQL_INDEX_FILENAME})')

    if num_html_links != len(links) or num_sql_links != len(links):
        print()
        print(' {lightred}Hint:{reset} You can fix index count differences automatically by running:'.format(**ANSI))
        print(' archivebox init')

    if not users:
        print()
        print(' {lightred}Hint:{reset} You can create an admin user by running:'.format(**ANSI))
        print(' archivebox manage createsuperuser')

    print()
    print('{green}[*] Scanning archive collection link data directories...{reset}'.format(**ANSI))
    print(f' {ARCHIVE_DIR}/*')
    num_bytes, num_dirs, num_files = get_dir_size(ARCHIVE_DIR)
    size = human_readable_size(num_bytes)
    print(f' Size: {size} across {num_files} files in {num_dirs} directories')
    print()

    num_indexed = len(get_indexed_folders(links, out_dir=OUTPUT_DIR))
    num_archived = len(get_archived_folders(links, out_dir=OUTPUT_DIR))
    num_unarchived = len(get_unarchived_folders(links, out_dir=OUTPUT_DIR))
    print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
    print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
    print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')

    num_present = len(get_present_folders(links, out_dir=OUTPUT_DIR))
    num_valid = len(get_valid_folders(links, out_dir=OUTPUT_DIR))
    print()
    print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
    print(f' > valid: {num_valid}'.ljust(36), f'({get_valid_folders.__doc__})')

    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
    num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
    print(f' > invalid: {num_invalid}'.ljust(36), f'({get_invalid_folders.__doc__})')
    print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
    print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
    print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
    print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')

    if num_indexed:
        print()
        print(' {lightred}Hint:{reset} You can list link data directories by status like so:'.format(**ANSI))
        print(' archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)')

    if orphaned:
        print()
        print(' {lightred}Hint:{reset} To automatically import orphaned data directories into the main index, run:'.format(**ANSI))
        print(' archivebox init')

    if num_invalid:
        print()
        print(' {lightred}Hint:{reset} You may need to manually remove or fix some invalid data directories, afterwards make sure to run:'.format(**ANSI))
        print(' archivebox init')

    print()

@enforce_types
def update_archive_data(import_path: Optional[str]=None,
                        resume: Optional[float]=None,
                        only_new: bool=False,
                        index_only: bool=False) -> List[Link]:
    """The main ArchiveBox entrypoint. Everything starts here."""

    check_dependencies()
    check_data_folder()

    # Step 1: Load list of links from the existing index
    #         merge in and dedupe new links from import_path
    all_links: List[Link] = []
    new_links: List[Link] = []
    all_links = load_main_index(out_dir=OUTPUT_DIR)
    if import_path:
        all_links, new_links = import_new_links(all_links, import_path)

    # Step 2: Write updated index with deduped old and new links back to disk
    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR)

    if index_only:
        return all_links

    # Step 3: Run the archive methods for each link
    links = new_links if ONLY_NEW else all_links
    log_archiving_started(len(links), resume)
    idx: int = 0
    link: Link = None  # type: ignore
    try:
        for idx, link in enumerate(links_after_timestamp(links, resume)):
            archive_link(link, out_dir=link.link_dir)
    except KeyboardInterrupt:
        log_archiving_paused(len(links), idx, link.timestamp if link else '0')
        raise SystemExit(0)
    except:
        print()
        raise

    log_archiving_finished(len(links))

    # Step 4: Re-write links index with updated titles, icons, and resources
    all_links = load_main_index(out_dir=OUTPUT_DIR)
    write_main_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
    return all_links
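
# One matching predicate per supported filter_type, used to compare a link against a filter pattern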
LINK_FILTERS = {
    'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
    'substring': lambda link, pattern: pattern in link.url,
    'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
    'domain': lambda link, pattern: link.domain == pattern,
}

@enforce_types
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
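    """Return True if the link matches any of the given patterns using the chosen filter type (exact, substring, regex, or domain)."""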
    for pattern in filter_patterns:
        if LINK_FILTERS[filter_type](link, pattern):
            return True
    return False

@enforce_types
def list_archive_data(filter_patterns: Optional[List[str]]=None, filter_type: str='exact',
                      after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
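    """Yield links from the main index, optionally filtered by timestamp range and/or filter patterns."""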
    all_links = load_main_index(out_dir=OUTPUT_DIR)

    for link in all_links:
        if after is not None and float(link.timestamp) < after:
            continue
        if before is not None and float(link.timestamp) > before:
            continue

        if filter_patterns:
            if link_matches_filter(link, filter_patterns, filter_type):
                yield link
        else:
            yield link

@enforce_types
def remove_archive_links(filter_patterns: List[str], filter_type: str='exact',
                         after: Optional[float]=None, before: Optional[float]=None,
                         yes: bool=False, delete: bool=False) -> List[Link]:
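    """Remove matching links from the main index (deleting their data directories too if delete=True) and return the links that were kept."""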
    check_dependencies()
    check_data_folder()

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix=' ')
    try:
        links = list(list_archive_data(
            filter_patterns=filter_patterns,
            filter_type=filter_type,
            after=after,
            before=before,
        ))
    finally:
        timer.end()

    if not len(links):
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_list_finished(links)
    log_removal_started(links, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix=' ')
    try:
        to_keep = []
        all_links = load_main_index(out_dir=OUTPUT_DIR)
        for link in all_links:
            should_remove = (
                (after is not None and float(link.timestamp) < after)
                or (before is not None and float(link.timestamp) > before)
                or link_matches_filter(link, filter_patterns, filter_type)
            )
            if not should_remove:
                to_keep.append(link)
            elif should_remove and delete:
                shutil.rmtree(link.link_dir)
    finally:
        timer.end()

    write_main_index(links=to_keep, out_dir=OUTPUT_DIR, finished=True)
    log_removal_finished(len(all_links), len(to_keep))

    return to_keep
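
# Helpers used by init() and info() to categorize link data directories in ARCHIVE_DIR by status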
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links without checking archive status or data directory validity"""
    return {
        link.link_dir: link
        for link in links
    }

def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are archived with a valid data directory"""
    return {
        link.link_dir: link
        for link in filter(is_archived, links)
    }

def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are unarchived with no data directory or an empty data directory"""
    return {
        link.link_dir: link
        for link in filter(is_unarchived, links)
    }

def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are expected to exist based on the main index"""
    all_folders = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            link = None
            try:
                link = parse_json_link_details(entry.path)
            except Exception:
                pass

            all_folders[entry.path] = link

    return all_folders

def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs with a valid index matched to the main index and archived content"""
    return {
        link.link_dir: link
        for link in filter(is_valid, links)
    }

def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
    duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
    orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
    corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
    unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
    return {**duplicate, **orphaned, **corrupted, **unrecognized}

def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that conflict with other directories that have the same link URL or timestamp"""
    links = list(links)
    by_url = {link.url: 0 for link in links}
    by_timestamp = {link.timestamp: 0 for link in links}

    duplicate_folders = {}

    indexed_folders = {link.link_dir for link in links}
    data_folders = (
        entry.path
        for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
    )

    for path in chain(sorted(indexed_folders), sorted(data_folders)):
        link = None
        try:
            link = parse_json_link_details(path)
        except Exception:
            pass

        if link:
            # link folder has same timestamp as different link folder
            by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
            if by_timestamp[link.timestamp] > 1:
                duplicate_folders[path] = link

            # link folder has same url as different link folder
            by_url[link.url] = by_url.get(link.url, 0) + 1
            if by_url[link.url] > 1:
                duplicate_folders[path] = link

    return duplicate_folders

def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that contain a valid index but aren't listed in the main index"""
    links = list(links)
    indexed_folders = {link.link_dir: link for link in links}
    orphaned_folders = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
            link = None
            try:
                link = parse_json_link_details(entry.path)
            except Exception:
                pass

            if index_exists and entry.path not in indexed_folders:
                # folder is a valid link data dir with index details, but it's not in the main index
                orphaned_folders[entry.path] = link

    return orphaned_folders

def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that don't contain a valid index and aren't listed in the main index"""
    return {
        link.link_dir: link
        for link in filter(is_corrupt, links)
    }

def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that don't contain recognizable archive data and aren't listed in the main index"""
    by_timestamp = {link.timestamp: 0 for link in links}
    unrecognized_folders: Dict[str, Optional[Link]] = {}

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
            link = None
            try:
                link = parse_json_link_details(entry.path)
            except Exception:
                pass

            if index_exists and link is None:
                # index exists but it's corrupted or unparseable
                unrecognized_folders[entry.path] = link

            elif not index_exists:
                # link details index doesn't exist and the folder isn't in the main index
                timestamp = entry.path.rsplit('/', 1)[-1]
                if timestamp not in by_timestamp:
                    unrecognized_folders[entry.path] = link

    return unrecognized_folders

def is_valid(link: Link) -> bool:
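    """A link's data directory is valid if it exists, contains an index.json, and the parsed index URL matches the link's URL."""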
    dir_exists = os.path.exists(link.link_dir)
    index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
    if not dir_exists:
        # unarchived links are not included in the valid list
        return False
    if dir_exists and not index_exists:
        return False
    if dir_exists and index_exists:
        try:
            parsed_link = parse_json_link_details(link.link_dir)
            return link.url == parsed_link.url
        except Exception:
            pass
    return False

def is_corrupt(link: Link) -> bool:
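    """A link's data directory is corrupt if it exists but fails the is_valid() check."""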
    if not os.path.exists(link.link_dir):
        # unarchived links are not considered corrupt
        return False

    if is_valid(link):
        return False

    return True

def is_archived(link: Link) -> bool:
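    """A link counts as archived if its data directory is valid and the link itself is marked as archived."""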
    return is_valid(link) and link.is_archived

def is_unarchived(link: Link) -> bool:
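    """A link counts as unarchived if its data directory doesn't exist yet or the link is not marked as archived."""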
    if not os.path.exists(link.link_dir):
        return True
    return not link.is_archived