# archivebox_init.py
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. __command__ = 'archivebox init'
import os
import sys
import argparse
from datetime import date
from pathlib import Path
from typing import Dict, IO, List, Optional

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR
from archivebox.misc.logging_util import SmartFormatter, reject_stdin
# NOTE(review): several names used below (CONSTANTS, VERSION, SERVER_CONFIG, Link,
# write_config_file, apply_migrations, load_main_index, write_main_index,
# fix_invalid_folder_locations, parse_json_main_index, parse_json_links_details,
# get_invalid_folders, run_subcommand) have no visible import in this chunk —
# confirm they are imported elsewhere in the full file.
  11. def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Path=DATA_DIR) -> None:
  12. """Initialize a new ArchiveBox collection in the current directory"""
  13. from core.models import Snapshot
  14. from rich import print
  15. # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
  16. # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
  17. # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
  18. is_empty = not len(set(os.listdir(out_dir)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
  19. existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
  20. if is_empty and not existing_index:
  21. print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
  22. print('[green]----------------------------------------------------------------------[/green]')
  23. elif existing_index:
  24. # TODO: properly detect and print the existing version in current index as well
  25. print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
  26. print('[green]----------------------------------------------------------------------[/green]')
  27. else:
  28. if force:
  29. print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
  30. print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
  31. else:
  32. print(
  33. ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
  34. " You must run init in a completely empty directory, or an existing data folder.\n\n"
  35. " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
  36. " then run and run 'archivebox init' to pick up where you left off.\n\n"
  37. " (Always make sure your data folder is backed up first before updating ArchiveBox)"
  38. )
  39. )
  40. raise SystemExit(2)
  41. if existing_index:
  42. print('\n[green][*] Verifying archive folder structure...[/green]')
  43. else:
  44. print('\n[green][+] Building archive folder structure...[/green]')
  45. print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
  46. Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
  47. Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
  48. Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
  49. print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
  50. # create the .archivebox_id file with a unique ID for this collection
  51. from archivebox.config.paths import _get_collection_id
  52. _get_collection_id(CONSTANTS.DATA_DIR, force_create=True)
  53. # create the ArchiveBox.conf file
  54. write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
  55. if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
  56. print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
  57. else:
  58. print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
  59. for migration_line in apply_migrations(out_dir):
  60. sys.stdout.write(f' {migration_line}\n')
  61. assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
  62. print()
  63. print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
  64. # from django.contrib.auth.models import User
  65. # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
  66. # print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
  67. # call_command("createsuperuser", interactive=True)
  68. print()
  69. print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
  70. all_links = Snapshot.objects.none()
  71. pending_links: Dict[str, Link] = {}
  72. if existing_index:
  73. all_links = load_main_index(out_dir=out_dir, warn=False)
  74. print(f' √ Loaded {all_links.count()} links from existing main index.')
  75. if quick:
  76. print(' > Skipping full snapshot directory check (quick mode)')
  77. else:
  78. try:
  79. # Links in data folders that dont match their timestamp
  80. fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
  81. if fixed:
  82. print(f' [yellow]√ Fixed {len(fixed)} data directory locations that didn\'t match their link timestamps.[/yellow]')
  83. if cant_fix:
  84. print(f' [red]! Could not fix {len(cant_fix)} data directory locations due to conflicts with existing folders.[/red]')
  85. # Links in JSON index but not in main index
  86. orphaned_json_links = {
  87. link.url: link
  88. for link in parse_json_main_index(out_dir)
  89. if not all_links.filter(url=link.url).exists()
  90. }
  91. if orphaned_json_links:
  92. pending_links.update(orphaned_json_links)
  93. print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
  94. # Links in data dir indexes but not in main index
  95. orphaned_data_dir_links = {
  96. link.url: link
  97. for link in parse_json_links_details(out_dir)
  98. if not all_links.filter(url=link.url).exists()
  99. }
  100. if orphaned_data_dir_links:
  101. pending_links.update(orphaned_data_dir_links)
  102. print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
  103. # Links in invalid/duplicate data dirs
  104. invalid_folders = {
  105. folder: link
  106. for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
  107. }
  108. if invalid_folders:
  109. print(f' [red]! Skipped adding {len(invalid_folders)} invalid link data directories.[/red]')
  110. print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(DATA_DIR)} {link}' for folder, link in invalid_folders.items()))
  111. print()
  112. print(' [violet]Hint:[/violet] For more information about the link data directories that were skipped, run:')
  113. print(' archivebox status')
  114. print(' archivebox list --status=invalid')
  115. except (KeyboardInterrupt, SystemExit):
  116. print(file=sys.stderr)
  117. print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
  118. print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
  119. print(file=sys.stderr)
  120. print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
  121. print(' archivebox init --quick', file=sys.stderr)
  122. raise SystemExit(1)
  123. write_main_index(list(pending_links.values()), out_dir=out_dir)
  124. print('\n[green]----------------------------------------------------------------------[/green]')
  125. from django.contrib.auth.models import User
  126. if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
  127. print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
  128. User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
  129. if existing_index:
  130. print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
  131. else:
  132. print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
  133. json_index = out_dir / CONSTANTS.JSON_INDEX_FILENAME
  134. html_index = out_dir / CONSTANTS.HTML_INDEX_FILENAME
  135. index_name = f"{date.today()}_index_old"
  136. if os.access(json_index, os.F_OK):
  137. json_index.rename(f"{index_name}.json")
  138. if os.access(html_index, os.F_OK):
  139. html_index.rename(f"{index_name}.html")
  140. CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
  141. CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
  142. CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
  143. from archivebox.config.common import STORAGE_CONFIG
  144. STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
  145. STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
  146. if install:
  147. run_subcommand('install', pwd=out_dir)
  148. if Snapshot.objects.count() < 25: # hide the hints for experienced users
  149. print()
  150. print(' [violet]Hint:[/violet] To view your archive index, run:')
  151. print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
  152. print()
  153. print(' To add new links, you can run:')
  154. print(" archivebox add < ~/some/path/to/list_of_links.txt")
  155. print()
  156. print(' For more usage and examples, run:')
  157. print(' archivebox help')
  158. @docstring(init.__doc__)
  159. def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
  160. parser = argparse.ArgumentParser(
  161. prog=__command__,
  162. description=init.__doc__,
  163. add_help=True,
  164. formatter_class=SmartFormatter,
  165. )
  166. parser.add_argument(
  167. '--force', # '-f',
  168. action='store_true',
  169. help='Ignore unrecognized files in current directory and initialize anyway',
  170. )
  171. parser.add_argument(
  172. '--quick', '-q',
  173. action='store_true',
  174. help='Run any updates or migrations without rechecking all snapshot dirs',
  175. )
  176. parser.add_argument(
  177. '--install', #'-s',
  178. action='store_true',
  179. help='Automatically install dependencies and extras used for archiving',
  180. )
  181. parser.add_argument(
  182. '--setup', #'-s',
  183. action='store_true',
  184. help='DEPRECATED: equivalent to --install',
  185. )
  186. command = parser.parse_args(args or ())
  187. reject_stdin(__command__, stdin)
  188. init(
  189. force=command.force,
  190. quick=command.quick,
  191. install=command.install or command.setup,
  192. out_dir=pwd or DATA_DIR,
  193. )
# Allow direct execution of this module (in addition to CLI dispatch).
if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)