archivebox_init.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. #!/usr/bin/env python3
  2. __package__ = 'archivebox.cli'
  3. import os
  4. import sys
  5. from pathlib import Path
  6. from rich import print
  7. import rich_click as click
  8. from archivebox.misc.util import docstring, enforce_types
  9. @enforce_types
  10. def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
  11. """Initialize a new ArchiveBox collection in the current directory"""
  12. from archivebox.config import CONSTANTS, VERSION, DATA_DIR
  13. from archivebox.config.common import SERVER_CONFIG
  14. from archivebox.config.collection import write_config_file
  15. from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
  16. from archivebox.misc.db import apply_migrations
  17. # if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
  18. # print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
  19. # print("[red] You can run `archivebox list --json --with-headers > static_index.json` to manually generate it.[/red]", file=sys.stderr)
  20. is_empty = not len(set(os.listdir(DATA_DIR)) - CONSTANTS.ALLOWED_IN_DATA_DIR)
  21. existing_index = os.path.isfile(CONSTANTS.DATABASE_FILE)
  22. if is_empty and not existing_index:
  23. print(f'[turquoise4][+] Initializing a new ArchiveBox v{VERSION} collection...[/turquoise4]')
  24. print('[green]----------------------------------------------------------------------[/green]')
  25. elif existing_index:
  26. # TODO: properly detect and print the existing version in current index as well
  27. print(f'[green][*] Verifying and updating existing ArchiveBox collection to v{VERSION}...[/green]')
  28. print('[green]----------------------------------------------------------------------[/green]')
  29. else:
  30. if force:
  31. print('[red][!] This folder appears to already have files in it, but no index.sqlite3 is present.[/red]')
  32. print('[red] Because --force was passed, ArchiveBox will initialize anyway (which may overwrite existing files).[/red]')
  33. else:
  34. print(
  35. ("[red][X] This folder appears to already have files in it, but no index.sqlite3 present.[/red]\n\n"
  36. " You must run init in a completely empty directory, or an existing data folder.\n\n"
  37. " [violet]Hint:[/violet] To import an existing data folder make sure to cd into the folder first, \n"
  38. " then run and run 'archivebox init' to pick up where you left off.\n\n"
  39. " (Always make sure your data folder is backed up first before updating ArchiveBox)"
  40. )
  41. )
  42. raise SystemExit(2)
  43. if existing_index:
  44. print('\n[green][*] Verifying archive folder structure...[/green]')
  45. else:
  46. print('\n[green][+] Building archive folder structure...[/green]')
  47. print(f' + ./{CONSTANTS.ARCHIVE_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.SOURCES_DIR.relative_to(DATA_DIR)}, ./{CONSTANTS.LOGS_DIR.relative_to(DATA_DIR)}...')
  48. Path(CONSTANTS.SOURCES_DIR).mkdir(exist_ok=True)
  49. Path(CONSTANTS.ARCHIVE_DIR).mkdir(exist_ok=True)
  50. Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
  51. print(f' + ./{CONSTANTS.CONFIG_FILE.relative_to(DATA_DIR)}...')
  52. # create the .archivebox_id file with a unique ID for this collection
  53. from archivebox.config.paths import _get_collection_id
  54. _get_collection_id(DATA_DIR, force_create=True)
  55. # create the ArchiveBox.conf file
  56. write_config_file({'SECRET_KEY': SERVER_CONFIG.SECRET_KEY})
  57. if os.access(CONSTANTS.DATABASE_FILE, os.F_OK):
  58. print('\n[green][*] Verifying main SQL index and running any migrations needed...[/green]')
  59. else:
  60. print('\n[green][+] Building main SQL index and running initial migrations...[/green]')
  61. from archivebox.config.django import setup_django
  62. setup_django()
  63. for migration_line in apply_migrations(DATA_DIR):
  64. sys.stdout.write(f' {migration_line}\n')
  65. assert os.path.isfile(CONSTANTS.DATABASE_FILE) and os.access(CONSTANTS.DATABASE_FILE, os.R_OK)
  66. print()
  67. print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
  68. # from django.contrib.auth.models import User
  69. # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
  70. # print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
  71. # call_command("createsuperuser", interactive=True)
  72. print()
  73. print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
  74. from archivebox.core.models import Snapshot
  75. all_links = Snapshot.objects.none()
  76. pending_links: dict[str, SnapshotDict] = {}
  77. if existing_index:
  78. all_links = Snapshot.objects.all()
  79. print(f' √ Loaded {all_links.count()} links from existing main index.')
  80. if quick:
  81. print(' > Skipping orphan snapshot import (quick mode)')
  82. else:
  83. try:
  84. # Import orphaned links from legacy JSON indexes
  85. orphaned_json_links = {
  86. link_dict['url']: link_dict
  87. for link_dict in parse_json_main_index(DATA_DIR)
  88. if not all_links.filter(url=link_dict['url']).exists()
  89. }
  90. if orphaned_json_links:
  91. pending_links.update(orphaned_json_links)
  92. print(f' [yellow]√ Added {len(orphaned_json_links)} orphaned links from existing JSON index...[/yellow]')
  93. orphaned_data_dir_links = {
  94. link_dict['url']: link_dict
  95. for link_dict in parse_json_links_details(DATA_DIR)
  96. if not all_links.filter(url=link_dict['url']).exists()
  97. }
  98. if orphaned_data_dir_links:
  99. pending_links.update(orphaned_data_dir_links)
  100. print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
  101. if pending_links:
  102. for link_dict in pending_links.values():
  103. Snapshot.from_jsonl(link_dict)
  104. # Hint for orphaned snapshot directories
  105. print()
  106. print(' [violet]Hint:[/violet] To import orphaned snapshot directories and reconcile filesystem state, run:')
  107. print(' archivebox update')
  108. except (KeyboardInterrupt, SystemExit):
  109. print(file=sys.stderr)
  110. print('[yellow]:stop_sign: Stopped checking archive directories due to Ctrl-C/SIGTERM[/yellow]', file=sys.stderr)
  111. print(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.', file=sys.stderr)
  112. print(file=sys.stderr)
  113. print(' [violet]Hint:[/violet] In the future you can run a quick init without checking dirs like so:', file=sys.stderr)
  114. print(' archivebox init --quick', file=sys.stderr)
  115. raise SystemExit(1)
  116. print('\n[green]----------------------------------------------------------------------[/green]')
  117. from django.contrib.auth.models import User
  118. if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
  119. print('[green][+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.[/green]')
  120. User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
  121. if existing_index:
  122. print('[green][√] Done. Verified and updated the existing ArchiveBox collection.[/green]')
  123. else:
  124. print(f'[green][√] Done. A new ArchiveBox collection was initialized ({len(all_links) + len(pending_links)} links).[/green]')
  125. CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
  126. CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
  127. CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
  128. from archivebox.config.common import STORAGE_CONFIG
  129. STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
  130. STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
  131. if install:
  132. from archivebox.cli.archivebox_install import install as install_method
  133. install_method()
  134. if Snapshot.objects.count() < 25: # hide the hints for experienced users
  135. print()
  136. print(' [violet]Hint:[/violet] To view your archive index, run:')
  137. print(' archivebox server # then visit [deep_sky_blue4][link=http://127.0.0.1:8000]http://127.0.0.1:8000[/link][/deep_sky_blue4]')
  138. print()
  139. print(' To add new links, you can run:')
  140. print(" archivebox add < ~/some/path/to/list_of_links.txt")
  141. print()
  142. print(' For more usage and examples, run:')
  143. print(' archivebox help')
  144. @click.command()
  145. @click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
  146. @click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
  147. @click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
  148. @docstring(init.__doc__)
  149. def main(**kwargs) -> None:
  150. init(**kwargs)
# Allow running this file directly as a standalone script.
if __name__ == '__main__':
    main()