| Name | Last modified | |
|---|---|---|
| .. | | |
| README.md | 1 سال پیش | |
| __init__.py | 1 سال پیش | |
| example_js_extractor.js | 1 سال پیش | |
| extractor.py | 1 سال پیش | |
Truths about Extractors:
Snapshot worker should handle:
Extractor Worker should handle:
The extractor takes a URL as a CLI argument, a current working directory, and environment-variable options (or a config benedict).
SNAPSHOT ARCHIVING EVENTS:
SNAPSHOT_SKIPPED
Standardized Output files:
class FaviconResult(ArchiveResult):
    """ArchiveResult that fetches a site's /favicon.ico with curl.

    Design sketch: defines the command to run, how to enumerate/hash the
    produced output files, and how to surface a thumbnail/icon for the UI.
    """

    # External binaries this extractor may invoke.
    dependencies: ClassVar[list[str]] = ['yt-dlp', 'curl', 'ffmpeg']
    # Execution context: one of 'shell' or 'puppeteer'.
    # (Original sketch wrote "'shell' | 'puppeteer'", which is not valid
    # Python for str values — a Literal['shell', 'puppeteer'] annotation
    # would express the constraint properly.)
    context: ClassVar[str] = 'shell'

    # Fields inherited from ArchiveResult (listed for reference):
    # snapshot: Snapshot
    # extractor: str
    # start_ts: datetime
    # end_ts: datetime
    # exit_code: int
    # stdout: str
    # stderr: str
    # cmd: list[str]
    # cmd_version: str
    # config: dict
    # status: str
    # retry_at: datetime | None
    # iface: NetworkInterface | None
    # machine: Machine | None
    # persona: Persona | None

    class Meta:
        verbose_name: str = 'Favicon'
        verbose_name_plural: str = 'Favicons'

    def save(self, *args, **kwargs):
        """Persist the result; intended to lazily populate output_files first."""
        # if not self.output_files:
        #     self.output_files = self.get_output_files()
        ...

    def get_cmd(self) -> list[str]:
        """Build the curl argv that downloads <domain>/favicon.ico."""
        binary = archivebox.pm.hook.get_BINARY('curl')
        return [binary.name, '-fsSL', '-o', 'favicon.ico', domain_only(self.snapshot.url) + '/favicon.ico']

    def get_cmd_version(self) -> str:
        """Return the version string of the curl binary that will be used."""
        binary = archivebox.pm.hook.get_BINARY('curl')
        return binary.version

    def get_output_files(self) -> dict[str, dict]:
        """Walk OUTPUT_DIR and return {relative_path: metadata} for every entry.

        Files get real hashes, sizes, and mimetypes; directories get a
        merkle-style hash computed over the files beneath them.
        (Original sketch annotated the return as list[dict], but it builds
        and returns a dict keyed by relative path.)
        """
        output_files: dict[str, dict] = {}
        output_dirs: dict[str, dict] = {}
        for path in self.OUTPUT_DIR.rglob('*'):
            relpath = str(path.relative_to(self.OUTPUT_DIR))
            if path.is_file():
                output_files[relpath] = {
                    'path': relpath,
                    'hash_sha256': hash_file(path, 'sha256'),
                    'hash_blake3': hash_file(path, 'blake3'),
                    'size': path.stat().st_size,
                    'mimetype': detect_mimetype(path),
                }
            else:
                # Placeholder entry; hashes/size are filled in below once all
                # contained files are known.
                output_dirs[relpath] = {
                    'path': relpath,
                    'hash_sha256': None,
                    'hash_blake3': None,
                    'size': None,
                    'mimetype': 'inode/directory',
                }
        # Derive each directory's hashes and total size from its subfiles.
        # (Renamed loop var from `dir` to avoid shadowing the builtin.)
        for dir_info in output_dirs.values():
            subfiles = {p: f for p, f in output_files.items() if p.startswith(dir_info['path'])}
            dir_info['hash_sha256'] = hash_dir(dir_info['path'], 'sha256', subfiles)
            dir_info['hash_blake3'] = hash_dir(dir_info['path'], 'blake3', subfiles)
            dir_info['size'] = sum(f['size'] for f in subfiles.values())
        return {**output_files, **output_dirs}

    def get_output_text(self) -> str | None:
        """Relative path of the primary output file."""
        return 'favicon.ico'

    def get_indexable_text(self) -> str | None:
        """Favicons contain no text worth indexing."""
        return ''

    def get_thumbnail(self) -> dict | None:
        """Return thumbnail metadata for the fetched favicon.

        (Original sketch annotated the return as str | None, but it builds
        and returns a metadata dict.)
        """
        width, height = get_image_dimensions(self.OUTPUT_DIR / 'favicon.png')
        return {
            'path': self.favicon_uri,
            'abspath': self.OUTPUT_DIR / self.favicon_uri,
            'width': width,
            'height': height,
            'mimetype': 'image/png',
            'extension': 'png',
        }

    def get_icon(self) -> dict | None:
        """A favicon's icon is its own thumbnail."""
        return self.get_thumbnail()

    def migrate_from_0_7_2(self) -> None:
        """Migrate output_dir generated by ArchiveBox <= 0.7.2 to current version.

        Migration plan:
        - For each ArchiveResult:
            - move it into a subdir named after the extractor (+ rename if needed)
            - calculate a merkle tree of all files in the output_dir
            - save the merkle tree to .merkle.json
            - symlink old location -> new location for backwards compatibility
        - For each Snapshot:
            - move data/archive/<timestamp> -> data/archive/snapshots/<abid>
            - symlink old location -> new location
        """
        print(f'{type(self).__name__}[{self.ABID}].migrate_from_0_7_2()')
        # TODO: move favicon.png -> self.OUTPUT_DIR / favicon.png
class TitleResult(ArchiveResult):
dependencies: ClassVar[list[str]] = ['chrome', 'puppeteer']
context: ClassVar[str] = 'puppeteer'