__init__.py

__package__ = 'archivebox.search'

from pathlib import Path
from typing import List, Union

from django.db.models import QuerySet
from django.conf import settings

from rich import print  # rich's print renders the [green]...[/] markup used below

import abx
import archivebox

from archivebox.index.schema import Link
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config.common import SEARCH_BACKEND_CONFIG
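
# Print a log line announcing that a URL is being added to the search index.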
def log_index_started(url):
    print('[green][*] Indexing url: {} in the search index[/]'.format(url))
    print()
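
# Read an extractor's output file for an ArchiveResult and return its contents
# as a single-item list of texts (or an empty list if the file is empty).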
def get_file_result_content(res, extra_path, use_pwd=False):
    if use_pwd:
        fpath = f'{res.pwd}/{res.output}'
    else:
        fpath = f'{res.output}'

    if extra_path:
        fpath = f'{fpath}/{extra_path}'

    with open(fpath, 'r', encoding='utf-8') as file:
        data = file.read()

    if data:
        return [data]
    return []
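
# Extract the indexable text for a snapshot from its first indexable
# ArchiveResult, based on which extractor produced it.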
# TODO: This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    if not results:
        return []

    # Only use the first method available
    res = results.first()
    method = res.extractor
    if method not in ('readability', 'singlefile', 'dom', 'wget'):
        return []

    # This should come from a plugin interface
    # TODO: banish this duplication and get these from the extractor file
    if method == 'readability':
        return get_file_result_content(res, 'content.txt', use_pwd=True)
    elif method in ('singlefile', 'dom', 'wget'):
        return get_file_result_content(res, '', use_pwd=True)
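
# Look up the search backend plugin matching the configured engine name.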
def import_backend():
    for backend in abx.as_dict(archivebox.pm.hook.get_SEARCHBACKENDS()).values():
        if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE:
            return backend
    raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend')
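
# Write a snapshot's extracted texts to the indexing backend, if one is enabled.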
@enforce_types
def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=settings.DATA_DIR, skip_text_index: bool=False) -> None:
    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
        return

    if not skip_text_index and texts:
        from core.models import Snapshot

        snap = Snapshot.objects.filter(url=link.url).first()
        backend = import_backend()
        if snap:
            try:
                backend.index(snapshot_id=str(snap.pk), texts=texts)
            except Exception as err:
                stderr()
                stderr(
                    f'[X] The search backend threw an exception={err}:',
                    color='red',
                )
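
# Run a full-text query against the search backend and return the matching
# Snapshots as a QuerySet.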
@enforce_types
def query_search_index(query: str, out_dir: Path=settings.DATA_DIR) -> QuerySet:
    from core.models import Snapshot

    if SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
        backend = import_backend()
        try:
            snapshot_pks = backend.search(query)
        except Exception as err:
            stderr()
            stderr(
                f'[X] The search backend threw an exception={err}:',
                color='red',
            )
            raise
        else:
            # TODO: preserve ordering from backend
            qsearch = Snapshot.objects.filter(pk__in=snapshot_pks)
            return qsearch

    return Snapshot.objects.none()
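
# Delete the given snapshots from the search backend's index.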
@enforce_types
def flush_search_index(snapshots: QuerySet):
    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
        return

    backend = import_backend()
    snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
    try:
        backend.flush(snapshot_pks)
    except Exception as err:
        stderr()
        stderr(
            f'[X] The search backend threw an exception={err}:',
            color='red',
        )
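
# (Re)index a batch of links: find each link's Snapshot, collect its indexable
# extractor output, and write it to the search index.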
@enforce_types
def index_links(links: Union[List[Link], None], out_dir: Path=settings.DATA_DIR):
    if not links:
        return

    from core.models import Snapshot, ArchiveResult

    for link in links:
        snap = Snapshot.objects.filter(url=link.url).first()
        if snap:
            results = ArchiveResult.objects.indexable().filter(snapshot=snap)
            log_index_started(link.url)
            try:
                texts = get_indexable_content(results)
            except Exception as err:
                stderr()
                stderr(
                    f'[X] An exception occurred while reading the indexable content={err}:',
                    color='red',
                )
            else:
                write_search_index(link, texts, out_dir=out_dir)
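
# Example usage (a minimal sketch, assuming an initialized ArchiveBox data dir
# and USE_SEARCHING_BACKEND enabled with a configured search backend):
#
#   from archivebox.search import query_search_index
#
#   matches = query_search_index('example query')  # QuerySet of Snapshots
#   for snapshot in matches:
#       print(snapshot.url)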