utils.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. from django.db.models import QuerySet
  2. from archivebox.util import enforce_types
  3. from archivebox.config import ANSI
  4. def log_index_started(url):
  5. print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
  6. print( )
  7. def get_file_result_content(res, extra_path, use_pwd=False):
  8. if use_pwd:
  9. fpath = f'{res.pwd}/{res.output}'
  10. else:
  11. fpath = f'{res.output}'
  12. if extra_path:
  13. fpath = f'{fpath}/{extra_path}'
  14. with open(fpath, 'r') as file:
  15. data = file.read()
  16. if data:
  17. return [data]
  18. return []
  19. # This should be abstracted by a plugin interface for extractors
  20. @enforce_types
  21. def get_indexable_content(results: QuerySet):
  22. if not results:
  23. return []
  24. # Only use the first method available
  25. res, method = results.first(), results.first().extractor
  26. if method not in ('readability', 'singlefile', 'dom', 'wget'):
  27. return []
  28. # This should come from a plugin interface
  29. # TODO: banish this duplication and get these from the extractor file
  30. if method == 'readability':
  31. return get_file_result_content(res, 'content.txt')
  32. elif method == 'singlefile':
  33. return get_file_result_content(res,'',use_pwd=True)
  34. elif method == 'dom':
  35. return get_file_result_content(res,'',use_pwd=True)
  36. elif method == 'wget':
  37. return get_file_result_content(res,'',use_pwd=True)