extractors.py

# browsertrix extractor

from pathlib import Path
from subprocess import run

# TimedProgress and copy_and_overwrite are assumed to be provided by the
# host project (ArchiveBox keeps similar helpers in its logging_util and
# system modules); minimal stand-ins are sketched below the extractor.


def save_browsertrix(link, out_dir, timeout, config):
    browsertrix_dir = out_dir / 'browsertrix'
    browsertrix_dir.mkdir(exist_ok=True)
    crawl_id = link.timestamp

    # arguments for browsertrix-crawler, which runs inside the browsertrix container
    browsertrix_crawler_cmd = [
        'crawl',
        '--url', link.url,
        f'--collection={crawl_id}',
        '--scopeType=page',
        '--generateWACZ',
        '--text=final-to-warc',
        '--timeLimit=60',
    ]

    # shell script piped into the container: clear state left over from any
    # previous crawl, then launch the crawler under this link's CRAWL_ID
    remote_cmd = f"""
        rm /tmp/dump.rdb;
        rm -rf /crawls/collections;
        mkdir /crawls/collections;
        env CRAWL_ID={crawl_id} {' '.join(browsertrix_crawler_cmd)}
    """

    # the script is delivered to the container's listener over netcat
    local_cmd = ['nc', 'browsertrix', '2222']

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(local_cmd, cwd=str(out_dir), input=remote_cmd.encode(),
                     capture_output=True, timeout=timeout)
        cmd_output = result.stdout.decode()

        # copy the finished WACZ out of the shared crawls volume
        wacz_output_file = Path('/browsertrix/crawls') / crawl_id / f'{crawl_id}.wacz'
        copy_and_overwrite(wacz_output_file, browsertrix_dir / wacz_output_file.name)
    except Exception as err:
        # assumed minimal error handling: mark the snapshot failed and keep the error text
        status = 'failed'
        cmd_output = str(err)
    finally:
        timer.end()
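
# The extractor above leans on two helpers it doesn't define. These are
# minimal stand-ins, assuming the host project's versions behave like
# ArchiveBox's: a progress timer with an .end() hook, and a copy that
# replaces any existing destination. Names and signatures here are
# illustrative, not the project's actual implementations.

import shutil
from datetime import datetime


class TimedProgress:
    """Stand-in progress timer: records elapsed wall-clock time."""
    def __init__(self, seconds, prefix=''):
        self.seconds = seconds
        self.prefix = prefix
        self.start_ts = datetime.now()

    def end(self):
        self.elapsed = (datetime.now() - self.start_ts).total_seconds()


def copy_and_overwrite(from_path, to_path):
    """Stand-in copy helper: clobber to_path if it already exists."""
    to_path = Path(to_path)
    if to_path.is_dir():
        shutil.rmtree(to_path)
    elif to_path.exists():
        to_path.unlink()
    if Path(from_path).is_dir():
        shutil.copytree(from_path, to_path)
    else:
        shutil.copy2(from_path, to_path)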

# Worked example of what remote_cmd expands to for a crawl with id tec2342:
#
#   rm /tmp/dump.rdb;
#   rm -rf /crawls/collections;
#   mkdir /crawls/collections;
#   env CRAWL_ID=tec2342 crawl --url 'https://example.com' --scopeType page --generateWACZ --collection tec2342 --text final-to-warc --timeLimit 60
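
# A minimal sketch of invoking the extractor, assuming an ArchiveBox-style
# link object with .url and .timestamp attributes; SimpleNamespace stands in
# for the real Link model, which is not shown in this file.
if __name__ == '__main__':
    from types import SimpleNamespace

    example_link = SimpleNamespace(url='https://example.com', timestamp='1234567890.0')
    save_browsertrix(example_link, out_dir=Path('.'), timeout=120, config={})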