# test_extractors.py — integration tests for ArchiveBox extractor modules
  1. from .fixtures import *
  2. import json as pyjson
  3. from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
  4. def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
  5. disable_extractors_dict.update({"USE_WGET": "true"})
  6. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  7. capture_output=True, env=disable_extractors_dict)
  8. assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")
  9. def test_ignore_methods():
  10. """
  11. Takes the passed method out of the default methods list and returns that value
  12. """
  13. ignored = ignore_methods(['title'])
  14. assert should_save_title not in ignored
  15. def test_singlefile_works(tmp_path, process, disable_extractors_dict):
  16. disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
  17. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  18. capture_output=True, env=disable_extractors_dict)
  19. archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
  20. output_file = archived_item_path / "singlefile.html"
  21. assert output_file.exists()
  22. def test_readability_works(tmp_path, process, disable_extractors_dict):
  23. disable_extractors_dict.update({"USE_READABILITY": "true"})
  24. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  25. capture_output=True, env=disable_extractors_dict)
  26. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  27. output_file = archived_item_path / "readability" / "content.html"
  28. assert output_file.exists()
  29. def test_mercury_works(tmp_path, process, disable_extractors_dict):
  30. disable_extractors_dict.update({"USE_MERCURY": "true"})
  31. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  32. capture_output=True, env=disable_extractors_dict)
  33. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  34. output_file = archived_item_path / "mercury" / "content.html"
  35. assert output_file.exists()
  36. def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
  37. disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
  38. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  39. capture_output=True, env=disable_extractors_dict)
  40. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  41. output_file = archived_item_path / "readability" / "content.html"
  42. assert output_file.exists()
  43. def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict):
  44. disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"})
  45. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  46. capture_output=True, env=disable_extractors_dict)
  47. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  48. output_file = archived_item_path / "readability" / "content.html"
  49. assert output_file.exists()
  50. def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
  51. disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"})
  52. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  53. capture_output=True, env=disable_extractors_dict)
  54. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  55. output_file = archived_item_path / "readability" / "content.html"
  56. assert output_file.exists()
  57. def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
  58. disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true", "USE_SINGLEFILE": "true", "USE_NODE": "false"})
  59. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
  60. capture_output=True, env=disable_extractors_dict)
  61. output_str = add_process.stdout.decode("utf-8")
  62. assert "> singlefile" not in output_str
  63. assert "> readability" not in output_str
  64. def test_headers_ignored(tmp_path, process, disable_extractors_dict):
  65. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
  66. capture_output=True, env=disable_extractors_dict)
  67. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  68. output_file = archived_item_path / "headers.json"
  69. assert not output_file.exists()
  70. def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
  71. disable_extractors_dict.update({"SAVE_HEADERS": "true"})
  72. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
  73. capture_output=True, env=disable_extractors_dict)
  74. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  75. output_file = archived_item_path / "headers.json"
  76. assert output_file.exists()
  77. headers_file = archived_item_path / 'headers.json'
  78. with open(headers_file) as f:
  79. headers = pyjson.load(f)
  80. assert headers['Content-Language'] == 'en'
  81. assert headers['Content-Script-Type'] == 'text/javascript'
  82. assert headers['Content-Style-Type'] == 'text/css'
  83. def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
  84. disable_extractors_dict.update({"SAVE_HEADERS": "true"})
  85. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'],
  86. capture_output=True, env=disable_extractors_dict)
  87. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  88. output_file = archived_item_path / "headers.json"
  89. with open(output_file) as f:
  90. headers = pyjson.load(f)
  91. assert headers['Content-Language'] == 'en'
  92. assert headers['Content-Script-Type'] == 'text/javascript'
  93. assert headers['Content-Style-Type'] == 'text/css'
  94. def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
  95. disable_extractors_dict.update({"SAVE_HEADERS": "true"})
  96. add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'],
  97. capture_output=True, env=disable_extractors_dict)
  98. archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
  99. output_file = archived_item_path / "headers.json"
  100. with open(output_file) as f:
  101. headers = pyjson.load(f)
  102. assert headers["Status-Code"] == "200"