import subprocess
import json as pyjson

from .fixtures import *
from archivebox.extractors import ignore_methods, get_default_archive_methods, should_save_title
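
# The tmp_path, process, and disable_extractors_dict arguments used throughout are
# pytest fixtures (pulled in via the star import from .fixtures above): process
# presumably initializes an archivebox collection inside tmp_path, and
# disable_extractors_dict is an env dict with every extractor toggled off, so each
# test can enable only the extractor(s) under test.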

def test_wget_broken_pipe(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_WGET": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    # Regression check: a broken pipe used to surface as a chmod_file() TypeError in the CLI output
    assert "TypeError chmod_file(..., path: str) got unexpected NoneType argument path=None" not in add_process.stdout.decode("utf-8")


def test_ignore_methods():
    """
    ignore_methods() should drop the named methods from the default methods list
    and return the remaining ones
    """
    ignored = ignore_methods(['title'])
    assert should_save_title not in ignored
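
# For context, ignore_methods() presumably filters get_default_archive_methods() --
# a minimal sketch, assuming that function yields (name, should_save_fn, save_fn)
# tuples and that ignore_methods() returns the should_save checks that remain:
#
#   def ignore_methods(to_ignore):
#       return [should_save for name, should_save, save in get_default_archive_methods()
#               if name not in to_ignore]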

def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
    allow_list = {
        r'/static': ["headers", "singlefile"],
        r'example\.com\.html$': ["headers"],
    }
    deny_list = {
        "/static": ["singlefile"],
    }
    disable_extractors_dict.update({
        "SAVE_HEADERS": "true",
        "USE_SINGLEFILE": "true",
        "SAVE_ALLOWLIST": pyjson.dumps(allow_list),
        "SAVE_DENYLIST": pyjson.dumps(deny_list),
    })
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    singlefile_file = archived_item_path / "singlefile.html"
    assert not singlefile_file.exists()
    headers_file = archived_item_path / "headers.json"
    assert headers_file.exists()
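
# Taken together, the lists above suggest the denylist wins on conflict: both lists
# match the /static URL for singlefile, and the asserted outcome is that singlefile
# output is skipped while the allowlisted headers extractor still runs.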

def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
    deny_list = {
        "/static": ["singlefile"],
    }
    disable_extractors_dict.update({
        "SAVE_HEADERS": "true",
        "USE_SINGLEFILE": "true",
        "SAVE_DENYLIST": pyjson.dumps(deny_list),
    })
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    singlefile_file = archived_item_path / "singlefile.html"
    assert not singlefile_file.exists()
    headers_file = archived_item_path / "headers.json"
    assert headers_file.exists()


def test_singlefile_works(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
    output_file = archived_item_path / "singlefile.html"
    assert output_file.exists()


def test_readability_works(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_READABILITY": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "readability" / "content.html"
    assert output_file.exists()


def test_mercury_works(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_MERCURY": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "mercury" / "content.html"
    assert output_file.exists()


def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"SAVE_HTMLTOTEXT": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "htmltotext.txt"
    assert output_file.exists()


def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "readability" / "content.html"
    assert output_file.exists()


def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_READABILITY": "true", "USE_SINGLEFILE": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "readability" / "content.html"
    assert output_file.exists()


def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"USE_READABILITY": "true", "SAVE_DOM": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "readability" / "content.html"
    assert output_file.exists()
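
# The three *_works_with_* variants above appear to cover readability's source
# selection: readability does not fetch the page itself, so each test pairs it with
# a different upstream extractor (wget, singlefile, dom) whose saved HTML it can
# use as input.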

def test_use_node_false_disables_readability_and_singlefile(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({
        "USE_READABILITY": "true",
        "SAVE_DOM": "true",
        "USE_SINGLEFILE": "true",
        "USE_NODE": "false",
    })
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    output_str = add_process.stdout.decode("utf-8")
    assert "> singlefile" not in output_str
    assert "> readability" not in output_str
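
# USE_NODE=false presumably acts as a master switch for the Node.js-based extractors
# (singlefile and readability both ship as npm packages), overriding their individual
# USE_* flags -- hence neither appears in the CLI output even though both are enabled.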

def test_headers_ignored(tmp_path, process, disable_extractors_dict):
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "headers.json"
    assert not output_file.exists()


def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/headers/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "headers.json"
    assert output_file.exists()
    with open(output_file, 'r', encoding='utf-8') as f:
        headers = pyjson.load(f)
    assert headers['Content-Language'] == 'en'
    assert headers['Content-Script-Type'] == 'text/javascript'
    assert headers['Content-Style-Type'] == 'text/css'

def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/redirect/headers/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "headers.json"
    with open(output_file, 'r', encoding='utf-8') as f:
        headers = pyjson.load(f)
    # The saved headers should belong to the final page in the redirect chain,
    # not to the intermediate redirect response
    assert headers['Content-Language'] == 'en'
    assert headers['Content-Script-Type'] == 'text/javascript'
    assert headers['Content-Style-Type'] == 'text/css'


def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
    disable_extractors_dict.update({"SAVE_HEADERS": "true"})
    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/400/example.com.html'],
                                 capture_output=True, env=disable_extractors_dict)
    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
    output_file = archived_item_path / "headers.json"
    with open(output_file, 'r', encoding='utf-8') as f:
        headers = pyjson.load(f)
    assert headers["Status-Code"] == "200"
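
# Note: despite the /static/400/ path, the asserted Status-Code is "200" -- the test
# server fixture evidently serves this path successfully, so the check presumably
# verifies that SAVE_HEADERS records the actual response status rather than inferring
# an error from the URL.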