@@ -7,7 +7,7 @@ from datetime import datetime
 from index import (
     parse_json_link_index,
     write_link_index,
-    patch_index_title_hack,
+    update_main_index,
 )
 from config import (
     CURL_BINARY,
@@ -103,7 +103,9 @@ def archive_link(link_dir, link, overwrite=True):
         for archive_method in active_methods:
             archive_method(link_dir, link, overwrite=overwrite)
 
+
         write_link_index(link_dir, link)
+        update_main_index(link)
 
     except Exception as err:
         print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
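The new update_main_index(link) call keeps the top-level index in sync as each link finishes archiving. Its body is not part of this diff; below is a minimal sketch of what such a helper could look like, assuming a JSON main index at the archive root with a 'links' list keyed by url (file name, schema, and merge logic are assumptions, not the project's actual implementation):

import os
import json

def update_main_index(link, out_dir='.'):
    # Hypothetical sketch: merge one freshly archived link back into the
    # main index.json at the archive root. Path and schema are assumptions.
    index_path = os.path.join(out_dir, 'index.json')
    index = {'links': []}
    if os.path.exists(index_path):
        with open(index_path, 'r') as f:
            index = json.load(f)

    # Replace any existing entry with the same url, then append the new one
    links = [entry for entry in index.get('links', []) if entry.get('url') != link['url']]
    links.append(link)
    index['links'] = links

    # Write to a temp file and swap it in so a crash mid-write cannot corrupt the index
    tmp_path = index_path + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(index, f, indent=4, default=str)
    os.replace(tmp_path, index_path)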
@@ -218,7 +220,7 @@ def fetch_wget(link_dir, link, requisites=FETCH_WGET_REQUISITES, warc=FETCH_WARC
     try:
         result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
         end()
-        output = wget_output_path(link, look_in=domain_dir)
+        output = wget_output_path(link)
 
         output_tail = [' ' + line for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:] if line.strip()]
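The call site no longer passes look_in=domain_dir, which suggests the helper now derives wget's on-disk output location from the link itself. A rough sketch of that idea, assuming wget's usual mirror layout of hostname/path under the link directory (an illustration only, not the real helper; query strings, --adjust-extension renaming, and non-HTML content types need extra handling):

import os
from urllib.parse import urlparse, unquote

def wget_output_path(link, link_dir='.'):
    # Hypothetical sketch: guess where `wget --mirror` saved a page
    # based only on the link's URL.
    url = urlparse(link['url'])
    hostname = url.hostname or ''
    path = unquote(url.path or '/')

    if path.endswith('/') or not path:
        candidate = os.path.join(hostname, path.lstrip('/'), 'index.html')
    else:
        candidate = os.path.join(hostname, path.lstrip('/'))

    full_path = os.path.join(link_dir, candidate)
    return candidate if os.path.exists(full_path) else None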
@@ -391,11 +393,13 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     output = 'archive.org.txt'
     archive_org_url = None
+
     path = os.path.join(link_dir, output)
     if os.path.exists(path):
         archive_org_url = open(path, 'r').read().strip()
         return {'output': archive_org_url, 'status': 'skipped'}
+
     submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
     CMD = [
         CURL_BINARY,
@@ -412,7 +416,6 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
     end()
 
     content_location, errors = parse_archive_dot_org_response(result.stdout)
-
     if content_location:
         archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
     elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
@@ -427,6 +430,7 @@ def archive_dot_org(link_dir, link, timeout=TIMEOUT):
         output = e
         print_error_hints(cmd=CMD, pwd=link_dir, err=e)
 
+
     if not isinstance(output, Exception):
         # instead of writing None when archive.org rejects the url write the
         # url to resubmit it to archive.org. This is so when the user visits
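This section relies on parse_archive_dot_org_response to pull the saved snapshot path and any error names out of the curl output for the web.archive.org/save request. One way such a parser could work, assuming the Save Page Now response exposes the snapshot in a Content-Location header and error names in an X-Archive-Wayback-Runtime-Error header (assumed details, not the project's actual parser):

def parse_archive_dot_org_response(response):
    # Hypothetical sketch: scan the raw header bytes that curl printed for a
    # web.archive.org/save request and collect the relevant header values.
    content_location = []
    errors = []
    for line in response.decode(errors='replace').splitlines():
        if ':' not in line:
            continue
        name, _, value = line.partition(':')
        name, value = name.strip().lower(), value.strip()
        if name == 'content-location':
            # e.g. '/web/20190324000000/https://example.com'
            content_location.append(value)
        elif name == 'x-archive-wayback-runtime-error':
            # e.g. 'RobotAccessControlException: Blocked By Robots'
            errors.append(value)
    return content_location, errors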
@@ -499,7 +503,6 @@ def fetch_title(link_dir, link, timeout=TIMEOUT):
     # TODO: figure out how to do this without gnarly string replacement
     if title:
         link['title'] = title
-        patch_index_title_hack(link['url'], title)
 
     return {
         'cmd': 'fetch_page_title("{}")'.format(link['url']),