@@ -174,23 +174,29 @@ def wget_output_path(link: Link) -> Optional[str]:
     full_path = without_fragment(without_query(path(link.url))).strip('/')
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+") / urldecode(full_path)
     for _ in range(4):
-        if search_dir.exists():
-            if search_dir.is_dir():
-                html_files = [
-                    f for f in search_dir.iterdir()
-                    if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
-                ]
-                if html_files:
-                    return str(html_files[0].relative_to(link.link_dir))
-
-                # sometimes wget'd URLs have no ext and return non-html
-                # e.g. /some/example/rss/all -> some RSS XML content)
-                #      /some/other/url.o4g -> some binary unrecognized ext)
-                # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
-                last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
-                for file_present in search_dir.iterdir():
-                    if file_present == last_part_of_url:
-                        return str((search_dir / file_present).relative_to(link.link_dir))
+        try:
+            if search_dir.exists():
+                if search_dir.is_dir():
+                    html_files = [
+                        f for f in search_dir.iterdir()
+                        if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
+                    ]
+                    if html_files:
+                        return str(html_files[0].relative_to(link.link_dir))
+
+                    # sometimes wget'd URLs have no ext and return non-html
+                    # e.g. /some/example/rss/all -> some RSS XML content)
+                    #      /some/other/url.o4g -> some binary unrecognized ext)
+                    # test this with archivebox add --depth=1 https://getpocket.com/users/nikisweeting/feed/all
+                    last_part_of_url = urldecode(full_path.rsplit('/', 1)[-1])
+                    for file_present in search_dir.iterdir():
+                        if file_present == last_part_of_url:
+                            return str((search_dir / file_present).relative_to(link.link_dir))
+        except OSError:
+            # OSError 36 and others can happen here, caused by trying to check for impossible paths
+            # (paths derived from URLs can often contain illegal unicode characters or be too long,
+            #  causing the OS / filesystem to reject trying to open them with a system-level error)
+            pass
 
         # Move up one directory level
         search_dir = search_dir.parent
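
Not part of the patch above: a minimal standalone sketch (the overlong path below is made up for illustration) of the failure mode the new `except OSError` guards against. On Linux, a URL-derived path with a component longer than the filesystem's name limit (commonly 255 bytes) makes even `Path.exists()` raise `OSError` 36 (`ENAMETOOLONG`) instead of returning `False`:

```python
import errno
from pathlib import Path

# Hypothetical URL-derived path: one component far longer than NAME_MAX,
# so the filesystem rejects it before any existence check can run.
impossible_path = Path('/tmp') / ('a' * 300)

try:
    impossible_path.exists()  # raises OSError rather than returning False
except OSError as e:
    # On Linux this is errno 36 (ENAMETOOLONG); the except OSError in the
    # patch swallows it so the search can continue one directory level up.
    print(e.errno == errno.ENAMETOOLONG, e)
```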