1 year ago · 4c5a3fba8b
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -174,13 +174,12 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
 
				 
			
 
				     # check for literally any file present that isnt an empty folder
			
 
				     domain_dir = Path(domain(link.url).replace(":", "+"))
			
 
				-    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
			
 
				+    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
			
 
				     if files_within:
			
 
				         return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
			
 
				 
			
 
				     # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
			
 
				     # that it's better we just pretend it doesnt exist
			
 
				-
			
 
				     # this is why ArchiveBox's specializes in REDUNDANTLY saving copies of sites with multiple different tools
			
 
				     return None
			
 
				 
			
@@ -243,26 +242,24 @@ def wget_output_path(link: Link) -> Optional[str]:
 
				     try:
			
 
				         output_path = unsafe_wget_output_path(link)
			
 
				     except Exception as err:
			
 
				-        # print(err)
			
 
				         pass           # better to pretend it just failed to download than expose gnarly OSErrors to users
			
 
				 
			
 
				-    
			
 
				     # check for unprintable unicode characters
			
 
				     # https://github.com/ArchiveBox/ArchiveBox/issues/1373
			
 
				     if output_path:
			
 
				         safe_path = output_path.encode('utf-8', 'replace').decode()
			
 
				-        
			
 
				         if output_path != safe_path:
			
 
				             # contains unprintable unicode characters that will break other parts of archivebox
			
 
				             # better to pretend it doesnt exist and fallback to parent dir than crash archivebox
			
 
				             output_path = None
			
 
				 
			
 
				-
			
 
				     # check for a path that is just too long to safely handle across different OS's
			
 
				     # https://github.com/ArchiveBox/ArchiveBox/issues/549
			
 
				     if output_path and len(output_path) > 250:
			
 
				         output_path = None
			
 
				 
			
 
				+    if output_path:
			
 
				+        return output_path
			
 
				 
			
 
				     # fallback to just the domain dir
			
 
				     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
			
@@ -274,5 +271,4 @@ def wget_output_path(link: Link) -> Optional[str]:
 
				     if search_dir.is_dir():
			
 
				         return domain(link.url).split(":", 1)[0]
			
 
				 
			
 
				-   
			
 
				     return None
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,8 @@ croniter==2.0.5
 
				 cryptography==42.0.7
			
 
				 dateparser==1.2.0
			
 
				 decorator==5.1.1
			
 
				-django==5.0.4
			
 
				+django==5.0.5
			
 
				+django-admin-data-views==0.3.1
			
 
				 django-auth-ldap==4.8.0
			
 
				 django-extensions==3.2.3
			
 
				 django-ninja==1.1.0