Browse Source

more fixes for wget_output_path

Nick Sweeting 1 year ago
parent
commit
4c5a3fba8b
2 changed files with 5 additions and 8 deletions
  1. 3 7
      archivebox/extractors/wget.py
  2. 2 1
      requirements.txt

+ 3 - 7
archivebox/extractors/wget.py

@@ -174,13 +174,12 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
 
     # check for literally any file present that isn't an empty folder
     domain_dir = Path(domain(link.url).replace(":", "+"))
-    files_within = list((Path(link.link_dir) / domain_dir).glob('**/*.*'))
+    files_within = [path for path in (Path(link.link_dir) / domain_dir).glob('**/*.*') if not str(path).endswith('.orig')]
     if files_within:
         return str((domain_dir / files_within[-1]).relative_to(link.link_dir))
 
     # abandon all hope, wget either never downloaded, or it produced an output path so horribly mutilated
     # that it's better we just pretend it doesn't exist
-
     # this is why ArchiveBox specializes in REDUNDANTLY saving copies of sites with multiple different tools
     return None
 
@@ -243,26 +242,24 @@ def wget_output_path(link: Link) -> Optional[str]:
     try:
         output_path = unsafe_wget_output_path(link)
     except Exception as err:
-        # print(err)
         pass           # better to pretend it just failed to download than expose gnarly OSErrors to users
 
-    
     # check for unprintable unicode characters
     # https://github.com/ArchiveBox/ArchiveBox/issues/1373
     if output_path:
         safe_path = output_path.encode('utf-8', 'replace').decode()
-        
         if output_path != safe_path:
             # contains unprintable unicode characters that will break other parts of archivebox
             # better to pretend it doesn't exist and fall back to parent dir than crash archivebox
             output_path = None
 
-
     # check for a path that is just too long to safely handle across different OS's
     # https://github.com/ArchiveBox/ArchiveBox/issues/549
     if output_path and len(output_path) > 250:
         output_path = None
 
+    if output_path:
+        return output_path
 
     # fallback to just the domain dir
     search_dir = Path(link.link_dir) / domain(link.url).replace(":", "+")
@@ -274,5 +271,4 @@ def wget_output_path(link: Link) -> Optional[str]:
     if search_dir.is_dir():
         return domain(link.url).split(":", 1)[0]
 
-   
     return None

+ 2 - 1
requirements.txt

@@ -15,7 +15,8 @@ croniter==2.0.5
 cryptography==42.0.7
 dateparser==1.2.0
 decorator==5.1.1
-django==5.0.4
+django==5.0.5
+django-admin-data-views==0.3.1
 django-auth-ldap==4.8.0
 django-extensions==3.2.3
 django-ninja==1.1.0