Initial implementation

JDC 5 years ago
parent
commit
b1f70b2197

+ 0 - 1
archivebox.egg-info

@@ -1 +0,0 @@
-pip_dist/archivebox.egg-info

+ 4 - 1
archivebox/core/admin.py

@@ -14,6 +14,9 @@ from django import forms
 from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
 
+from core.utils import get_icons
+from core.mixins import SearchResultsAdminMixin
+
 from index.html import snapshot_icons
 from util import htmldecode, urldecode, ansi_to_html
 from logging_util import printable_filesize
@@ -82,7 +85,7 @@ class SnapshotAdminForm(forms.ModelForm):
         return instance
 
 
-class SnapshotAdmin(admin.ModelAdmin):
+class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
     list_display = ('added', 'title_str', 'url_str', 'files', 'size')
     sort_fields = ('title_str', 'url_str', 'added')
     readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
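
For reference, the mixin takes effect purely through Python's method resolution order: because SearchResultsAdminMixin is listed before admin.ModelAdmin, its get_search_results() runs first and can call super() to reach Django's default implementation. A minimal, self-contained sketch of that pattern (class names are illustrative, not from this commit):

    class DefaultSearch:
        def get_search_results(self, term):
            return {f"default match for {term!r}"}

    class FullTextMixin:
        def get_search_results(self, term):
            # super() resolves to DefaultSearch because the mixin comes first in the MRO
            return super().get_search_results(term) | {f"full-text match for {term!r}"}

    class Admin(FullTextMixin, DefaultSearch):
        pass

    print(Admin().get_search_results("archivebox"))  # both kinds of matches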

+ 21 - 0
archivebox/core/mixins.py

@@ -0,0 +1,21 @@
+from django.db.models import Q, Case, When, Value, IntegerField
+
+from archivebox.search import search_index
+
+class SearchResultsAdminMixin(object):
+    def get_search_results(self, request, queryset, search_term):
+        ''' Append full-text search index matches to the default admin search results.
+        '''
+        qs, use_distinct = \
+            super(SearchResultsAdminMixin, self).get_search_results(
+                request, queryset, search_term)
+
+        search_term = search_term.strip()
+        if not search_term:
+            return qs, use_distinct
+
+        snapshot_ids = search_index(search_term)
+        qsearch = queryset.filter(id__in=snapshot_ids)
+        qs |= qsearch
+
+        return qs, use_distinct
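
In plain terms, the mixin ORs two querysets together: whatever Django's built-in admin search matched, plus any Snapshots whose ids came back from the full-text backend. A hedged sketch of the same merge outside the admin, assuming Snapshot has a title field standing in for the admin's usual search fields:

    from core.models import Snapshot
    from archivebox.search import search_index

    def combined_search(search_term: str):
        default_matches = Snapshot.objects.filter(title__icontains=search_term)       # stand-in for the stock admin search
        fulltext_matches = Snapshot.objects.filter(id__in=search_index(search_term))  # ids returned by the backend
        return default_matches | fulltext_matches  # queryset union, i.e. a single SQL OR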

+ 2 - 0
archivebox/extractors/__init__.py

@@ -23,6 +23,7 @@ from ..logging_util import (
     log_archive_method_started,
     log_archive_method_finished,
 )
+from ..search import write_search_index
 
 from .title import should_save_title, save_title
 from .favicon import should_save_favicon, save_favicon
@@ -107,6 +108,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                     link.history[method_name].append(result)
 
                     stats[result.status] += 1
+                    write_search_index(link=link, texts=result.index_texts)
                     log_archive_method_finished(result)
                     if not skip_index:
                         ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
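
The hook above runs once per extractor: after a method finishes, any plain text it returned on ArchiveResult.index_texts is forwarded to the search backend before the result is logged and saved. A hedged sketch of that step in isolation (index_result is an illustrative helper, not part of the commit):

    from archivebox.search import write_search_index

    def index_result(link, result):
        # extractors that produce no plain text leave index_texts as None,
        # and write_search_index() itself skips empty input
        if result.index_texts:
            write_search_index(link=link, texts=result.index_texts)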

+ 5 - 2
archivebox/extractors/readability.py

@@ -71,6 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         CURL_BINARY,
         link.url
     ]
+    readability_content = None
     timer = TimedProgress(timeout, prefix='      ')
     try:
         document = get_html(link, out_dir)
@@ -86,8 +87,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         result = run(cmd, cwd=out_dir, timeout=timeout)
         result_json = json.loads(result.stdout)
         output_folder.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent")
         atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), result_json.pop("textContent"))
+        atomic_write(str(output_folder / "content.txt"), readability_content)
         atomic_write(str(output_folder / "article.json"), result_json)
 
         # parse out number of files downloaded from last line of stderr:
@@ -117,5 +119,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
         cmd_version=READABILITY_VERSION,
         output=output,
         status=status,
-        **timer.stats,
+        index_texts= [readability_content] if readability_content else [],
+        **timer.stats,
     )

+ 1 - 0
archivebox/index/schema.py

@@ -39,6 +39,7 @@ class ArchiveResult:
     status: str
     start_ts: datetime
     end_ts: datetime
+    index_texts: Union[List[str], None] = None
     schema: str = 'ArchiveResult'
 
     def __post_init__(self):
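
With this field in place, an extractor opts into full-text indexing simply by returning text on its ArchiveResult. A hedged construction example (field names other than index_texts are recalled from schema.py, and all values are placeholders):

    from datetime import datetime
    from archivebox.index.schema import ArchiveResult

    result = ArchiveResult(
        cmd=['my-extractor', 'https://example.com'],   # placeholder command
        pwd='/data/archive/1600000000',                # placeholder output dir
        cmd_version='1.0.0',
        output='content.txt',
        status='succeeded',
        start_ts=datetime.now(),
        end_ts=datetime.now(),
        index_texts=['plain text extracted from the page'],  # picked up by write_search_index()
    )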

+ 40 - 0
archivebox/search/__init__.py

@@ -0,0 +1,40 @@
+from typing import List, Optional, Union
+from pathlib import Path
+
+from sonic import IngestClient, SearchClient
+
+from ..index.schema import Link, ArchiveResult
+from ..util import enforce_types
+from ..config import setup_django, OUTPUT_DIR
+
+
+@enforce_types
+def write_sonic_index(snapshot_id: str, texts: List[str]):
+    # TODO add variables to localhost, port, password, bucket, collection
+    with IngestClient("localhost", 1491, "SecretPassword") as ingestcl:
+        for text in texts:
+            ingestcl.push("archivebox", "snapshots", snapshot_id, str(text))
+
+@enforce_types
+def search_sonic_index(text: str) -> List:
+    with SearchClient("localhost", 1491, "SecretPassword") as querycl:
+        snap_ids = querycl.query("archivebox", "snapshots", text)
+    return snap_ids
+
+
+@enforce_types
+def search_index(text: str) -> List:
+    # get backend
+    return search_sonic_index(text)
+
+
+@enforce_types
+def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir: Path=OUTPUT_DIR, skip_text_index: bool=False) -> None:
+    setup_django(out_dir, check_db=True)
+    from core.models import Snapshot
+
+    if not skip_text_index and texts:
+        snap = Snapshot.objects.filter(url=link.url).first()
+        if snap:
+            # get backend
+            write_sonic_index(str(snap.id), texts)
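
A hedged end-to-end usage sketch: it assumes a Sonic server is reachable on localhost:1491 with the password hardcoded above (the TODO notes these should become configurable), that a Snapshot row already exists for the URL, and that Snapshot.as_link() is available (recalled from core.models, so it may differ):

    from archivebox.config import setup_django, OUTPUT_DIR
    from archivebox.search import write_search_index, search_index

    setup_django(OUTPUT_DIR, check_db=True)
    from core.models import Snapshot

    snap = Snapshot.objects.get(url='https://example.com')
    # push the page's plain text into Sonic under this snapshot's id
    write_search_index(link=snap.as_link(), texts=['full text of the archived page'])

    # query Sonic: returns the pushed object ids, i.e. Snapshot ids
    print(search_index('archived'))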