| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238 |
- __package__ = 'archivebox.core'
- from django import forms
- from archivebox.misc.util import URL_REGEX
- from taggit.utils import edit_string_for_tags, parse_tags
- from archivebox.base_models.admin import KeyValueWidget
- DEPTH_CHOICES = (
- ('0', 'depth = 0 (archive just these URLs)'),
- ('1', 'depth = 1 (+ URLs one hop away)'),
- ('2', 'depth = 2 (+ URLs two hops away)'),
- ('3', 'depth = 3 (+ URLs three hops away)'),
- ('4', 'depth = 4 (+ URLs four hops away)'),
- )
- from archivebox.hooks import get_plugins
- def get_plugin_choices():
- """Get available extractor plugins from discovered hooks."""
- return [(name, name) for name in get_plugins()]
- class AddLinkForm(forms.Form):
- # Basic fields
- url = forms.RegexField(
- label="URLs (one per line)",
- regex=URL_REGEX,
- min_length='6',
- strip=True,
- widget=forms.Textarea,
- required=True
- )
- tag = forms.CharField(
- label="Tags (comma separated tag1,tag2,tag3)",
- strip=True,
- required=False,
- widget=forms.TextInput(attrs={
- 'list': 'tag-datalist',
- 'autocomplete': 'off',
- })
- )
- depth = forms.ChoiceField(
- label="Archive depth",
- choices=DEPTH_CHOICES,
- initial='0',
- widget=forms.RadioSelect(attrs={"class": "depth-selection"})
- )
- notes = forms.CharField(
- label="Notes",
- strip=True,
- required=False,
- widget=forms.Textarea(attrs={
- 'rows': 3,
- 'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
- })
- )
- # Plugin groups
- chrome_plugins = forms.MultipleChoiceField(
- label="Chrome-dependent plugins",
- required=False,
- widget=forms.CheckboxSelectMultiple,
- choices=[], # populated in __init__
- )
- archiving_plugins = forms.MultipleChoiceField(
- label="Archiving",
- required=False,
- widget=forms.CheckboxSelectMultiple,
- choices=[],
- )
- parsing_plugins = forms.MultipleChoiceField(
- label="Parsing",
- required=False,
- widget=forms.CheckboxSelectMultiple,
- choices=[],
- )
- search_plugins = forms.MultipleChoiceField(
- label="Search",
- required=False,
- widget=forms.CheckboxSelectMultiple,
- choices=[],
- )
- binary_plugins = forms.MultipleChoiceField(
- label="Binary providers",
- required=False,
- widget=forms.CheckboxSelectMultiple,
- choices=[],
- )
- extension_plugins = forms.MultipleChoiceField(
- label="Browser extensions",
- required=False,
- widget=forms.CheckboxSelectMultiple,
- choices=[],
- )
- # Advanced options
- schedule = forms.CharField(
- label="Repeat schedule",
- max_length=64,
- required=False,
- widget=forms.TextInput(attrs={
- 'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
- })
- )
- persona = forms.CharField(
- label="Persona (authentication profile)",
- max_length=100,
- initial='Default',
- required=False,
- )
- overwrite = forms.BooleanField(
- label="Overwrite existing snapshots",
- initial=False,
- required=False,
- )
- update = forms.BooleanField(
- label="Update/retry previously failed URLs",
- initial=False,
- required=False,
- )
- index_only = forms.BooleanField(
- label="Index only (don't archive yet)",
- initial=False,
- required=False,
- )
- config = forms.JSONField(
- label="Custom config overrides",
- widget=KeyValueWidget(),
- initial=dict,
- required=False,
- )
- def __init__(self, *args, **kwargs):
- super().__init__(*args, **kwargs)
- # Import at runtime to avoid circular imports
- from archivebox.config.common import ARCHIVING_CONFIG
- # Get all plugins
- all_plugins = get_plugins()
- # Define plugin groups
- chrome_dependent = {
- 'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
- 'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
- 'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
- }
- archiving = {
- 'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
- 'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
- }
- parsing = {
- 'parse_html_urls', 'parse_jsonl_urls',
- 'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
- }
- search = {
- 'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
- }
- binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
- extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
- # Populate plugin field choices
- self.fields['chrome_plugins'].choices = [
- (p, p) for p in sorted(all_plugins) if p in chrome_dependent
- ]
- self.fields['archiving_plugins'].choices = [
- (p, p) for p in sorted(all_plugins) if p in archiving
- ]
- self.fields['parsing_plugins'].choices = [
- (p, p) for p in sorted(all_plugins) if p in parsing
- ]
- self.fields['search_plugins'].choices = [
- (p, p) for p in sorted(all_plugins) if p in search
- ]
- self.fields['binary_plugins'].choices = [
- (p, p) for p in sorted(all_plugins) if p in binary
- ]
- self.fields['extension_plugins'].choices = [
- (p, p) for p in sorted(all_plugins) if p in extensions
- ]
- # Set update default from config
- self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
- def clean(self):
- cleaned_data = super().clean()
- # Combine all plugin groups into single list
- all_selected_plugins = []
- for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
- 'search_plugins', 'binary_plugins', 'extension_plugins']:
- all_selected_plugins.extend(cleaned_data.get(field, []))
- # Store combined list for easy access
- cleaned_data['plugins'] = all_selected_plugins
- return cleaned_data
- class TagWidgetMixin:
- def format_value(self, value):
- if value is not None and not isinstance(value, str):
- value = edit_string_for_tags(value)
- return super().format_value(value)
- class TagWidget(TagWidgetMixin, forms.TextInput):
- pass
- class TagField(forms.CharField):
- widget = TagWidget
- def clean(self, value):
- value = super().clean(value)
- try:
- return parse_tags(value)
- except ValueError:
- raise forms.ValidationError(
- "Please provide a comma-separated list of tags."
- )
- def has_changed(self, initial_value, data_value):
- # Always return False if the field is disabled since self.bound_data
- # always uses the initial value in this case.
- if self.disabled:
- return False
- try:
- data_value = self.clean(data_value)
- except forms.ValidationError:
- pass
- if initial_value is None:
- initial_value = []
- initial_value = [tag.name for tag in initial_value]
- initial_value.sort()
- return initial_value != data_value
|