2
0

forms.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. __package__ = 'archivebox.core'
  2. from django import forms
  3. from archivebox.misc.util import URL_REGEX
  4. from taggit.utils import edit_string_for_tags, parse_tags
  5. from archivebox.base_models.admin import KeyValueWidget
  6. DEPTH_CHOICES = (
  7. ('0', 'depth = 0 (archive just these URLs)'),
  8. ('1', 'depth = 1 (+ URLs one hop away)'),
  9. ('2', 'depth = 2 (+ URLs two hops away)'),
  10. ('3', 'depth = 3 (+ URLs three hops away)'),
  11. ('4', 'depth = 4 (+ URLs four hops away)'),
  12. )
  13. from archivebox.hooks import get_plugins
  14. def get_plugin_choices():
  15. """Get available extractor plugins from discovered hooks."""
  16. return [(name, name) for name in get_plugins()]
  17. class AddLinkForm(forms.Form):
  18. # Basic fields
  19. url = forms.RegexField(
  20. label="URLs (one per line)",
  21. regex=URL_REGEX,
  22. min_length='6',
  23. strip=True,
  24. widget=forms.Textarea,
  25. required=True
  26. )
  27. tag = forms.CharField(
  28. label="Tags (comma separated tag1,tag2,tag3)",
  29. strip=True,
  30. required=False,
  31. widget=forms.TextInput(attrs={
  32. 'list': 'tag-datalist',
  33. 'autocomplete': 'off',
  34. })
  35. )
  36. depth = forms.ChoiceField(
  37. label="Archive depth",
  38. choices=DEPTH_CHOICES,
  39. initial='0',
  40. widget=forms.RadioSelect(attrs={"class": "depth-selection"})
  41. )
  42. notes = forms.CharField(
  43. label="Notes",
  44. strip=True,
  45. required=False,
  46. widget=forms.Textarea(attrs={
  47. 'rows': 3,
  48. 'placeholder': 'Optional notes about this crawl (e.g., purpose, project name, context...)',
  49. })
  50. )
  51. # Plugin groups
  52. chrome_plugins = forms.MultipleChoiceField(
  53. label="Chrome-dependent plugins",
  54. required=False,
  55. widget=forms.CheckboxSelectMultiple,
  56. choices=[], # populated in __init__
  57. )
  58. archiving_plugins = forms.MultipleChoiceField(
  59. label="Archiving",
  60. required=False,
  61. widget=forms.CheckboxSelectMultiple,
  62. choices=[],
  63. )
  64. parsing_plugins = forms.MultipleChoiceField(
  65. label="Parsing",
  66. required=False,
  67. widget=forms.CheckboxSelectMultiple,
  68. choices=[],
  69. )
  70. search_plugins = forms.MultipleChoiceField(
  71. label="Search",
  72. required=False,
  73. widget=forms.CheckboxSelectMultiple,
  74. choices=[],
  75. )
  76. binary_plugins = forms.MultipleChoiceField(
  77. label="Binary providers",
  78. required=False,
  79. widget=forms.CheckboxSelectMultiple,
  80. choices=[],
  81. )
  82. extension_plugins = forms.MultipleChoiceField(
  83. label="Browser extensions",
  84. required=False,
  85. widget=forms.CheckboxSelectMultiple,
  86. choices=[],
  87. )
  88. # Advanced options
  89. schedule = forms.CharField(
  90. label="Repeat schedule",
  91. max_length=64,
  92. required=False,
  93. widget=forms.TextInput(attrs={
  94. 'placeholder': 'e.g., daily, weekly, 0 */6 * * * (every 6 hours)',
  95. })
  96. )
  97. persona = forms.CharField(
  98. label="Persona (authentication profile)",
  99. max_length=100,
  100. initial='Default',
  101. required=False,
  102. )
  103. overwrite = forms.BooleanField(
  104. label="Overwrite existing snapshots",
  105. initial=False,
  106. required=False,
  107. )
  108. update = forms.BooleanField(
  109. label="Update/retry previously failed URLs",
  110. initial=False,
  111. required=False,
  112. )
  113. index_only = forms.BooleanField(
  114. label="Index only (don't archive yet)",
  115. initial=False,
  116. required=False,
  117. )
  118. config = forms.JSONField(
  119. label="Custom config overrides",
  120. widget=KeyValueWidget(),
  121. initial=dict,
  122. required=False,
  123. )
  124. def __init__(self, *args, **kwargs):
  125. super().__init__(*args, **kwargs)
  126. # Import at runtime to avoid circular imports
  127. from archivebox.config.common import ARCHIVING_CONFIG
  128. # Get all plugins
  129. all_plugins = get_plugins()
  130. # Define plugin groups
  131. chrome_dependent = {
  132. 'accessibility', 'chrome', 'consolelog', 'dom', 'headers',
  133. 'parse_dom_outlinks', 'pdf', 'redirects', 'responses',
  134. 'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
  135. }
  136. archiving = {
  137. 'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
  138. 'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
  139. }
  140. parsing = {
  141. 'parse_html_urls', 'parse_jsonl_urls',
  142. 'parse_netscape_urls', 'parse_rss_urls', 'parse_txt_urls'
  143. }
  144. search = {
  145. 'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
  146. }
  147. binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
  148. extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
  149. # Populate plugin field choices
  150. self.fields['chrome_plugins'].choices = [
  151. (p, p) for p in sorted(all_plugins) if p in chrome_dependent
  152. ]
  153. self.fields['archiving_plugins'].choices = [
  154. (p, p) for p in sorted(all_plugins) if p in archiving
  155. ]
  156. self.fields['parsing_plugins'].choices = [
  157. (p, p) for p in sorted(all_plugins) if p in parsing
  158. ]
  159. self.fields['search_plugins'].choices = [
  160. (p, p) for p in sorted(all_plugins) if p in search
  161. ]
  162. self.fields['binary_plugins'].choices = [
  163. (p, p) for p in sorted(all_plugins) if p in binary
  164. ]
  165. self.fields['extension_plugins'].choices = [
  166. (p, p) for p in sorted(all_plugins) if p in extensions
  167. ]
  168. # Set update default from config
  169. self.fields['update'].initial = not ARCHIVING_CONFIG.ONLY_NEW
  170. def clean(self):
  171. cleaned_data = super().clean()
  172. # Combine all plugin groups into single list
  173. all_selected_plugins = []
  174. for field in ['chrome_plugins', 'archiving_plugins', 'parsing_plugins',
  175. 'search_plugins', 'binary_plugins', 'extension_plugins']:
  176. all_selected_plugins.extend(cleaned_data.get(field, []))
  177. # Store combined list for easy access
  178. cleaned_data['plugins'] = all_selected_plugins
  179. return cleaned_data
  180. class TagWidgetMixin:
  181. def format_value(self, value):
  182. if value is not None and not isinstance(value, str):
  183. value = edit_string_for_tags(value)
  184. return super().format_value(value)
  185. class TagWidget(TagWidgetMixin, forms.TextInput):
  186. pass
  187. class TagField(forms.CharField):
  188. widget = TagWidget
  189. def clean(self, value):
  190. value = super().clean(value)
  191. try:
  192. return parse_tags(value)
  193. except ValueError:
  194. raise forms.ValidationError(
  195. "Please provide a comma-separated list of tags."
  196. )
  197. def has_changed(self, initial_value, data_value):
  198. # Always return False if the field is disabled since self.bound_data
  199. # always uses the initial value in this case.
  200. if self.disabled:
  201. return False
  202. try:
  203. data_value = self.clean(data_value)
  204. except forms.ValidationError:
  205. pass
  206. if initial_value is None:
  207. initial_value = []
  208. initial_value = [tag.name for tag in initial_value]
  209. initial_value.sort()
  210. return initial_value != data_value