Browse Source

Add parser for Pocket API

Pass a URL like `pocket://Username` to import that username's archived Pocket
library. Tokens need to be stored in ArchiveBox.conf with the following keys:

```
POCKET_CONSUMER_KEY = key-from-custom-pocket-app
POCKET_ACCESS_TOKENS = {"YourUsername": "pocket-token-for-app"}
```

`POCKET_ACCESS_TOKENS` MUST be on a single line, or the JSON will be
misinterpreted by the parser as a new key/value pair.
mAAdhaTTah 5 years ago
parent
commit
ac7ad9e942
4 changed files with 122 additions and 1 deletions
  1. 4 1
      archivebox/config.py
  2. 2 0
      archivebox/parsers/__init__.py
  3. 115 0
      archivebox/parsers/pocket_api.py
  4. 1 0
      setup.py

+ 4 - 1
archivebox/config.py

@@ -159,6 +159,9 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
         'YOUTUBEDL_BINARY':         {'type': str,   'default': 'youtube-dl'},
         'NODE_BINARY':              {'type': str,   'default': 'node'},
         'CHROME_BINARY':            {'type': str,   'default': None},
+
+        'POCKET_CONSUMER_KEY':      {'type': str,   'default': None},
+        'POCKET_ACCESS_TOKENS':     {'type': dict,  'default': {}},
     },
 }
 
@@ -386,7 +389,7 @@ def load_config_val(key: str,
             raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
         return int(val)
 
-    elif type is list:
+    elif type is list or type is dict:
         return json.loads(val)
 
     raise Exception('Config values can only be str, bool, int or json')

+ 2 - 0
archivebox/parsers/__init__.py

@@ -32,6 +32,7 @@ from ..index.schema import Link
 from ..logging_util import TimedProgress, log_source_saved
 
 from .pocket_html import parse_pocket_html_export
+from .pocket_api import parse_pocket_api_export
 from .pinboard_rss import parse_pinboard_rss_export
 from .wallabag_atom import parse_wallabag_atom_export
 from .shaarli_rss import parse_shaarli_rss_export
@@ -44,6 +45,7 @@ from .generic_txt import parse_generic_txt_export
 
 PARSERS = (
     # Specialized parsers
+    ('Pocket API', parse_pocket_api_export),
     ('Wallabag ATOM', parse_wallabag_atom_export),
     ('Pocket HTML', parse_pocket_html_export),
     ('Pinboard RSS', parse_pinboard_rss_export),

+ 115 - 0
archivebox/parsers/pocket_api.py

@@ -0,0 +1,115 @@
+__package__ = 'archivebox.parsers'
+
+
+import re
+
+from typing import IO, Iterable, Optional
+from datetime import datetime
+from configparser import ConfigParser
+
+from pathlib import Path
+from pocket import Pocket
+import requests
+
+from ..index.schema import Link
+from ..util import (
+    enforce_types,
+)
+from ..config import (
+    SOURCES_DIR
+)
+
+# Number of articles requested per Pocket API call; also the multiplier used to
+# turn a page index into the request offset.
+_COUNT_PER_PAGE = 500
+# File under SOURCES_DIR where the last `since` value is stored per username.
+_API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
+
+# search for broken protocols that sometimes come from the Pocket API
+# (a leading `http:/` or `https:/` that is NOT followed by a second slash)
+_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
+
+def get_pocket_articles(api: Pocket, since=None, page=0):
+    body, headers = api.get(
+        state='archive',
+        sort='oldest',
+        since=since,
+        count=_COUNT_PER_PAGE,
+        offset=page * _COUNT_PER_PAGE,
+    )
+
+    articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
+    returned_count = len(articles)
+
+    yield from articles
+
+    if returned_count == _COUNT_PER_PAGE:
+        yield from get_pocket_articles(api, since=since, page=page + 1)
+    else:
+        api.last_since = body['since']
+
+
+def link_from_article(article: dict, sources: list):
+    url: str = article['resolved_url'] or article['given_url']
+    broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
+    if broken_protocol:
+        url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
+    title = article['resolved_title'] or article['given_title'] or url
+
+    return Link(
+        url=url,
+        timestamp=article['time_read'],
+        title=title,
+        tags=article.get('tags'),
+        sources=sources
+    )
+
+def write_since(username: str, since: str):
+    from ..system import atomic_write
+
+    if not _API_DB_PATH.exists():
+        atomic_write(_API_DB_PATH, '')
+
+    since_file = ConfigParser()
+    since_file.optionxform = str
+    since_file.read(_API_DB_PATH)
+
+    since_file[username] = {
+        'since': since
+    }
+
+    with open(_API_DB_PATH, 'w+') as new:
+        since_file.write(new)
+
+def read_since(username: str) -> Optional[str]:
+    from ..system import atomic_write
+
+    if not _API_DB_PATH.exists():
+        atomic_write(_API_DB_PATH, '')
+
+    config_file = ConfigParser()
+    config_file.optionxform = str
+    config_file.read(_API_DB_PATH)
+
+    return config_file.get(username, 'since', fallback=None)
+
+@enforce_types
+def should_parse_as_pocket_api(text: str) -> bool:
+    return text.startswith('pocket://')
+
+@enforce_types
+def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
+    """Parse bookmarks from the Pocket API"""
+
+    input_buffer.seek(0)
+    pattern = re.compile("^pocket:\/\/(\w+)")
+    for line in input_buffer:
+      if should_parse_as_pocket_api(line):
+        from ..config import (
+          POCKET_CONSUMER_KEY,
+          POCKET_ACCESS_TOKENS,
+        )
+        username = pattern.search(line).group(1)
+        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
+        api.last_since = None
+
+        for article in get_pocket_articles(api, since=read_since(username)):
+          yield link_from_article(article, sources=[line])
+
+        write_since(username, api.last_since)

+ 1 - 0
setup.py

@@ -59,6 +59,7 @@ setuptools.setup(
         "python-crontab==2.5.1",
         "croniter==0.3.34",
         "w3lib==1.22.0",
+        "pocket==0.3.6",
         # Some/all of these will likely be added in the future:
         # wpull
         # pywb