|
|
@@ -0,0 +1,115 @@
|
|
|
+__package__ = 'archivebox.parsers'
|
|
|
+
|
|
|
+
|
|
|
+import re
|
|
|
+
|
|
|
+from typing import IO, Iterable, Optional
|
|
|
+from datetime import datetime
|
|
|
+from configparser import ConfigParser
|
|
|
+
|
|
|
+from pathlib import Path
|
|
|
+from pocket import Pocket
|
|
|
+import requests
|
|
|
+
|
|
|
+from ..index.schema import Link
|
|
|
+from ..util import (
|
|
|
+ enforce_types,
|
|
|
+)
|
|
|
+from ..config import (
|
|
|
+ SOURCES_DIR
|
|
|
+)
|
|
|
+
|
|
|
# Number of articles requested from the Pocket API per call; also used to
# compute the offset of each page.
_COUNT_PER_PAGE = 500

# File (ConfigParser/INI format) holding one section per Pocket username with
# the last `since` value seen for that account.
_API_DB_PATH = Path(SOURCES_DIR).joinpath('pocket_api.db')

# Matches a URL that starts with `http:/` or `https:/` followed by anything
# other than a second slash, i.e. a scheme separator that lost one slash.
# Such URLs sometimes come back from the Pocket API.
_BROKEN_PROTOCOL_RE = re.compile(r'^(http[s]?)(:/(?!/))')
|
|
|
+
|
|
|
def get_pocket_articles(api: Pocket, since=None, page=0):
    """Yield archived Pocket articles, fetching one page at a time.

    Pages of ``_COUNT_PER_PAGE`` items are requested through ``api.get`` with
    ``state='archive'`` and ``sort='oldest'``, starting at ``page``. A page
    holding fewer items than requested is treated as the final one; when it
    is reached, the ``since`` value of that response is stored on
    ``api.last_since``.

    Args:
        api: Pocket client; only its ``get`` method is called here.
        since: Forwarded unchanged as the ``since`` argument of every request.
        page: Zero-based index of the first page to fetch.

    Yields:
        The entries of each response's ``list`` field, in response order.
    """
    current_page = page
    while True:
        response, _headers = api.get(
            state='archive',
            sort='oldest',
            since=since,
            count=_COUNT_PER_PAGE,
            offset=current_page * _COUNT_PER_PAGE,
        )

        # `list` is either a mapping whose values are the articles or an
        # already-flat collection of articles.
        listing = response['list']
        if isinstance(listing, dict):
            listing = listing.values()
        fetched = len(listing)

        yield from listing

        if fetched != _COUNT_PER_PAGE:
            # Short page: nothing left to fetch, remember where we stopped.
            api.last_since = response['since']
            return

        current_page += 1
|
|
|
+
|
|
|
+
|
|
|
def link_from_article(article: dict, sources: list):
    """Build a ``Link`` from one Pocket API article record.

    Args:
        article: Article dict as yielded by ``get_pocket_articles``. Must
            contain ``time_read`` and, unless ``resolved_url`` is present and
            non-empty, ``given_url``. ``resolved_title``, ``given_title`` and
            ``tags`` are optional.
        sources: Passed through unchanged as the ``sources`` of the ``Link``.

    Returns:
        A ``Link`` whose URL is the resolved URL (falling back to the given
        URL) and whose title falls back from resolved title to given title to
        the URL itself.

    Raises:
        KeyError: If ``time_read`` is missing, or if neither a usable
            ``resolved_url`` nor a ``given_url`` key is present.
    """
    # The `or` fallbacks show the resolved_* fields are optional, so a record
    # that omits them entirely must not raise before the fallback is tried.
    url: str = article.get('resolved_url') or article['given_url']

    # Repair a scheme that lost one slash (`http:/host` -> `http://host`).
    # The pattern is anchored at the start, so only the leading scheme is
    # touched; a plain str.replace would also rewrite a valid `http://`
    # embedded later in the URL (e.g. in a query string) into `http:///`.
    url = _BROKEN_PROTOCOL_RE.sub(r'\1://', url, count=1)

    title = article.get('resolved_title') or article.get('given_title') or url

    return Link(
        url=url,
        timestamp=article['time_read'],
        title=title,
        # NOTE(review): forwarded as-is; confirm the value's shape matches
        # what Link expects for `tags`.
        tags=article.get('tags'),
        sources=sources,
    )
|
|
|
+
|
|
|
def write_since(username: str, since: str):
    """Persist ``since`` for ``username`` in the Pocket API state file.

    The state file at ``_API_DB_PATH`` is read as an INI document, the
    section named ``username`` is replaced with a single ``since`` option,
    and the whole document is written back.

    Args:
        username: Section name under which the value is stored.
        since: Value to store under the ``since`` option.
    """
    from ..system import atomic_write

    # Make sure the state file exists before it is read.
    state_file_missing = not _API_DB_PATH.exists()
    if state_file_missing:
        atomic_write(_API_DB_PATH, '')

    store = ConfigParser()
    # Keep option names exactly as written instead of lower-casing them.
    store.optionxform = str
    store.read(_API_DB_PATH)

    store[username] = {'since': since}

    with open(_API_DB_PATH, 'w+') as handle:
        store.write(handle)
|
|
|
+
|
|
|
def read_since(username: str) -> Optional[str]:
    """Return the stored ``since`` value for ``username``.

    Reads the INI state file at ``_API_DB_PATH``, creating it empty first if
    it does not exist.

    Args:
        username: Section name to look up.

    Returns:
        The ``since`` option of the ``username`` section, or ``None`` when
        the section or the option is absent.
    """
    from ..system import atomic_write

    # Make sure the state file exists before it is read.
    state_file_missing = not _API_DB_PATH.exists()
    if state_file_missing:
        atomic_write(_API_DB_PATH, '')

    store = ConfigParser()
    # Keep option names exactly as written instead of lower-casing them.
    store.optionxform = str
    store.read(_API_DB_PATH)

    stored_since = store.get(username, 'since', fallback=None)
    return stored_since
|
|
|
+
|
|
|
@enforce_types
def should_parse_as_pocket_api(text: str) -> bool:
    """Return True when ``text`` begins with the ``pocket://`` scheme."""
    scheme_prefix = 'pocket://'
    return text.startswith(scheme_prefix)
|
|
|
+
|
|
|
@enforce_types
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse bookmarks from the Pocket API

    Each line of ``input_buffer`` that starts with ``pocket://`` and is
    followed by a username triggers a fetch of that account's archived
    articles, starting from the ``since`` value stored for the username.
    Once all of an account's articles have been yielded, the new ``since``
    value reported by the API is stored for the next run.

    Args:
        input_buffer: Seekable text stream; it is rewound before reading.
        **_kwargs: Ignored.

    Yields:
        One ``Link`` per article, with the originating input line as its
        only source.

    Raises:
        KeyError: Presumably when ``POCKET_ACCESS_TOKENS`` has no entry for a
            username (it is indexed by username) - confirm against config.
    """

    input_buffer.seek(0)
    # Raw string: `\/` and `\w` are regex escapes, not string escapes, and a
    # non-raw literal triggers an invalid-escape-sequence warning.
    username_re = re.compile(r"^pocket:\/\/(\w+)")
    for line in input_buffer:
        if not should_parse_as_pocket_api(line):
            continue

        matched = username_re.search(line)
        if matched is None:
            # `pocket://` with no username after it: nothing to fetch.
            continue

        # Imported only once a Pocket line is actually found, so the config
        # values are not required for inputs that never reference Pocket.
        from ..config import (
            POCKET_CONSUMER_KEY,
            POCKET_ACCESS_TOKENS,
        )
        username = matched.group(1)
        api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
        # Filled in by get_pocket_articles when the last page is reached.
        api.last_since = None

        for article in get_pocket_articles(api, since=read_since(username)):
            yield link_from_article(article, sources=[line])

        # Only reached when the caller consumed every article for this user.
        write_since(username, api.last_since)
|