pocket_api.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. __package__ = 'archivebox.parsers'
  2. import re
  3. import archivebox
  4. from typing import IO, Iterable, Optional
  5. from configparser import ConfigParser
  6. from pocket import Pocket
  7. from ..index.schema import Link
  8. from ..util import enforce_types
  9. from ..system import atomic_write
  10. from ..config import (
  11. POCKET_CONSUMER_KEY,
  12. POCKET_ACCESS_TOKENS,
  13. )
# Number of articles requested from the Pocket API per call; also used to
# compute the offset of each page.
COUNT_PER_PAGE = 500
# File under the data directory's 'sources' folder that stores the last
# 'since' value per username.
API_DB_PATH = archivebox.DATA_DIR / 'sources' / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
# (matches a leading 'http:/' or 'https:/' that is NOT followed by a second '/')
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
  18. def get_pocket_articles(api: Pocket, since=None, page=0):
  19. body, headers = api.get(
  20. state='archive',
  21. sort='oldest',
  22. since=since,
  23. count=COUNT_PER_PAGE,
  24. offset=page * COUNT_PER_PAGE,
  25. )
  26. articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
  27. returned_count = len(articles)
  28. yield from articles
  29. if returned_count == COUNT_PER_PAGE:
  30. yield from get_pocket_articles(api, since=since, page=page + 1)
  31. else:
  32. api.last_since = body['since']
  33. def link_from_article(article: dict, sources: list):
  34. url: str = article.get('resolved_url') or article['given_url']
  35. broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
  36. if broken_protocol:
  37. url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
  38. title = article.get('resolved_title') or article.get('given_title') or url
  39. return Link(
  40. url=url,
  41. timestamp=article['time_read'],
  42. title=title,
  43. tags=article.get('tags'),
  44. sources=sources
  45. )
  46. def write_since(username: str, since: str):
  47. if not API_DB_PATH.exists():
  48. atomic_write(API_DB_PATH, '')
  49. since_file = ConfigParser()
  50. since_file.optionxform = str
  51. since_file.read(API_DB_PATH)
  52. since_file[username] = {
  53. 'since': since
  54. }
  55. with open(API_DB_PATH, 'w+') as new:
  56. since_file.write(new)
  57. def read_since(username: str) -> Optional[str]:
  58. if not API_DB_PATH.exists():
  59. atomic_write(API_DB_PATH, '')
  60. config_file = ConfigParser()
  61. config_file.optionxform = str
  62. config_file.read(API_DB_PATH)
  63. return config_file.get(username, 'since', fallback=None)
  64. @enforce_types
  65. def should_parse_as_pocket_api(text: str) -> bool:
  66. return text.startswith('pocket://')
  67. @enforce_types
  68. def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
  69. """Parse bookmarks from the Pocket API"""
  70. input_buffer.seek(0)
  71. pattern = re.compile(r"^pocket:\/\/(\w+)")
  72. for line in input_buffer:
  73. if should_parse_as_pocket_api(line):
  74. username = pattern.search(line).group(1)
  75. api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
  76. api.last_since = None
  77. for article in get_pocket_articles(api, since=read_since(username)):
  78. yield link_from_article(article, sources=[line])
  79. write_since(username, api.last_since)
# Module-level identifiers for this parser: a short key, a display name, and
# the parsing entry point.
# NOTE(review): presumably consumed by the parser registry in
# archivebox.parsers — confirm against the importing module.
KEY = 'pocket_api'
NAME = 'Pocket API'
PARSER = parse_pocket_api_export