pocket_api.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. __package__ = 'archivebox.parsers'
  2. import re
  3. from typing import IO, Iterable, Optional
  4. from configparser import ConfigParser
  5. import archivebox
  6. from archivebox.config import CONSTANTS
  7. from archivebox.misc.util import enforce_types
  8. from archivebox.misc.system import atomic_write
  9. from ..index.schema import Link
# Number of articles requested per Pocket API call; also the stride used to compute page offsets.
COUNT_PER_PAGE = 500
# File under the sources directory in which the per-user 'since' values are stored.
API_DB_PATH = CONSTANTS.SOURCES_DIR / 'pocket_api.db'
# search for broken protocols that sometimes come from the Pocket API
# (a leading "http:/" or "https:/" that is NOT followed by a second slash)
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
  14. def get_pocket_articles(api, since=None, page=0):
  15. body, headers = api.get(
  16. state='archive',
  17. sort='oldest',
  18. since=since,
  19. count=COUNT_PER_PAGE,
  20. offset=page * COUNT_PER_PAGE,
  21. )
  22. articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
  23. returned_count = len(articles)
  24. yield from articles
  25. if returned_count == COUNT_PER_PAGE:
  26. yield from get_pocket_articles(api, since=since, page=page + 1)
  27. else:
  28. api.last_since = body['since']
  29. def link_from_article(article: dict, sources: list):
  30. url: str = article.get('resolved_url') or article['given_url']
  31. broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
  32. if broken_protocol:
  33. url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
  34. title = article.get('resolved_title') or article.get('given_title') or url
  35. return Link(
  36. url=url,
  37. timestamp=article['time_read'],
  38. title=title,
  39. tags=article.get('tags'),
  40. sources=sources
  41. )
  42. def write_since(username: str, since: str):
  43. if not API_DB_PATH.exists():
  44. atomic_write(API_DB_PATH, '')
  45. since_file = ConfigParser()
  46. since_file.optionxform = str
  47. since_file.read(API_DB_PATH)
  48. since_file[username] = {
  49. 'since': since
  50. }
  51. with open(API_DB_PATH, 'w+') as new:
  52. since_file.write(new)
  53. def read_since(username: str) -> Optional[str]:
  54. if not API_DB_PATH.exists():
  55. atomic_write(API_DB_PATH, '')
  56. config_file = ConfigParser()
  57. config_file.optionxform = str
  58. config_file.read(API_DB_PATH)
  59. return config_file.get(username, 'since', fallback=None)
  60. @enforce_types
  61. def should_parse_as_pocket_api(text: str) -> bool:
  62. return text.startswith('pocket://')
  63. @enforce_types
  64. def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
  65. """Parse bookmarks from the Pocket API"""
  66. from pocket import Pocket
  67. FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
  68. input_buffer.seek(0)
  69. pattern = re.compile(r"^pocket:\/\/(\w+)")
  70. for line in input_buffer:
  71. if should_parse_as_pocket_api(line):
  72. username = pattern.search(line).group(1)
  73. api = Pocket(FLAT_CONFIG.POCKET_CONSUMER_KEY, FLAT_CONFIG.POCKET_ACCESS_TOKENS[username])
  74. api.last_since = None
  75. for article in get_pocket_articles(api, since=read_since(username)):
  76. yield link_from_article(article, sources=[line])
  77. write_since(username, api.last_since)
# Module-level descriptors for this parser.
# NOTE(review): presumably read by the code that collects the available parsers — confirm against the importer.
KEY = 'pocket_api'
NAME = 'Pocket API'
PARSER = parse_pocket_api_export