pocket_api.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. __package__ = 'archivebox.parsers'
  2. import re
  3. from typing import IO, Iterable, Optional
  4. from configparser import ConfigParser
  5. from pathlib import Path
  6. from ..vendor.pocket import Pocket
  7. from ..index.schema import Link
  8. from ..util import enforce_types
  9. from ..system import atomic_write
  10. from ..config import (
  11. SOURCES_DIR,
  12. POCKET_CONSUMER_KEY,
  13. POCKET_ACCESS_TOKENS,
  14. )
  # Page size: sent as `count` and used to compute `offset` for each API call.
  COUNT_PER_PAGE = 500

  # State file under SOURCES_DIR; stores one `since` value per username section.
  API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'

  # search for broken protocols that sometimes come from the Pocket API
  # (matches a leading "http:/" or "https:/" that is NOT followed by a second slash)
  _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
  def get_pocket_articles(api: Pocket, since=None, page=0):
      """Yield archived articles from the Pocket API, following pagination.

      Pages of COUNT_PER_PAGE items are requested (state='archive',
      sort='oldest') starting at ``page``. Fetching stops at the first page
      whose item count differs from COUNT_PER_PAGE; at that point the
      response's ``since`` value is stored on ``api.last_since``.

      Args:
          api: Pocket client; its ``get`` must return a ``(body, headers)`` pair.
          since: forwarded unchanged as the ``since`` filter of every request.
          page: zero-based index of the first page to fetch.

      Yields:
          Each article object found in ``body['list']``.
      """
      # Iterate instead of recursing once per page: the nesting depth no longer
      # grows with the number of pages, so large accounts cannot hit the
      # interpreter's recursion limit.
      while True:
          body, _headers = api.get(
              state='archive',
              sort='oldest',
              since=since,
              count=COUNT_PER_PAGE,
              offset=page * COUNT_PER_PAGE,
          )

          listing = body['list']
          # `list` is accepted either as a mapping (use its values) or as a
          # plain sequence of articles.
          articles = listing.values() if isinstance(listing, dict) else listing
          yield from articles

          if len(articles) != COUNT_PER_PAGE:
              # Short (or empty) page: this was the last one.
              api.last_since = body['since']
              return

          page += 1
  def link_from_article(article: dict, sources: list):
      """Build a Link from one Pocket article dict.

      Args:
          article: article record; must contain 'given_url' and 'time_read'.
              'resolved_url', 'resolved_title', 'given_title' and 'tags' are
              optional.
          sources: passed through as the Link's ``sources``.

      Returns:
          A Link whose url is the resolved URL (falling back to the given URL)
          and whose title falls back to the given title and then to the URL.

      Raises:
          KeyError: if 'given_url' is needed but missing, or 'time_read' is missing.
      """
      # The resolved_* keys may be absent or empty; .get() avoids a KeyError
      # and lets the given_* values take over.
      url: str = article.get('resolved_url') or article['given_url']

      # Repair a broken leading scheme ("http:/x" -> "http://x"). The pattern is
      # anchored at the start, so unlike str.replace() this cannot corrupt a
      # well-formed "http://" that appears later in the URL (e.g. in a query).
      url = _BROKEN_PROTOCOL_RE.sub(r'\1://', url, count=1)

      title = article.get('resolved_title') or article.get('given_title') or url

      return Link(
          url=url,
          timestamp=article['time_read'],
          title=title,
          tags=article.get('tags'),
          sources=sources,
      )
  def write_since(username: str, since: str):
      """Store ``since`` under the ``username`` section of API_DB_PATH.

      The state file is created empty first if it does not exist, then its
      current contents are loaded, the user's section is replaced, and the
      whole file is written back.
      """
      db_missing = not API_DB_PATH.exists()
      if db_missing:
          atomic_write(API_DB_PATH, '')

      state = ConfigParser()
      # Use option names verbatim instead of the parser's default lower-casing.
      state.optionxform = str
      state.read(API_DB_PATH)

      state[username] = {'since': since}

      with open(API_DB_PATH, 'w+') as db_file:
          state.write(db_file)
  def read_since(username: str) -> Optional[str]:
      """Return the ``since`` value saved for ``username``, or None if absent.

      As a side effect, an empty state file is created at API_DB_PATH when
      none exists yet.
      """
      db_missing = not API_DB_PATH.exists()
      if db_missing:
          atomic_write(API_DB_PATH, '')

      state = ConfigParser()
      # Use option names verbatim instead of the parser's default lower-casing.
      state.optionxform = str
      state.read(API_DB_PATH)

      saved = state.get(username, 'since', fallback=None)
      return saved
  @enforce_types
  def should_parse_as_pocket_api(text: str) -> bool:
      """Return True when ``text`` begins with the ``pocket://`` scheme."""
      scheme = 'pocket://'
      return text.startswith(scheme)
  @enforce_types
  def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
      """Parse bookmarks from the Pocket API

      Each line of ``input_buffer`` that starts with ``pocket://<username>``
      triggers a fetch of that user's archived articles, starting from the
      ``since`` value saved for the user. Once all of a user's articles have
      been yielded, the new ``since`` value is saved.

      Args:
          input_buffer: seekable text stream; it is rewound before reading.
          **_kwargs: accepted and ignored.

      Yields:
          One Link per article, with the triggering line as its only source.

      Raises:
          KeyError: if POCKET_ACCESS_TOKENS has no entry for a username.
      """
      input_buffer.seek(0)
      pattern = re.compile(r"^pocket:\/\/(\w+)")

      for line in input_buffer:
          if not should_parse_as_pocket_api(line):
              continue

          matched = pattern.search(line)
          if matched is None:
              # "pocket://" with no word-character username after it: there is
              # nobody to fetch for, so skip the line instead of crashing on
              # None.group().
              continue
          username = matched.group(1)

          api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
          # Filled in by get_pocket_articles once its last page has been consumed.
          api.last_since = None

          for article in get_pocket_articles(api, since=read_since(username)):
              yield link_from_article(article, sources=[line])

          write_since(username, api.last_since)
  # Module-level descriptors for this parser: short key, display name, entry point.
  # NOTE(review): presumably read by the parser registry elsewhere in the package — confirm.
  KEY, NAME = 'pocket_api', 'Pocket API'
  PARSER = parse_pocket_api_export