pocket_api.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. __package__ = 'archivebox.parsers'
  2. import re
  3. from typing import IO, Iterable, Optional
  4. from datetime import datetime
  5. from configparser import ConfigParser
  6. from pathlib import Path
  7. from pocket import Pocket
  8. import requests
  9. from ..index.schema import Link
  10. from ..util import (
  11. enforce_types,
  12. )
  13. from ..config import (
  14. SOURCES_DIR
  15. )
  16. _COUNT_PER_PAGE = 500
  17. _API_DB_PATH = Path(SOURCES_DIR) / 'pocket_api.db'
  18. # search for broken protocols that sometimes come from the Pocket API
  19. _BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
  20. def get_pocket_articles(api: Pocket, since=None, page=0):
  21. body, headers = api.get(
  22. state='archive',
  23. sort='oldest',
  24. since=since,
  25. count=_COUNT_PER_PAGE,
  26. offset=page * _COUNT_PER_PAGE,
  27. )
  28. articles = body['list'].values() if isinstance(body['list'], dict) else body['list']
  29. returned_count = len(articles)
  30. yield from articles
  31. if returned_count == _COUNT_PER_PAGE:
  32. yield from get_pocket_articles(api, since=since, page=page + 1)
  33. else:
  34. api.last_since = body['since']
  35. def link_from_article(article: dict, sources: list):
  36. url: str = article['resolved_url'] or article['given_url']
  37. broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
  38. if broken_protocol:
  39. url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
  40. title = article['resolved_title'] or article['given_title'] or url
  41. return Link(
  42. url=url,
  43. timestamp=article['time_read'],
  44. title=title,
  45. tags=article.get('tags'),
  46. sources=sources
  47. )
  48. def write_since(username: str, since: str):
  49. from ..system import atomic_write
  50. if not _API_DB_PATH.exists():
  51. atomic_write(_API_DB_PATH, '')
  52. since_file = ConfigParser()
  53. since_file.optionxform = str
  54. since_file.read(_API_DB_PATH)
  55. since_file[username] = {
  56. 'since': since
  57. }
  58. with open(_API_DB_PATH, 'w+') as new:
  59. since_file.write(new)
  60. def read_since(username: str) -> Optional[str]:
  61. from ..system import atomic_write
  62. if not _API_DB_PATH.exists():
  63. atomic_write(_API_DB_PATH, '')
  64. config_file = ConfigParser()
  65. config_file.optionxform = str
  66. config_file.read(_API_DB_PATH)
  67. return config_file.get(username, 'since', fallback=None)
  68. @enforce_types
  69. def should_parse_as_pocket_api(text: str) -> bool:
  70. return text.startswith('pocket://')
  71. @enforce_types
  72. def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
  73. """Parse bookmarks from the Pocket API"""
  74. input_buffer.seek(0)
  75. pattern = re.compile("^pocket:\/\/(\w+)")
  76. for line in input_buffer:
  77. if should_parse_as_pocket_api(line):
  78. from ..config import (
  79. POCKET_CONSUMER_KEY,
  80. POCKET_ACCESS_TOKENS,
  81. )
  82. username = pattern.search(line).group(1)
  83. api = Pocket(POCKET_CONSUMER_KEY, POCKET_ACCESS_TOKENS[username])
  84. api.last_since = None
  85. for article in get_pocket_articles(api, since=read_since(username)):
  86. yield link_from_article(article, sources=[line])
  87. write_since(username, api.last_since)