readwise_reader_api.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. __package__ = "archivebox.parsers"
  2. import re
  3. import requests
  4. from datetime import datetime
  5. from typing import IO, Iterable, Optional
  6. from configparser import ConfigParser
  7. from pathlib import Path
  8. from ..index.schema import Link
  9. from ..util import enforce_types
  10. from ..system import atomic_write
  11. from ..config import (
  12. SOURCES_DIR,
  13. READWISE_READER_TOKENS,
  14. )
  15. API_DB_PATH = Path(SOURCES_DIR) / "readwise_reader_api.db"
  16. class ReadwiseReaderAPI:
  17. cursor: Optional[str]
  18. def __init__(self, api_token, cursor=None) -> None:
  19. self.api_token = api_token
  20. self.cursor = cursor
  21. def get_archive(self):
  22. response = requests.get(
  23. url="https://readwise.io/api/v3/list/",
  24. headers={"Authorization": "Token s71gNtiNDWquEvlJFFUyDU10ao8fn99lGyNryvyllQcDSnrd7X"},
  25. params={
  26. "location": "archive",
  27. "pageCursor": self.cursor,
  28. }
  29. )
  30. response.raise_for_status()
  31. return response
  32. def get_readwise_reader_articles(api: ReadwiseReaderAPI):
  33. response = api.get_archive()
  34. body = response.json()
  35. articles = body["results"]
  36. yield from articles
  37. if body['nextPageCursor']:
  38. api.cursor = body["nextPageCursor"]
  39. yield from get_readwise_reader_articles(api)
  40. def link_from_article(article: dict, sources: list):
  41. url: str = article['source_url']
  42. title = article["title"] or url
  43. timestamp = datetime.fromisoformat(article['updated_at']).timestamp()
  44. return Link(
  45. url=url,
  46. timestamp=str(timestamp),
  47. title=title,
  48. tags="",
  49. sources=sources,
  50. )
  51. def write_cursor(username: str, since: str):
  52. if not API_DB_PATH.exists():
  53. atomic_write(API_DB_PATH, "")
  54. since_file = ConfigParser()
  55. since_file.optionxform = str
  56. since_file.read(API_DB_PATH)
  57. since_file[username] = {"since": since}
  58. with open(API_DB_PATH, "w+") as new:
  59. since_file.write(new)
  60. def read_cursor(username: str) -> Optional[str]:
  61. if not API_DB_PATH.exists():
  62. atomic_write(API_DB_PATH, "")
  63. config_file = ConfigParser()
  64. config_file.optionxform = str
  65. config_file.read(API_DB_PATH)
  66. return config_file.get(username, "since", fallback=None)
  67. @enforce_types
  68. def should_parse_as_readwise_reader_api(text: str) -> bool:
  69. return text.startswith("readwise-reader://")
  70. @enforce_types
  71. def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
  72. """Parse bookmarks from the Readwise Reader API"""
  73. input_buffer.seek(0)
  74. pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
  75. for line in input_buffer:
  76. if should_parse_as_readwise_reader_api(line):
  77. username = pattern.search(line).group(1)
  78. api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
  79. for article in get_readwise_reader_articles(api):
  80. yield link_from_article(article, sources=[line])
  81. if api.cursor:
  82. write_cursor(username, api.cursor)
  83. KEY = "readwise_reader_api"
  84. NAME = "Readwise Reader API"
  85. PARSER = parse_readwise_reader_api_export