generic_json.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. __package__ = 'archivebox.parsers'
  2. import json
  3. from typing import IO, Iterable
  4. from datetime import datetime, timezone
  5. from ..index.schema import Link
  6. from ..util import (
  7. htmldecode,
  8. enforce_types,
  9. )
  10. @enforce_types
  11. def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
  12. """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
  13. json_file.seek(0)
  14. # sometimes the first line is a comment or filepath, so we get everything after the first {
  15. json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
  16. links = json.loads(json_file_json_str)
  17. json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
  18. for link in links:
  19. # example line
  20. # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
  21. if link:
  22. # Parse URL
  23. url = link.get('href') or link.get('url') or link.get('URL')
  24. if not url:
  25. raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
  26. # Parse the timestamp
  27. ts_str = str(datetime.now(timezone.utc).timestamp())
  28. if link.get('timestamp'):
  29. # chrome/ff histories use a very precise timestamp
  30. ts_str = str(link['timestamp'] / 10000000)
  31. elif link.get('time'):
  32. ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
  33. elif link.get('created_at'):
  34. ts_str = str(json_date(link['created_at']).timestamp())
  35. elif link.get('created'):
  36. ts_str = str(json_date(link['created']).timestamp())
  37. elif link.get('date'):
  38. ts_str = str(json_date(link['date']).timestamp())
  39. elif link.get('bookmarked'):
  40. ts_str = str(json_date(link['bookmarked']).timestamp())
  41. elif link.get('saved'):
  42. ts_str = str(json_date(link['saved']).timestamp())
  43. # Parse the title
  44. title = None
  45. if link.get('title'):
  46. title = link['title'].strip()
  47. elif link.get('description'):
  48. title = link['description'].replace(' — Readability', '').strip()
  49. elif link.get('name'):
  50. title = link['name'].strip()
  51. yield Link(
  52. url=htmldecode(url),
  53. timestamp=ts_str,
  54. title=htmldecode(title) or None,
  55. tags=htmldecode(link.get('tags')) or '',
  56. sources=[json_file.name],
  57. )
  58. KEY = 'json'
  59. NAME = 'Generic JSON'
  60. PARSER = parse_generic_json_export