generic_json.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. __package__ = 'archivebox.parsers'
  2. import json
  3. from typing import IO, Iterable
  4. from datetime import datetime
  5. from ..index.schema import Link
  6. from ..util import (
  7. htmldecode,
  8. enforce_types,
  9. )
  10. @enforce_types
  11. def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
  12. """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
  13. json_file.seek(0)
  14. links = json.load(json_file)
  15. json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
  16. for link in links:
  17. # example line
  18. # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
  19. if link:
  20. # Parse URL
  21. url = link.get('href') or link.get('url') or link.get('URL')
  22. if not url:
  23. raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
  24. # Parse the timestamp
  25. ts_str = str(datetime.now().timestamp())
  26. if link.get('timestamp'):
  27. # chrome/ff histories use a very precise timestamp
  28. ts_str = str(link['timestamp'] / 10000000)
  29. elif link.get('time'):
  30. ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
  31. elif link.get('created_at'):
  32. ts_str = str(json_date(link['created_at']).timestamp())
  33. elif link.get('created'):
  34. ts_str = str(json_date(link['created']).timestamp())
  35. elif link.get('date'):
  36. ts_str = str(json_date(link['date']).timestamp())
  37. elif link.get('bookmarked'):
  38. ts_str = str(json_date(link['bookmarked']).timestamp())
  39. elif link.get('saved'):
  40. ts_str = str(json_date(link['saved']).timestamp())
  41. # Parse the title
  42. title = None
  43. if link.get('title'):
  44. title = link['title'].strip()
  45. elif link.get('description'):
  46. title = link['description'].replace(' — Readability', '').strip()
  47. elif link.get('name'):
  48. title = link['name'].strip()
  49. yield Link(
  50. url=htmldecode(url),
  51. timestamp=ts_str,
  52. title=htmldecode(title) or None,
  53. tags=htmldecode(link.get('tags')) or '',
  54. sources=[json_file.name],
  55. )