generic_json.py 2.9 KB

__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime, timezone

from ..index.schema import Link
from archivebox.misc.util import (
    htmldecode,
    enforce_types,
)


# This gets used by generic_jsonl, too
def jsonObjectToLink(link: dict, source: str):
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    # example line
    # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}

    # Parse URL
    # 'href' matches the Pinboard example above; 'url'/'URL' presumably cover other exporters' field names
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

    # Parse the timestamp
    ts_str = str(datetime.now(timezone.utc).timestamp())
    if link.get('timestamp'):
        # chrome/ff histories use a very precise timestamp
        ts_str = str(link['timestamp'] / 1000000)
    elif link.get('time'):
        ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
    elif link.get('created_at'):
        ts_str = str(json_date(link['created_at']).timestamp())
    elif link.get('created'):
        ts_str = str(json_date(link['created']).timestamp())
    elif link.get('date'):
        ts_str = str(json_date(link['date']).timestamp())
    elif link.get('bookmarked'):
        ts_str = str(json_date(link['bookmarked']).timestamp())
    elif link.get('saved'):
        ts_str = str(json_date(link['saved']).timestamp())
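
    # Note: every string date field above goes through json_date, i.e. it is
    # assumed to be ISO-8601 with an explicit UTC offset (e.g. '2014-06-14T15:51:42Z');
    # entries with no recognized date field keep the "now" fallback set in ts_str above.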

    # Parse the title
    title = None
    if link.get('title'):
        title = link['title'].strip()
    elif link.get('description'):
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        title = link['name'].strip()

    # if we have a list, join it with commas
    tags = link.get('tags')
    if type(tags) == list:
        tags = ','.join(tags)
    elif type(tags) == str:
        # if there's no comma, assume it was space-separated
        if ',' not in tags:
            tags = tags.replace(' ', ',')
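
    # At this point tags is either None or a comma-separated string: list values
    # were joined above, and space-separated strings were converted.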

    return Link(
        url=htmldecode(url),
        timestamp=ts_str,
        title=htmldecode(title) or None,
        tags=htmldecode(tags),
        sources=[source],
    )


@enforce_types
def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""

    json_file.seek(0)

    links = json.load(json_file)
    if type(links) != list:
        raise Exception('JSON parser expects list of objects, maybe this is JSONL?')

    for link in links:
        if link:
            yield jsonObjectToLink(link, json_file.name)
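

# The module-level attributes below are presumably how ArchiveBox's parser
# registry discovers this parser (assumed convention, inferred from the naming).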
KEY = 'json'
NAME = 'Generic JSON'
PARSER = parse_generic_json_export
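

# Usage sketch (not part of the upstream module): feeds a minimal Pinboard-style
# export through parse_generic_json_export. A real temp file is used rather than
# io.StringIO because the parser records json_file.name as the Link source, and
# the sample fields below are illustrative assumptions. Running this requires the
# archivebox package to be importable (e.g. via `python -m archivebox.parsers.generic_json`).
if __name__ == '__main__':
    import tempfile

    sample = [{
        'href': 'https://example.com/post',
        'description': 'Example bookmark',
        'time': '2014-06-14T15:51:42Z',
        'tags': 'example bookmarks',
    }]
    with tempfile.NamedTemporaryFile('w+', suffix='.json') as f:
        json.dump(sample, f)  # parse_generic_json_export seeks back to 0 itself
        for parsed_link in parse_generic_json_export(f):
            print(parsed_link.url, parsed_link.timestamp, parsed_link.tags)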