generic_json.py 3.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
# NOTE(review): presumably set so relative imports resolve when this module is
# loaded outside its package context — confirm against the package's loader.
__package__ = 'archivebox.parsers'

import json

from typing import IO, Iterable
from datetime import datetime, timezone

# Project-local imports: Link is the parsed-bookmark record type,
# htmldecode/enforce_types are shared helpers from the package's util module.
from ..index.schema import Link
from ..util import (
    htmldecode,
    enforce_types,
)
  10. @enforce_types
  11. def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
  12. """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
  13. json_file.seek(0)
  14. try:
  15. links = json.load(json_file)
  16. except json.decoder.JSONDecodeError:
  17. # sometimes the first line is a comment or other junk, so try without
  18. json_file.seek(0)
  19. first_line = json_file.readline()
  20. #print(' > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
  21. links = json.load(json_file)
  22. # we may fail again, which means we really don't know what to do
  23. json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
  24. for link in links:
  25. # example line
  26. # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
  27. if link:
  28. # Parse URL
  29. url = link.get('href') or link.get('url') or link.get('URL')
  30. if not url:
  31. raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
  32. # Parse the timestamp
  33. ts_str = str(datetime.now(timezone.utc).timestamp())
  34. if link.get('timestamp'):
  35. # chrome/ff histories use a very precise timestamp
  36. ts_str = str(link['timestamp'] / 10000000)
  37. elif link.get('time'):
  38. ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
  39. elif link.get('created_at'):
  40. ts_str = str(json_date(link['created_at']).timestamp())
  41. elif link.get('created'):
  42. ts_str = str(json_date(link['created']).timestamp())
  43. elif link.get('date'):
  44. ts_str = str(json_date(link['date']).timestamp())
  45. elif link.get('bookmarked'):
  46. ts_str = str(json_date(link['bookmarked']).timestamp())
  47. elif link.get('saved'):
  48. ts_str = str(json_date(link['saved']).timestamp())
  49. # Parse the title
  50. title = None
  51. if link.get('title'):
  52. title = link['title'].strip()
  53. elif link.get('description'):
  54. title = link['description'].replace(' — Readability', '').strip()
  55. elif link.get('name'):
  56. title = link['name'].strip()
  57. # if we have a list, join it with commas
  58. tags = link.get('tags')
  59. if type(tags) == list:
  60. tags = ','.join(tags)
  61. elif type(tags) == str:
  62. # if there's no comma, assume it was space-separated
  63. if ',' not in tags:
  64. tags = tags.replace(' ', ',')
  65. yield Link(
  66. url=htmldecode(url),
  67. timestamp=ts_str,
  68. title=htmldecode(title) or None,
  69. tags=htmldecode(tags),
  70. sources=[json_file.name],
  71. )
# Parser registration constants — presumably read by the package's parser
# dispatch machinery (verify against the parsers loader elsewhere in the package).
KEY = 'json'                        # short machine-readable identifier
NAME = 'Generic JSON'               # human-readable display name
PARSER = parse_generic_json_export  # entry-point callable for this format