
Fix JSON parser by not always mangling the input

Rather than assuming the JSON file we are parsing has junk at the beginning
(which maybe only used to happen?), try parsing it as-is first, and then fall
back to trying again after skipping the first line.

Fixes #1347
jim winstead 1 year ago
parent
commit
178e676e0f

+ 15 - 4
archivebox/parsers/generic_json.py

@@ -18,9 +18,16 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
 
     json_file.seek(0)
 
-    # sometimes the first line is a comment or filepath, so we get everything after the first {
-    json_file_json_str = '{' + json_file.read().split('{', 1)[-1]
-    links = json.loads(json_file_json_str)
+    try:
+        links = json.load(json_file)
+    except json.decoder.JSONDecodeError:
+        # sometimes the first line is a comment or other junk, so try without
+        json_file.seek(0)
+        first_line = json_file.readline()
+        #print('      > Trying JSON parser without first line: "', first_line.strip(), '"', sep= '')
+        links = json.load(json_file)
+        # we may fail again, which means we really don't know what to do
+
     json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
 
     for link in links:
@@ -59,11 +66,15 @@ def parse_generic_json_export(json_file: IO[str], **_kwargs) -> Iterable[Link]:
             elif link.get('name'):
                 title = link['name'].strip()
 
+            tags = ''
+            if link.get('tags'):
+                tags = link.get('tags').replace(' ',',')
+
             yield Link(
                 url=htmldecode(url),
                 timestamp=ts_str,
                 title=htmldecode(title) or None,
-                tags=htmldecode(link.get('tags')) or '',
+                tags=htmldecode(tags),
                 sources=[json_file.name],
             )
 
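For context, the fallback above can be exercised outside of ArchiveBox. The following is a minimal, self-contained sketch (the function name load_links_json and the inline sample data are made up for illustration; the real parser lives in archivebox/parsers/generic_json.py): it tries json.load() on the file as-is, retries once after discarding the first line, and normalizes space-separated tags into the comma-separated form yielded for each link.

import json
from io import StringIO
from typing import IO, Iterable

def load_links_json(json_file: IO[str]) -> Iterable[dict]:
    """Hypothetical stand-in for parse_generic_json_export, for illustration only."""
    json_file.seek(0)
    try:
        # happy path: the export is valid JSON from the first byte
        links = json.load(json_file)
    except json.decoder.JSONDecodeError:
        # retry after skipping the first line (comment, filepath, etc.);
        # a second failure propagates, since there is nothing else to try
        json_file.seek(0)
        json_file.readline()
        links = json.load(json_file)

    for link in links:
        # normalize space-separated tags into a comma-separated string
        tags = (link.get('tags') or '').replace(' ', ',')
        yield {**link, 'tags': tags}

if __name__ == '__main__':
    sample = 'junk first line\n[{"href": "http://example.com", "tags": "Tag1 Tag2"}]'
    for link in load_links_json(StringIO(sample)):
        print(link['href'], link['tags'])  # -> http://example.com Tag1,Tag2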

+ 1 - 0
tests/mock_server/templates/example.json

@@ -0,0 +1 @@
+[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]

+ 2 - 0
tests/mock_server/templates/example.json.bad

@@ -0,0 +1,2 @@
+this line would cause problems but --parser=json will actually skip it
+[{"href":"http://127.0.0.1:8080/static/example.com.html","description":"Example","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"Tag1 Tag2","trap":"http://www.example.com/should-not-exist"}]

+ 50 - 0
tests/test_add.py

@@ -91,3 +91,53 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
 
     assert (archived_item_path / "warc").exists()
     assert not (archived_item_path / "singlefile.html").exists()
+
+def test_json(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.json', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=json"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.example.com/should-not-exist" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
+
+def test_json_with_leading_garbage(tmp_path, process, disable_extractors_dict):
+    with open('../../mock_server/templates/example.json.bad', 'r', encoding='utf-8') as f:
+        arg_process = subprocess.run(
+            ["archivebox", "add", "--index-only", "--parser=json"],
+            stdin=f,
+            capture_output=True,
+            env=disable_extractors_dict,
+        )
+
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    urls = c.execute("SELECT url from core_snapshot").fetchall()
+    tags = c.execute("SELECT name from core_tag").fetchall()
+    conn.commit()
+    conn.close()
+
+    urls = list(map(lambda x: x[0], urls))
+    assert "http://127.0.0.1:8080/static/example.com.html" in urls
+    # if the following URL appears, we must have fallen back to another parser
+    assert not "http://www.example.com/should-not-exist" in urls
+
+    tags = list(map(lambda x: x[0], tags))
+    assert "Tag1" in tags
+    assert "Tag2" in tags
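As a side note, the tag assertions above depend on the parser handing ArchiveBox a comma-separated tags string. Assuming the tags field is split on commas downstream (an assumption about ArchiveBox internals, not shown in this diff), the replace(' ', ',') added above is what turns the space-separated "Tag1 Tag2" into two separate tags:

raw = "Tag1 Tag2"
assert raw.replace(' ', ',').split(',') == ["Tag1", "Tag2"]  # new behavior: two tags
assert raw.split(',') == ["Tag1 Tag2"]                       # old behavior: one malformed tag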