about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/ingest.py 15
-rw-r--r-- python/tests/files/example_ingest.json 2
-rw-r--r-- python/tests/files/example_ingest.old.json 1
-rw-r--r-- python/tests/import_ingest.py 16
4 files changed, 30 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 3d391bd8..82a33aaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -112,9 +112,18 @@ class IngestFileResultImporter(EntityImporter):
terminal = row.get('terminal')
if not terminal:
- # TODO: support archive.org hits?
- self.counts['skip-no-terminal'] += 1
- return None
+ # support old cdx-only ingest results
+ cdx = row.get('cdx')
+ if not cdx:
+ # TODO: support archive.org hits?
+ self.counts['skip-no-terminal'] += 1
+ return None
+ else:
+ terminal = {
+ 'terminal_url': cdx['url'],
+ 'terminal_dt': cdx['datetime'],
+ 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
+ }
# work around old schema
if not 'terminal_url' in terminal:
diff --git a/python/tests/files/example_ingest.json b/python/tests/files/example_ingest.json
index 005d8742..18804c1b 100644
--- a/python/tests/files/example_ingest.json
+++ b/python/tests/files/example_ingest.json
@@ -1 +1 @@
-{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } }
+{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } }
diff --git a/python/tests/files/example_ingest.old.json b/python/tests/files/example_ingest.old.json
new file mode 100644
index 00000000..005d8742
--- /dev/null
+++ b/python/tests/files/example_ingest.old.json
@@ -0,0 +1 @@
+{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } }
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index 7c0a85cd..5089d99e 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -56,3 +56,19 @@ def test_ingest_dict_parse(ingest_importer):
if u.rel == "webarchive":
assert u.url.startswith("https://web.archive.org/")
assert len(f.release_ids) == 1
+
+def test_ingest_dict_parse_old(ingest_importer):
+ with open('tests/files/example_ingest.old.json', 'r') as f:
+ raw = json.loads(f.readline())
+ f = ingest_importer.parse_record(raw)
+ assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313"
+ assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff"
+ assert f.mimetype == "application/pdf"
+ assert f.size == 255629
+ assert len(f.urls) == 2
+ for u in f.urls:
+ if u.rel == "web":
+ assert u.url.startswith("http://journals.plos.org")
+ if u.rel == "webarchive":
+ assert u.url.startswith("https://web.archive.org/")
+ assert len(f.release_ids) == 1