diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-15 14:13:34 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-15 14:13:34 -0800 |
commit | 689da76d1c759d6368d760b4a1fa942e16095a40 (patch) | |
tree | ca78cd6841875b3c7d55d046b3c7a206e604b60f | |
parent | a02d51650bb5a3165ec89e822f43ff98807d01c3 (diff) | |
download | fatcat-689da76d1c759d6368d760b4a1fa942e16095a40.tar.gz fatcat-689da76d1c759d6368d760b4a1fa942e16095a40.zip |
ingest: improve tests, support old ingest results
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 15 | ||||
-rw-r--r-- | python/tests/files/example_ingest.json | 2 | ||||
-rw-r--r-- | python/tests/files/example_ingest.old.json | 1 | ||||
-rw-r--r-- | python/tests/import_ingest.py | 16 |
4 files changed, 30 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 3d391bd8..82a33aaa 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -112,9 +112,18 @@ class IngestFileResultImporter(EntityImporter): terminal = row.get('terminal') if not terminal: - # TODO: support archive.org hits? - self.counts['skip-no-terminal'] += 1 - return None + # support old cdx-only ingest results + cdx = row.get('cdx') + if not cdx: + # TODO: support archive.org hits? + self.counts['skip-no-terminal'] += 1 + return None + else: + terminal = { + 'terminal_url': cdx['url'], + 'terminal_dt': cdx['datetime'], + 'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'), + } # work around old schema if not 'terminal_url' in terminal: diff --git a/python/tests/files/example_ingest.json b/python/tests/files/example_ingest.json index 005d8742..18804c1b 100644 --- a/python/tests/files/example_ingest.json +++ b/python/tests/files/example_ingest.json @@ -1 +1 @@ -{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } } +{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } } diff --git a/python/tests/files/example_ingest.old.json b/python/tests/files/example_ingest.old.json new file mode 100644 index 00000000..005d8742 --- /dev/null +++ b/python/tests/files/example_ingest.old.json @@ -0,0 +1 @@ +{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } } diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index 7c0a85cd..5089d99e 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -56,3 +56,19 @@ def test_ingest_dict_parse(ingest_importer): if u.rel == "webarchive": assert u.url.startswith("https://web.archive.org/") assert len(f.release_ids) == 1 + +def test_ingest_dict_parse_old(ingest_importer): + with open('tests/files/example_ingest.old.json', 'r') as f: + raw = json.loads(f.readline()) + f = ingest_importer.parse_record(raw) + assert f.sha1 == "00242a192acc258bdfdb151943419437f440c313" + assert f.md5 == "f4de91152c7ab9fdc2a128f962faebff" + assert f.mimetype == "application/pdf" + assert f.size == 255629 + assert len(f.urls) == 2 + for u in f.urls: + if u.rel == "web": + assert u.url.startswith("http://journals.plos.org") + if u.rel == "webarchive": + assert u.url.startswith("https://web.archive.org/") + assert len(f.release_ids) == 1 |