diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-03-23 17:55:54 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-03-23 18:38:56 -0700 |
commit | 515b5ecc6e75aae834958d74883426230532f10d (patch) | |
tree | 1d5e00696bf5272e7bda7c4ff275d9e8acaaa479 /python/tests/import_ingest.py | |
parent | ea6ccd227e0f62f5f9e7a66ba8bc90b18a2ca097 (diff) | |
download | fatcat-515b5ecc6e75aae834958d74883426230532f10d.tar.gz fatcat-515b5ecc6e75aae834958d74883426230532f10d.zip |
ingest fileset fixes, and some test coverage
Diffstat (limited to 'python/tests/import_ingest.py')
-rw-r--r-- | python/tests/import_ingest.py | 63 |
1 file changed, 60 insertions, 3 deletions
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py index d9e7d294..d86d6d7b 100644 --- a/python/tests/import_ingest.py +++ b/python/tests/import_ingest.py @@ -5,6 +5,7 @@ from fixtures import * from fatcat_tools.importers import ( IngestFileResultImporter, + IngestFilesetResultImporter, IngestWebResultImporter, JsonLinePusher, ) @@ -20,6 +21,11 @@ def ingest_web_importer(api): yield IngestWebResultImporter(api) +@pytest.fixture(scope="function") +def ingest_fileset_importer(api): + yield IngestFilesetResultImporter(api) + + # TODO: use API to check that entities actually created... def test_ingest_importer_basic(ingest_importer): with open("tests/files/example_ingest.json", "r") as f: @@ -58,7 +64,6 @@ def test_ingest_importer_xml(ingest_importer): with open("tests/files/example_ingest_xml.json", "r") as f: ingest_importer.bezerk_mode = True counts = JsonLinePusher(ingest_importer, f).run() - print(counts) assert counts["insert"] == 1 assert counts["exists"] == 0 assert counts["skip"] == 0 @@ -86,7 +91,6 @@ def test_ingest_importer_web(ingest_web_importer): with open("tests/files/example_ingest_html.json", "r") as f: ingest_web_importer.bezerk_mode = True counts = JsonLinePusher(ingest_web_importer, f).run() - print(counts) assert counts["insert"] == 1 assert counts["exists"] == 0 assert counts["skip"] == 0 @@ -139,7 +143,6 @@ def test_ingest_importer_stage(ingest_importer, api): ingest_importer.reset() ingest_importer.push_record(raw) counts = ingest_importer.finish() - print(counts) assert counts["total"] == 1 assert counts[row["status"]] == 1 @@ -182,3 +185,57 @@ def test_ingest_dict_parse_old(ingest_importer): if u.rel == "webarchive": assert u.url.startswith("https://web.archive.org/") assert len(f.release_ids) == 1 + + +def test_ingest_fileset_dict_parse(ingest_fileset_importer): + with open("tests/files/example_fileset_ingest_result.json", "r") as f: + raw = json.loads(f.readline()) + fs = ingest_fileset_importer.parse_record(raw) + 
assert len(fs.manifest) == 3 + assert fs.manifest[0].sha1 == "c0669e84e7b9052cc0f342e8ce7d31d59956326a" + assert fs.manifest[0].md5 == "caf4d9fc2c6ebd0d9251ac84e0b6b006" + assert fs.manifest[0].extra["mimetype"] == "application/x-hdf" + assert fs.manifest[0].size == 16799750 + assert fs.manifest[0].path == "N2 on food R_2010_03_25__10_53_27___4___1_features.hdf5" + assert ( + fs.manifest[0].extra["original_url"] + == "https://zenodo.org/api/files/563203f6-6de5-46d9-b305-ba42604f2508/N2%20on%20food%20R_2010_03_25__10_53_27___4___1_features.hdf5" + ) + assert len(fs.urls) == 2 + for u in fs.urls: + if u.rel == "web": + assert u.url == "https://zenodo.org/record/1028059" + if u.rel == "archive-base": + assert u.url == "https://archive.org/download/zenodo.org-1028059/" + assert len(fs.release_ids) == 1 + + +def test_ingest_fileset_importer(ingest_fileset_importer): + last_index = ingest_fileset_importer.api.get_changelog(limit=1)[0].index + with open("tests/files/example_fileset_ingest_result.json", "r") as f: + ingest_fileset_importer.bezerk_mode = True + counts = JsonLinePusher(ingest_fileset_importer, f).run() + assert counts["insert"] == 7 + assert counts["exists"] == 0 + assert counts["skip"] == 13 + assert counts["skip-release-not-found"] == 13 + + # fetch most recent editgroup + change = ingest_fileset_importer.api.get_changelog_entry(index=last_index + 1) + eg = change.editgroup + assert eg.description + assert "filesets crawled from web" in eg.description.lower() + assert eg.extra["git_rev"] + assert "fatcat_tools.IngestFilesetResultImporter" in eg.extra["agent"] + + # re-insert; should skip + with open("tests/files/example_fileset_ingest_result.json", "r") as f: + ingest_fileset_importer.reset() + ingest_fileset_importer.bezerk_mode = False + counts = JsonLinePusher(ingest_fileset_importer, f).run() + + assert counts["insert"] == 0 + assert counts["exists"] == 0 + assert counts["skip"] == 20 + assert counts["skip-release-not-found"] == 13 + assert 
counts["skip-release-has-fileset"] == 7 |