author     Bryan Newbold <bnewbold@robocracy.org>    2022-04-07 14:44:01 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>    2022-04-07 14:44:01 -0700
commit     ede98644a89afd15d903061e0998dbd08851df6d (patch)
tree       17c54c5764adb2f5d67aa750174f635e0fb1cdc8 /python/tests/import_ingest.py
parent     2ef72e0c769e94401568ab42def30ddb5268fa98 (diff)
parent     0aaa2a839d7a14716ee1a84b730203a7953dc5e0 (diff)
download   fatcat-ede98644a89afd15d903061e0998dbd08851df6d.tar.gz
           fatcat-ede98644a89afd15d903061e0998dbd08851df6d.zip
Merge branch 'bnewbold-dataset-ingest-fixes'
Diffstat (limited to 'python/tests/import_ingest.py')
-rw-r--r--  python/tests/import_ingest.py  130
1 file changed, 127 insertions(+), 3 deletions(-)
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index d9e7d294..65549d1d 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -5,6 +5,8 @@ from fixtures import *
 
 from fatcat_tools.importers import (
     IngestFileResultImporter,
+    IngestFilesetFileResultImporter,
+    IngestFilesetResultImporter,
     IngestWebResultImporter,
     JsonLinePusher,
 )
@@ -20,6 +22,16 @@ def ingest_web_importer(api):
     yield IngestWebResultImporter(api)
 
 
+@pytest.fixture(scope="function")
+def ingest_fileset_importer(api):
+    yield IngestFilesetResultImporter(api)
+
+
+@pytest.fixture(scope="function")
+def ingest_fileset_file_importer(api):
+    yield IngestFilesetFileResultImporter(api)
+
+
 # TODO: use API to check that entities actually created...
 def test_ingest_importer_basic(ingest_importer):
     with open("tests/files/example_ingest.json", "r") as f:
@@ -58,7 +70,6 @@ def test_ingest_importer_xml(ingest_importer):
     with open("tests/files/example_ingest_xml.json", "r") as f:
         ingest_importer.bezerk_mode = True
         counts = JsonLinePusher(ingest_importer, f).run()
-    print(counts)
     assert counts["insert"] == 1
     assert counts["exists"] == 0
     assert counts["skip"] == 0
@@ -86,7 +97,6 @@ def test_ingest_importer_web(ingest_web_importer):
     with open("tests/files/example_ingest_html.json", "r") as f:
         ingest_web_importer.bezerk_mode = True
         counts = JsonLinePusher(ingest_web_importer, f).run()
-    print(counts)
     assert counts["insert"] == 1
     assert counts["exists"] == 0
     assert counts["skip"] == 0
@@ -139,7 +149,6 @@ def test_ingest_importer_stage(ingest_importer, api):
         ingest_importer.reset()
         ingest_importer.push_record(raw)
         counts = ingest_importer.finish()
-        print(counts)
         assert counts["total"] == 1
         assert counts[row["status"]] == 1
 
@@ -182,3 +191,118 @@ def test_ingest_dict_parse_old(ingest_importer):
         if u.rel == "webarchive":
             assert u.url.startswith("https://web.archive.org/")
     assert len(f.release_ids) == 1
+
+
+def test_ingest_fileset_dict_parse(ingest_fileset_importer):
+    with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+        raw = json.loads(f.readline())
+        fs = ingest_fileset_importer.parse_record(raw)
+        assert len(fs.manifest) == 3
+        assert fs.manifest[0].sha1 == "c0669e84e7b9052cc0f342e8ce7d31d59956326a"
+        assert fs.manifest[0].md5 == "caf4d9fc2c6ebd0d9251ac84e0b6b006"
+        assert fs.manifest[0].mimetype == "application/x-hdf"
+        assert fs.manifest[0].size == 16799750
+        assert fs.manifest[0].path == "N2 on food R_2010_03_25__10_53_27___4___1_features.hdf5"
+        assert (
+            fs.manifest[0].extra["original_url"]
+            == "https://zenodo.org/api/files/563203f6-6de5-46d9-b305-ba42604f2508/N2%20on%20food%20R_2010_03_25__10_53_27___4___1_features.hdf5"
+        )
+        assert len(fs.urls) == 2
+        matched = 0
+        for u in fs.urls:
+            if u.rel == "web":
+                assert u.url == "https://zenodo.org/record/1028059"
+                matched += 1
+            if u.rel == "archive-base":
+                assert u.url == "https://archive.org/download/zenodo.org-1028059/"
+                matched += 1
+        assert matched == 2
+        assert len(fs.release_ids) == 1
+
+
+def test_ingest_fileset_importer(ingest_fileset_importer):
+    last_index = ingest_fileset_importer.api.get_changelog(limit=1)[0].index
+    with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+        ingest_fileset_importer.bezerk_mode = True
+        counts = JsonLinePusher(ingest_fileset_importer, f).run()
+    assert counts["insert"] == 7
+    assert counts["exists"] == 0
+    assert counts["skip"] == 13
+    assert counts["skip-release-not-found"] == 13
+
+    # fetch most recent editgroup
+    change = ingest_fileset_importer.api.get_changelog_entry(index=last_index + 1)
+    eg = change.editgroup
+    assert eg.description
+    assert "filesets crawled from web" in eg.description.lower()
+    assert eg.extra["git_rev"]
+    assert "fatcat_tools.IngestFilesetResultImporter" in eg.extra["agent"]
+
+    # re-insert; should skip
+    with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+        ingest_fileset_importer.reset()
+        ingest_fileset_importer.bezerk_mode = False
+        counts = JsonLinePusher(ingest_fileset_importer, f).run()
+
+    assert counts["insert"] == 0
+    assert counts["exists"] == 7
+    assert counts["skip"] == 13
+    assert counts["skip-release-not-found"] == 13
+
+
+def test_ingest_fileset_file_dict_parse(ingest_fileset_file_importer):
+    with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
+        raw = json.loads(f.readline())
+        fe = ingest_fileset_file_importer.parse_record(raw)
+        assert fe.sha1 == "6fb020064da66bb7a666c17555611cf6820fc9ae"
+        assert fe.md5 == "dfc41b617564f99a12e6077a6208876f"
+        assert fe.sha256 == "2febad53ff0f163a18d7cbb913275bf99ed2544730cda191458837e2b0da9d18"
+        assert fe.mimetype == "image/tiff"
+        assert fe.size == 410631015
+        assert fe.extra["path"] == "NDVI_Diff_1990_2018_T06.tif"
+        assert len(fe.urls) == 2
+        matched = 0
+        for u in fe.urls:
+            if u.rel == "web":
+                assert u.url == "https://ndownloader.figshare.com/files/14460875"
+                matched += 1
+            if u.rel == "archive":
+                assert (
+                    u.url
+                    == "https://archive.org/download/springernature.figshare.com-7767695-v1/NDVI_Diff_1990_2018_T06.tif"
+                )
+                matched += 1
+        assert matched == 2
+        assert len(fe.release_ids) == 1
+
+
+def test_ingest_fileset_file_importer(ingest_fileset_file_importer):
+    """
+    Similar to the above, but specifically tests 'file'/'success-file' import pathway
+    """
+    last_index = ingest_fileset_file_importer.api.get_changelog(limit=1)[0].index
+    with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
+        ingest_fileset_file_importer.bezerk_mode = True
+        counts = JsonLinePusher(ingest_fileset_file_importer, f).run()
+    assert counts["insert"] == 16
+    assert counts["exists"] == 0
+    assert counts["skip"] == 4
+    assert counts["skip-bad-hashes"] == 4
+
+    # fetch most recent editgroup
+    change = ingest_fileset_file_importer.api.get_changelog_entry(index=last_index + 1)
+    eg = change.editgroup
+    assert eg.description
+    assert "crawled from web" in eg.description.lower()
+    assert eg.extra["git_rev"]
+    assert "fatcat_tools.IngestFilesetFileResultImporter" in eg.extra["agent"]
+
+    # re-insert; should skip
+    with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
+        ingest_fileset_file_importer.reset()
+        ingest_fileset_file_importer.bezerk_mode = False
+        counts = JsonLinePusher(ingest_fileset_file_importer, f).run()
+    assert counts["insert"] == 0
+    assert counts["exists"] == 16
+    assert counts["skip"] == 4
+    assert counts["skip-bad-hashes"] == 4
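
For context beyond the patch itself: the new tests all follow one pattern, in which an importer wraps a fatcat API client and JsonLinePusher feeds it one JSON-encoded ingest result per line. Below is a minimal sketch of that pattern using only names visible in the diff; the driver function, its default path, and the standalone `api` argument are illustrative assumptions (in the tests, `api` comes from the pytest fixtures).

    # Sketch only, not part of the commit: drive the new fileset importer
    # over a file of JSON lines, the same way the tests above do.
    from fatcat_tools.importers import IngestFilesetResultImporter, JsonLinePusher

    def run_fileset_import(api, path="tests/files/example_fileset_ingest_result.json"):
        # `api` is assumed to be a configured fatcat API client
        importer = IngestFilesetResultImporter(api)
        with open(path, "r") as f:
            # JsonLinePusher parses each line and pushes the record to the importer
            counts = JsonLinePusher(importer, f).run()
        # `counts` tallies outcomes per record, e.g. {"insert": 7, "skip": 13, ...},
        # matching the assertions in the tests above
        return counts

The paired test runs also illustrate the role of bezerk_mode: with it enabled, the first pass inserts entities; with it disabled, a second pass over the same input reports them all as "exists" instead.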