author    Bryan Newbold <bnewbold@robocracy.org>    2022-04-07 14:44:01 -0700
committer Bryan Newbold <bnewbold@robocracy.org>    2022-04-07 14:44:01 -0700
commit    ede98644a89afd15d903061e0998dbd08851df6d (patch)
tree      17c54c5764adb2f5d67aa750174f635e0fb1cdc8 /python/tests/import_ingest.py
parent    2ef72e0c769e94401568ab42def30ddb5268fa98 (diff)
parent    0aaa2a839d7a14716ee1a84b730203a7953dc5e0 (diff)
Merge branch 'bnewbold-dataset-ingest-fixes'
Diffstat (limited to 'python/tests/import_ingest.py')
-rw-r--r--    python/tests/import_ingest.py    130
1 file changed, 127 insertions(+), 3 deletions(-)
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index d9e7d294..65549d1d 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -5,6 +5,8 @@ from fixtures import *
 
 from fatcat_tools.importers import (
     IngestFileResultImporter,
+    IngestFilesetFileResultImporter,
+    IngestFilesetResultImporter,
     IngestWebResultImporter,
     JsonLinePusher,
 )
@@ -20,6 +22,16 @@ def ingest_web_importer(api):
     yield IngestWebResultImporter(api)
 
 
+@pytest.fixture(scope="function")
+def ingest_fileset_importer(api):
+    yield IngestFilesetResultImporter(api)
+
+
+@pytest.fixture(scope="function")
+def ingest_fileset_file_importer(api):
+    yield IngestFilesetFileResultImporter(api)
+
+
 # TODO: use API to check that entities actually created...
 def test_ingest_importer_basic(ingest_importer):
     with open("tests/files/example_ingest.json", "r") as f:
@@ -58,7 +70,6 @@ def test_ingest_importer_xml(ingest_importer):
     with open("tests/files/example_ingest_xml.json", "r") as f:
         ingest_importer.bezerk_mode = True
         counts = JsonLinePusher(ingest_importer, f).run()
-    print(counts)
     assert counts["insert"] == 1
     assert counts["exists"] == 0
     assert counts["skip"] == 0
@@ -86,7 +97,6 @@ def test_ingest_importer_web(ingest_web_importer):
     with open("tests/files/example_ingest_html.json", "r") as f:
         ingest_web_importer.bezerk_mode = True
         counts = JsonLinePusher(ingest_web_importer, f).run()
-    print(counts)
     assert counts["insert"] == 1
     assert counts["exists"] == 0
     assert counts["skip"] == 0
@@ -139,7 +149,6 @@ def test_ingest_importer_stage(ingest_importer, api):
         ingest_importer.reset()
         ingest_importer.push_record(raw)
         counts = ingest_importer.finish()
-        print(counts)
         assert counts["total"] == 1
         assert counts[row["status"]] == 1
 
@@ -182,3 +191,118 @@ def test_ingest_dict_parse_old(ingest_importer):
         if u.rel == "webarchive":
             assert u.url.startswith("https://web.archive.org/")
     assert len(f.release_ids) == 1
+
+
+def test_ingest_fileset_dict_parse(ingest_fileset_importer):
+ with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+ raw = json.loads(f.readline())
+ fs = ingest_fileset_importer.parse_record(raw)
+ assert len(fs.manifest) == 3
+ assert fs.manifest[0].sha1 == "c0669e84e7b9052cc0f342e8ce7d31d59956326a"
+ assert fs.manifest[0].md5 == "caf4d9fc2c6ebd0d9251ac84e0b6b006"
+ assert fs.manifest[0].mimetype == "application/x-hdf"
+ assert fs.manifest[0].size == 16799750
+ assert fs.manifest[0].path == "N2 on food R_2010_03_25__10_53_27___4___1_features.hdf5"
+ assert (
+ fs.manifest[0].extra["original_url"]
+ == "https://zenodo.org/api/files/563203f6-6de5-46d9-b305-ba42604f2508/N2%20on%20food%20R_2010_03_25__10_53_27___4___1_features.hdf5"
+ )
+ assert len(fs.urls) == 2
+ matched = 0
+ for u in fs.urls:
+ if u.rel == "web":
+ assert u.url == "https://zenodo.org/record/1028059"
+ matched += 1
+ if u.rel == "archive-base":
+ assert u.url == "https://archive.org/download/zenodo.org-1028059/"
+ matched += 1
+ assert matched == 2
+ assert len(fs.release_ids) == 1
+
+
+def test_ingest_fileset_importer(ingest_fileset_importer):
+ last_index = ingest_fileset_importer.api.get_changelog(limit=1)[0].index
+ with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+ ingest_fileset_importer.bezerk_mode = True
+ counts = JsonLinePusher(ingest_fileset_importer, f).run()
+ assert counts["insert"] == 7
+ assert counts["exists"] == 0
+ assert counts["skip"] == 13
+ assert counts["skip-release-not-found"] == 13
+
+ # fetch most recent editgroup
+ change = ingest_fileset_importer.api.get_changelog_entry(index=last_index + 1)
+ eg = change.editgroup
+ assert eg.description
+ assert "filesets crawled from web" in eg.description.lower()
+ assert eg.extra["git_rev"]
+ assert "fatcat_tools.IngestFilesetResultImporter" in eg.extra["agent"]
+
+ # re-insert; should skip
+ with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+ ingest_fileset_importer.reset()
+ ingest_fileset_importer.bezerk_mode = False
+ counts = JsonLinePusher(ingest_fileset_importer, f).run()
+
+ assert counts["insert"] == 0
+ assert counts["exists"] == 7
+ assert counts["skip"] == 13
+ assert counts["skip-release-not-found"] == 13
+
+
+def test_ingest_fileset_file_dict_parse(ingest_fileset_file_importer):
+ with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
+ raw = json.loads(f.readline())
+ fe = ingest_fileset_file_importer.parse_record(raw)
+ assert fe.sha1 == "6fb020064da66bb7a666c17555611cf6820fc9ae"
+ assert fe.md5 == "dfc41b617564f99a12e6077a6208876f"
+ assert fe.sha256 == "2febad53ff0f163a18d7cbb913275bf99ed2544730cda191458837e2b0da9d18"
+ assert fe.mimetype == "image/tiff"
+ assert fe.size == 410631015
+ assert fe.extra["path"] == "NDVI_Diff_1990_2018_T06.tif"
+ assert len(fe.urls) == 2
+ matched = 0
+ for u in fe.urls:
+ if u.rel == "web":
+ assert u.url == "https://ndownloader.figshare.com/files/14460875"
+ matched += 1
+ if u.rel == "archive":
+ assert (
+ u.url
+ == "https://archive.org/download/springernature.figshare.com-7767695-v1/NDVI_Diff_1990_2018_T06.tif"
+ )
+ matched += 1
+ assert matched == 2
+ assert len(fe.release_ids) == 1
+
+
+def test_ingest_fileset_file_importer(ingest_fileset_file_importer):
+ """
+    Similar to the above, but specifically tests the 'file'/'success-file' import pathway
+ """
+ last_index = ingest_fileset_file_importer.api.get_changelog(limit=1)[0].index
+ with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
+ ingest_fileset_file_importer.bezerk_mode = True
+ counts = JsonLinePusher(ingest_fileset_file_importer, f).run()
+ assert counts["insert"] == 16
+ assert counts["exists"] == 0
+ assert counts["skip"] == 4
+ assert counts["skip-bad-hashes"] == 4
+
+ # fetch most recent editgroup
+ change = ingest_fileset_file_importer.api.get_changelog_entry(index=last_index + 1)
+ eg = change.editgroup
+ assert eg.description
+ assert "crawled from web" in eg.description.lower()
+ assert eg.extra["git_rev"]
+ assert "fatcat_tools.IngestFilesetFileResultImporter" in eg.extra["agent"]
+
+ # re-insert; should skip
+ with open("tests/files/example_fileset_file_ingest_result.json", "r") as f:
+ ingest_fileset_file_importer.reset()
+ ingest_fileset_file_importer.bezerk_mode = False
+ counts = JsonLinePusher(ingest_fileset_file_importer, f).run()
+ assert counts["insert"] == 0
+ assert counts["exists"] == 16
+ assert counts["skip"] == 4
+ assert counts["skip-bad-hashes"] == 4