aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests/import_ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/import_ingest.py')
-rw-r--r--python/tests/import_ingest.py63
1 files changed, 60 insertions, 3 deletions
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index d9e7d294..d86d6d7b 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -5,6 +5,7 @@ from fixtures import *
from fatcat_tools.importers import (
IngestFileResultImporter,
+ IngestFilesetResultImporter,
IngestWebResultImporter,
JsonLinePusher,
)
@@ -20,6 +21,11 @@ def ingest_web_importer(api):
yield IngestWebResultImporter(api)
+@pytest.fixture(scope="function")
+def ingest_fileset_importer(api):
+ yield IngestFilesetResultImporter(api)
+
+
# TODO: use API to check that entities actually created...
def test_ingest_importer_basic(ingest_importer):
with open("tests/files/example_ingest.json", "r") as f:
@@ -58,7 +64,6 @@ def test_ingest_importer_xml(ingest_importer):
with open("tests/files/example_ingest_xml.json", "r") as f:
ingest_importer.bezerk_mode = True
counts = JsonLinePusher(ingest_importer, f).run()
- print(counts)
assert counts["insert"] == 1
assert counts["exists"] == 0
assert counts["skip"] == 0
@@ -86,7 +91,6 @@ def test_ingest_importer_web(ingest_web_importer):
with open("tests/files/example_ingest_html.json", "r") as f:
ingest_web_importer.bezerk_mode = True
counts = JsonLinePusher(ingest_web_importer, f).run()
- print(counts)
assert counts["insert"] == 1
assert counts["exists"] == 0
assert counts["skip"] == 0
@@ -139,7 +143,6 @@ def test_ingest_importer_stage(ingest_importer, api):
ingest_importer.reset()
ingest_importer.push_record(raw)
counts = ingest_importer.finish()
- print(counts)
assert counts["total"] == 1
assert counts[row["status"]] == 1
@@ -182,3 +185,57 @@ def test_ingest_dict_parse_old(ingest_importer):
if u.rel == "webarchive":
assert u.url.startswith("https://web.archive.org/")
assert len(f.release_ids) == 1
+
+
+def test_ingest_fileset_dict_parse(ingest_fileset_importer):
+ with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+ raw = json.loads(f.readline())
+ fs = ingest_fileset_importer.parse_record(raw)
+ assert len(fs.manifest) == 3
+ assert fs.manifest[0].sha1 == "c0669e84e7b9052cc0f342e8ce7d31d59956326a"
+ assert fs.manifest[0].md5 == "caf4d9fc2c6ebd0d9251ac84e0b6b006"
+ assert fs.manifest[0].extra["mimetype"] == "application/x-hdf"
+ assert fs.manifest[0].size == 16799750
+ assert fs.manifest[0].path == "N2 on food R_2010_03_25__10_53_27___4___1_features.hdf5"
+ assert (
+ fs.manifest[0].extra["original_url"]
+ == "https://zenodo.org/api/files/563203f6-6de5-46d9-b305-ba42604f2508/N2%20on%20food%20R_2010_03_25__10_53_27___4___1_features.hdf5"
+ )
+ assert len(fs.urls) == 2
+ for u in fs.urls:
+ if u.rel == "web":
+ assert u.url == "https://zenodo.org/record/1028059"
+ if u.rel == "archive-base":
+ assert u.url == "https://archive.org/download/zenodo.org-1028059/"
+ assert len(fs.release_ids) == 1
+
+
+def test_ingest_fileset_importer(ingest_fileset_importer):
+ last_index = ingest_fileset_importer.api.get_changelog(limit=1)[0].index
+ with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+ ingest_fileset_importer.bezerk_mode = True
+ counts = JsonLinePusher(ingest_fileset_importer, f).run()
+ assert counts["insert"] == 7
+ assert counts["exists"] == 0
+ assert counts["skip"] == 13
+ assert counts["skip-release-not-found"] == 13
+
+ # fetch most recent editgroup
+ change = ingest_fileset_importer.api.get_changelog_entry(index=last_index + 1)
+ eg = change.editgroup
+ assert eg.description
+ assert "filesets crawled from web" in eg.description.lower()
+ assert eg.extra["git_rev"]
+ assert "fatcat_tools.IngestFilesetResultImporter" in eg.extra["agent"]
+
+ # re-insert; should skip
+ with open("tests/files/example_fileset_ingest_result.json", "r") as f:
+ ingest_fileset_importer.reset()
+ ingest_fileset_importer.bezerk_mode = False
+ counts = JsonLinePusher(ingest_fileset_importer, f).run()
+
+ assert counts["insert"] == 0
+ assert counts["exists"] == 0
+ assert counts["skip"] == 20
+ assert counts["skip-release-not-found"] == 13
+ assert counts["skip-release-has-fileset"] == 7