aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Martin Czygan <martin@archive.org> 2020-05-28 00:36:35 +0000
committer Martin Czygan <martin@archive.org> 2020-05-28 00:36:35 +0000
commit 92bdf2bd5d99ce6eb76ea1dcec27e21c6f362651 (patch)
tree 5b48a6132a469bc430dc51ee0e57aa2f6f241861 /python
parent 174cf39a02b85c69ada9bea609be2fc06c172e68 (diff)
parent 9bbc8f4ceec54563d71c68564b35ceb14ac60840 (diff)
download fatcat-92bdf2bd5d99ce6eb76ea1dcec27e21c6f362651.tar.gz
fatcat-92bdf2bd5d99ce6eb76ea1dcec27e21c6f362651.zip
Merge branch 'bnewbold-ingest-stage' into 'master'
verify release_stage in ingest importer

See merge request webgroup/fatcat!52
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/ingest.py 5
-rw-r--r-- python/tests/files/example_ingest.json 3
-rw-r--r-- python/tests/import_ingest.py 45
3 files changed, 46 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 6cf1604b..2b630e67 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -109,6 +109,11 @@ class IngestFileResultImporter(EntityImporter):
self.counts['warn-extid-invalid'] += 1
continue
raise err
+ # verify release_stage
+ if request.get('release_stage') and release.release_stage:
+ if request['release_stage'] != release.release_stage:
+ self.counts['skip-release-stage'] += 1
+ return None
release_ident = release.ident
break
if self.use_glutton_match and not release_ident and row.get('grobid'):
diff --git a/python/tests/files/example_ingest.json b/python/tests/files/example_ingest.json
index 18804c1b..cea67fa7 100644
--- a/python/tests/files/example_ingest.json
+++ b/python/tests/files/example_ingest.json
@@ -1 +1,2 @@
-{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"project": "unit-tests", "ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 } }
+{"file_meta": {"sha1hex": "00242a192acc258bdfdb151943419437f440c313", "md5hex": "f4de91152c7ab9fdc2a128f962faebff", "sha256hex": "ffc1005680cb620eec4c913437dfabbf311b535cfe16cbaeb2faec1f92afc362", "size_bytes": 255629, "mimetype": "application/pdf"}, "request": {"ingest_request_source": "fatcat-changelog", "link_source": "doi", "link_source_id":"10.123/abc","ext_ids": {"doi": "10.123/abc"}}, "terminal": {"terminal_url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable", "terminal_dt": "20170227164644", "terminal_sha1hex": "00242a192acc258bdfdb151943419437f440c313"}, "cdx": { "datetime": "20170227164644", "url": "http://journals.plos.org/plosmedicine/article/file?id=10.1371/journal.pmed.0020124&type=printable" }, "grobid": {"status_code": 200 }, "hit": true, "status": "success"}
+{"request":{"ingest_type":"pdf","ingest_request_source":"fatcat-changelog","base_url":"https://doi.org/10.3917/popav.748.0017","release_stage":"published","fatcat":{"release_ident":"weeqjkvsx5abze2bhithyrx6wu","work_ident":"ujatsk25yrdw5gofubw7nogzgq"},"ext_ids":{"doi":"10.3917/popav.748.0017"},"link_source":"doi","link_source_id":"10.3917/popav.748.0017"},"hit":false,"hops":["https://doi.org/10.3917/popav.748.0017"],"status":"wayback-error","error_message":"replay fetch didn't return X-Archive-Src in headers"}
diff --git a/python/tests/import_ingest.py b/python/tests/import_ingest.py
index 5089d99e..02486de6 100644
--- a/python/tests/import_ingest.py
+++ b/python/tests/import_ingest.py
@@ -2,7 +2,7 @@
import json
import pytest
from fatcat_tools.importers import IngestFileResultImporter, JsonLinePusher
-from fixtures import api
+from fixtures import *
@pytest.fixture(scope="function")
@@ -14,15 +14,14 @@ def test_ingest_importer_basic(ingest_importer):
with open('tests/files/example_ingest.json', 'r') as f:
JsonLinePusher(ingest_importer, f).run()
-@pytest.mark.skip("tests not flushed out yet")
def test_ingest_importer(ingest_importer):
last_index = ingest_importer.api.get_changelog(limit=1)[0].index
with open('tests/files/example_ingest.json', 'r') as f:
ingest_importer.bezerk_mode = True
counts = JsonLinePusher(ingest_importer, f).run()
- assert counts['insert'] == 2
+ assert counts['insert'] == 1
assert counts['exists'] == 0
- assert counts['skip'] == 11
+ assert counts['skip'] == 1
# fetch most recent editgroup
change = ingest_importer.api.get_changelog_entry(index=last_index+1)
@@ -38,8 +37,42 @@ def test_ingest_importer(ingest_importer):
ingest_importer.bezerk_mode = False
counts = JsonLinePusher(ingest_importer, f).run()
assert counts['insert'] == 0
- assert counts['exists'] == 2
- assert counts['skip'] == 11
+ assert counts['exists'] == 1
+ assert counts['skip'] == 1
+
+def test_ingest_importer_stage(ingest_importer, api):
+ """
+ Tests that ingest importer correctly handles release stage matching
+ """
+ test_table = [
+ dict(request_stage=None, release_stage=None, status="insert"),
+ dict(request_stage="published", release_stage=None, status="insert"),
+ dict(request_stage=None, release_stage="draft", status="insert"),
+ dict(request_stage="published", release_stage="published", status="insert"),
+ dict(request_stage="draft", release_stage="published", status="skip-release-stage"),
+ dict(request_stage="published", release_stage="draft", status="skip-release-stage"),
+ ]
+ ingest_importer.bezerk_mode = True
+ with open('tests/files/example_ingest.json', 'r') as f:
+ raw = json.loads(f.readline())
+ for row in test_table:
+ print(row)
+
+ # set dummy record stage
+ eg = quick_eg(api)
+ r1 = api.lookup_release(doi="10.123/abc")
+ r1.release_stage = row['release_stage']
+ c1 = api.update_release(eg.editgroup_id, r1.ident, r1)
+ api.accept_editgroup(eg.editgroup_id)
+
+ # set ingest request stage
+ raw['request']['release_stage'] = row['request_stage']
+ ingest_importer.reset()
+ ingest_importer.push_record(raw)
+ counts = ingest_importer.finish()
+ print(counts)
+ assert counts["total"] == 1
+ assert counts[row['status']] == 1
def test_ingest_dict_parse(ingest_importer):
with open('tests/files/example_ingest.json', 'r') as f: