author     Bryan Newbold <bnewbold@robocracy.org>   2018-11-21 11:58:46 -0800
committer  Bryan Newbold <bnewbold@robocracy.org>   2018-11-21 11:58:46 -0800
commit     7ec413416acb2b3d7da0be32b78982316b9c696f (patch)
tree       cc0799316a0875d7aea6f1d9fddc03fb5e505410 /python
parent     008366697aba8046fd33ae1f3707972d87c9a342 (diff)
crossref importer checks for existing DOIs
Diffstat (limited to 'python')
-rw-r--r--  python/README_import.md                     2
-rwxr-xr-x  python/fatcat_import.py                    10
-rw-r--r--  python/fatcat_tools/importers/crossref.py  17
-rw-r--r--  python/fatcat_tools/importers/matched.py    6
-rw-r--r--  python/tests/import_crossref.py            16
5 files changed, 42 insertions, 9 deletions
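
The substance of the commit: before creating a new release from a Crossref record, the importer can now look up the record's DOI via the API and skip the record if a release already exists. The pattern in isolation (a minimal sketch assuming a configured fatcat_client API object; the helper name is illustrative, and the real implementation is in the crossref.py hunk below):

    import fatcat_client

    def existing_release_for_doi(api, doi):
        # DOIs are matched lowercase, so normalize before the lookup
        try:
            return api.lookup_release(doi=doi.lower())
        except fatcat_client.rest.ApiException as err:
            # 404 just means "no such release"; any other status is a
            # real API failure and should abort the import
            if err.status != 404:
                raise
            return None

Note that only 404 is swallowed: re-raising everything else means a flaky API aborts the run instead of silently importing duplicates.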
diff --git a/python/README_import.md b/python/README_import.md
index 9ee24f8e..cc9a94e1 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -57,7 +57,7 @@ Usually 24 hours or so on fast production machine.
Unknown speed!
# No file update for the first import...
- zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-update -
+ zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates -
# ... but do on the second
zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched -
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index aad4ee57..fe5b24a6 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -8,7 +8,8 @@ from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \
def run_crossref(args):
fci = CrossrefImporter(args.host_url, args.issn_map_file,
- args.extid_map_file, create_containers=(not args.no_create_containers))
+ args.extid_map_file, create_containers=(not args.no_create_containers),
+ check_existing=(not args.no_release_updates))
if args.kafka_mode:
consumer = make_kafka_consumer(
args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import")
@@ -29,7 +30,7 @@ def run_issn(args):
def run_matched(args):
fmi = MatchedImporter(args.host_url,
- skip_file_update=args.no_file_update)
+ skip_file_updates=args.no_file_updates)
fmi.process_batch(args.json_file, size=args.batch_size)
fmi.describe_run()
@@ -74,6 +75,9 @@ def main():
sub_crossref.add_argument('--kafka-mode',
action='store_true',
help="consume from kafka topic (not stdin)")
+ sub_crossref.add_argument('--no-release-updates',
+ action='store_true',
+ help="don't lookup existing DOIs, just insert (only for bootstrap)")
sub_orcid = subparsers.add_parser('orcid')
sub_orcid.set_defaults(func=run_orcid)
@@ -98,7 +102,7 @@ def main():
sub_matched.add_argument('json_file',
help="JSON file to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_matched.add_argument('--no-file-update',
+ sub_matched.add_argument('--no-file-updates',
action='store_true',
help="don't lookup existing files, just insert (only for bootstrap)")
sub_matched.add_argument('--batch-size',
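
For callers constructing the importers directly, the new keyword arguments line up with the CLI flags above (a sketch; the host URL and ISSN map path are the ones the test suite uses, not production values):

    from fatcat_tools.importers import CrossrefImporter

    issn_file = open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r')
    fci = CrossrefImporter(
        "http://localhost:9411/v0",  # args.host_url
        issn_file,                   # args.issn_map_file
        create_containers=True,      # inverted by --no-create-containers
        check_existing=False,        # inverted by --no-release-updates (bootstrap only)
    )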
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 475afdb0..385a8235 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -40,7 +40,7 @@ class CrossrefImporter(FatcatImporter):
See https://github.com/CrossRef/rest-api-doc for JSON schema notes
"""
- def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
+ def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True):
super().__init__(host_url, issn_map_file)
self.extid_map_db = None
if extid_map_file:
@@ -50,6 +50,7 @@ class CrossrefImporter(FatcatImporter):
else:
print("Not using external ID map")
self.create_containers = create_containers
+ self.check_existing = check_existing
def lookup_ext_ids(self, doi):
if self.extid_map_db is None:
@@ -85,6 +86,20 @@ class CrossrefImporter(FatcatImporter):
'book-track', 'proceedings-series'):
return None
+ # lookup existing DOI
+ existing_release = None
+ if self.check_existing:
+ try:
+ existing_release = self.api.lookup_release(doi=obj['DOI'].lower())
+ except fatcat_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing_release:
+ return None
+
# contribs
def do_contribs(obj_list, ctype):
contribs = []
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 732fccbe..6f83dd23 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -37,12 +37,12 @@ class MatchedImporter(FatcatImporter):
- core_id, wikidata_id, pmcid, pmid: not as lists
"""
- def __init__(self, host_url, skip_file_update=False, default_mime=None,
+ def __init__(self, host_url, skip_file_updates=False, default_mime=None,
default_link_rel="web"):
super().__init__(host_url)
self.default_mime = default_mime
self.default_link_rel = default_link_rel
- self.skip_file_update = skip_file_update
+ self.skip_file_updates = skip_file_updates
def make_url(self, raw):
rel = self.default_link_rel
@@ -61,7 +61,7 @@ class MatchedImporter(FatcatImporter):
# lookup sha1, or create new entity
fe = None
- if not self.skip_file_update:
+ if not self.skip_file_updates:
try:
fe = self.api.lookup_file(sha1=sha1)
except fatcat_client.rest.ApiException as err:
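
The matched importer gets the same kind of guard under the pluralized kwarg (a sketch mirroring run_matched above; skipping the sha1 lookups is only safe for a bootstrap import):

    from fatcat_tools.importers import MatchedImporter

    fmi = MatchedImporter("http://localhost:9411/v0",
                          skip_file_updates=True)  # bootstrap: no sha1 lookups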
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index c129e729..1fb4a70f 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -7,7 +7,12 @@ from fatcat_tools.importers import CrossrefImporter
@pytest.fixture(scope="function")
def crossref_importer():
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
+ yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3', check_existing=False)
+
+@pytest.fixture(scope="function")
+def crossref_importer_existing():
+ with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+ yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3', check_existing=True)
def test_crossref_importer_batch(crossref_importer):
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
@@ -61,3 +66,12 @@ def test_crossref_dict_parse(crossref_importer):
assert r.refs[0].container_name == "J. Chem. Phys."
assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"}
assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry"
+
+def test_stateful_checking(crossref_importer_existing):
+ with open('tests/files/crossref-works.single.json', 'r') as f:
+ # not a single line, a whole document
+ raw = json.loads(f.read())
+ # might not exist yet...
+ crossref_importer_existing.process_source([json.dumps(raw)])
+ # ok, make sure we get 'None' back
+ assert crossref_importer_existing.parse_crossref_dict(raw) is None
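
The new test exercises the whole round trip. The same check can be reproduced outside pytest (a sketch, assuming a fatcat API listening on localhost:9411 and the fixture files shipped with the tests; process_source and parse_crossref_dict are the methods the test above calls):

    import json
    from fatcat_tools.importers import CrossrefImporter

    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
        imp = CrossrefImporter("http://localhost:9411/v0", issn_file,
                               check_existing=True)
        with open('tests/files/crossref-works.single.json', 'r') as f:
            record = json.loads(f.read())
        # first pass inserts the release (if the DOI was not already present)
        imp.process_source([json.dumps(record)])
        # a second parse now finds the existing DOI and returns None (skipped)
        assert imp.parse_crossref_dict(record) is None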