From 7ec413416acb2b3d7da0be32b78982316b9c696f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 21 Nov 2018 11:58:46 -0800 Subject: crossref importer checks for existing DOIs --- python/README_import.md | 2 +- python/fatcat_import.py | 10 +++++++--- python/fatcat_tools/importers/crossref.py | 17 ++++++++++++++++- python/fatcat_tools/importers/matched.py | 6 +++--- python/tests/import_crossref.py | 16 +++++++++++++++- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/python/README_import.md b/python/README_import.md index 9ee24f8e..cc9a94e1 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -57,7 +57,7 @@ Usually 24 hours or so on fast production machine. Unknown speed! # No file update for the first import... - zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-update - + zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched --no-file-updates - # ... but do on the second zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py matched - diff --git a/python/fatcat_import.py b/python/fatcat_import.py index aad4ee57..fe5b24a6 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -8,7 +8,8 @@ from fatcat_tools.importers import CrossrefImporter, OrcidImporter, \ def run_crossref(args): fci = CrossrefImporter(args.host_url, args.issn_map_file, - args.extid_map_file, create_containers=(not args.no_create_containers)) + args.extid_map_file, create_containers=(not args.no_create_containers), + check_existing=(not args.no_release_updates)) if args.kafka_mode: consumer = make_kafka_consumer( args.kafka_hosts, args.kafka_env, "api-crossref", "fatcat-import") @@ -29,7 +30,7 @@ def run_issn(args): def run_matched(args): fmi = MatchedImporter(args.host_url, - skip_file_update=args.no_file_update) + skip_file_updates=args.no_file_updates) fmi.process_batch(args.json_file, size=args.batch_size) fmi.describe_run() @@ -74,6 +75,9 @@ def main(): sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)") + sub_crossref.add_argument('--no-release-updates', + action='store_true', + help="don't lookup existing DOIs, just insert (only for bootstrap)") sub_orcid = subparsers.add_parser('orcid') sub_orcid.set_defaults(func=run_orcid) @@ -98,7 +102,7 @@ def main(): sub_matched.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r')) - sub_matched.add_argument('--no-file-update', + sub_matched.add_argument('--no-file-updates', action='store_true', help="don't lookup existing files, just insert (only for bootstrap)") sub_matched.add_argument('--batch-size', diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 475afdb0..385a8235 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -40,7 +40,7 @@ class CrossrefImporter(FatcatImporter): See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ - def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True): + def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True, check_existing=True): super().__init__(host_url, issn_map_file) self.extid_map_db = None if extid_map_file: @@ -50,6 +50,7 @@ class CrossrefImporter(FatcatImporter): else: print("Not using external ID map") self.create_containers = create_containers + self.check_existing = check_existing def lookup_ext_ids(self, doi): if self.extid_map_db is None: @@ -85,6 +86,20 @@ class CrossrefImporter(FatcatImporter): 'book-track', 'proceedings-series'): return None + # lookup existing DOI + existing_release = None + if self.check_existing: + try: + existing_release = self.api.lookup_release(doi=obj['DOI'].lower()) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing_release: + return None + # contribs def do_contribs(obj_list, ctype): contribs = [] diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 732fccbe..6f83dd23 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -37,12 +37,12 @@ class MatchedImporter(FatcatImporter): - core_id, wikidata_id, pmcid, pmid: not as lists """ - def __init__(self, host_url, skip_file_update=False, default_mime=None, + def __init__(self, host_url, skip_file_updates=False, default_mime=None, default_link_rel="web"): super().__init__(host_url) self.default_mime = default_mime self.default_link_rel = default_link_rel - self.skip_file_update = skip_file_update + self.skip_file_updates = skip_file_updates def make_url(self, raw): rel = self.default_link_rel @@ -61,7 +61,7 @@ class MatchedImporter(FatcatImporter): # lookup sha1, or create new entity fe = None - if not self.skip_file_update: + if not self.skip_file_updates: try: fe = self.api.lookup_file(sha1=sha1) except fatcat_client.rest.ApiException as err: diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py index c129e729..1fb4a70f 100644 --- a/python/tests/import_crossref.py +++ b/python/tests/import_crossref.py @@ -7,7 +7,12 @@ from fatcat_tools.importers import CrossrefImporter @pytest.fixture(scope="function") def crossref_importer(): with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: - yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3') + yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3', check_existing=False) + +@pytest.fixture(scope="function") +def crossref_importer_existing(): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield CrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3', check_existing=True) def test_crossref_importer_batch(crossref_importer): with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: @@ -61,3 +66,12 @@ def test_crossref_dict_parse(crossref_importer): assert r.refs[0].container_name == "J. Chem. Phys." assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"} assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry" + +def test_stateful_checking(crossref_importer_existing): + with open('tests/files/crossref-works.single.json', 'r') as f: + # not a single line, a whole document + raw = json.loads(f.read()) + # might not exist yet... + crossref_importer_existing.process_source([json.dumps(raw)]) + # ok, make sure we get 'None' back + assert crossref_importer_existing.parse_crossref_dict(raw) is None -- cgit v1.2.3