From 722e4321bfa6431136b1e03f0c9e4ac11903c4df Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 12 Sep 2018 11:06:06 -0700
Subject: extid support for crossref importer

---
Review notes (not part of the commit message):
- lookup_ext_ids: NULL cells were turned into the string "None" by
  str(cell or None); they are now returned as None.
- extid_map_file is parsed as a plain path (type=str); argparse.FileType
  handed an open file object to the sqlite3 "file:...?mode=ro" URI.
- Line breaks and indentation were restored by hand, and the blob hashes on
  the "index" lines predate these fixes.

 python/fatcat/crossref_importer.py     |  37 ++++++++++++++++++++++++++++++++++++-
 python/fatcat_import.py                |   7 +++++--
 python/tests/crossref.py               |   2 +-
 python/tests/files/example_map.sqlite3 | Bin 0 -> 72704 bytes
 4 files changed, 42 insertions(+), 4 deletions(-)
 create mode 100644 python/tests/files/example_map.sqlite3

diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index a59d0a45..a2e14ed1 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -1,6 +1,7 @@
 
 import sys
 import json
+import sqlite3
 import itertools
 import fatcat_client
 from fatcat.importer_common import FatcatImporter
@@ -8,10 +9,37 @@ from fatcat.importer_common import FatcatImporter
 
 class FatcatCrossrefImporter(FatcatImporter):
 
-    def __init__(self, host_url, issn_map_file, create_containers=True):
+    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
         super().__init__(host_url, issn_map_file)
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri))
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map")
         self.create_containers = create_containers
 
+    def lookup_ext_ids(self, doi):
+        """
+        Returns a dict of external identifiers (core_id, pmid, pmcid,
+        wikidata_qid) for the given DOI. Every value is None when no map
+        database is configured or the DOI has no row; NULL or empty columns
+        are also returned as None, never as the string "None".
+        """
+        if self.extid_map_db is None:
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            [doi.lower()]).fetchone()
+        if row is None:
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+        row = [str(cell or '') or None for cell in row]
+        return dict(
+            core_id=row[0],
+            pmid=row[1],
+            pmcid=row[2],
+            wikidata_qid=row[3])
+
     def parse_crossref_dict(self, obj):
         """
         obj is a python dict (parsed from json).
@@ -103,6 +131,9 @@ class FatcatCrossrefImporter(FatcatImporter):
             'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None,
             'alternative-id': obj.get('alternative-id', [])})
 
+        # external identifiers
+        extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
+
         re = fatcat_client.ReleaseEntity(
             work_id=None,
             title=obj['title'][0],
@@ -111,6 +142,10 @@ class FatcatCrossrefImporter(FatcatImporter):
             container_id=container_id,
             release_type=obj['type'],
             doi=obj['DOI'].lower(),
+            core_id=extids['core_id'],
+            pmid=extids['pmid'],
+            pmcid=extids['pmcid'],
+            wikidata_qid=extids['wikidata_qid'],
             release_date=obj['created']['date-time'],
             issue=obj.get('issue'),
             volume=obj.get('volume'),
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 2804a210..2f0c746f 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -10,7 +10,7 @@ from fatcat.issn_importer import FatcatIssnImporter
 
 def run_import_crossref(args):
     fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file,
-        create_containers=(not args.no_create_containers))
+        args.extid_map_file, create_containers=(not args.no_create_containers))
     fci.process_batch(args.json_file, size=args.batch_size)
 
 def run_import_orcid(args):
@@ -46,7 +46,10 @@ def main():
         default=sys.stdin, type=argparse.FileType('r'))
     sub_import_crossref.add_argument('issn_map_file',
         help="ISSN to ISSN-L mapping file",
-        default=sys.stdin, type=argparse.FileType('r'))
+        default=None, type=argparse.FileType('r'))
+    sub_import_crossref.add_argument('extid_map_file',
+        help="DOI-to-other-identifiers sqlite3 database",
+        default=None, type=str)
     sub_import_crossref.add_argument('--no-create-containers',
         action='store_true',
         help="skip creation of new container entities based on ISSN")
diff --git a/python/tests/crossref.py b/python/tests/crossref.py
index 402195d7..e9814da2 100644
--- a/python/tests/crossref.py
+++ b/python/tests/crossref.py
@@ -6,7 +6,7 @@ from fatcat.crossref_importer import FatcatCrossrefImporter
 @pytest.fixture(scope="function")
 def crossref_importer():
     with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
-        yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file)
+        yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
 
 def test_crossref_importer_batch(crossref_importer):
     with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
diff --git a/python/tests/files/example_map.sqlite3 b/python/tests/files/example_map.sqlite3
new file mode 100644
index 00000000..e9aa1bbe
Binary files /dev/null and b/python/tests/files/example_map.sqlite3 differ
-- 
cgit v1.2.3