aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-09-12 11:06:06 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-09-12 11:06:06 -0700
commit722e4321bfa6431136b1e03f0c9e4ac11903c4df (patch)
treefb688bf672f209616b0f8049b4749cbe8da0d3fa
parent1db3bbe11139cee2034cc3f632d13bcce2c03a89 (diff)
downloadfatcat-722e4321bfa6431136b1e03f0c9e4ac11903c4df.tar.gz
fatcat-722e4321bfa6431136b1e03f0c9e4ac11903c4df.zip
extid support for crossref importer
-rw-r--r--python/fatcat/crossref_importer.py31
-rwxr-xr-xpython/fatcat_import.py7
-rw-r--r--python/tests/crossref.py2
-rw-r--r--python/tests/files/example_map.sqlite3bin0 -> 72704 bytes
4 files changed, 36 insertions, 4 deletions
diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py
index a59d0a45..a2e14ed1 100644
--- a/python/fatcat/crossref_importer.py
+++ b/python/fatcat/crossref_importer.py
@@ -1,6 +1,7 @@
import sys
import json
+import sqlite3
import itertools
import fatcat_client
from fatcat.importer_common import FatcatImporter
@@ -8,10 +9,31 @@ from fatcat.importer_common import FatcatImporter
class FatcatCrossrefImporter(FatcatImporter):
- def __init__(self, host_url, issn_map_file, create_containers=True):
+ def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):
super().__init__(host_url, issn_map_file)
+ self.extid_map_db = None
+ if extid_map_file:
+ db_uri = "file:{}?mode=ro".format(extid_map_file)
+ print("Using external ID map: {}".format(db_uri))
+ self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+ else:
+ print("Not using external ID map")
self.create_containers = create_containers
+ def lookup_ext_ids(self, doi):
+ if self.extid_map_db is None:
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+ [doi.lower()]).fetchone()
+ if row is None:
+ return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+ row = [str((cell or None)) for cell in row]
+ return dict(
+ core_id=row[0],
+ pmid=row[1],
+ pmcid=row[2],
+ wikidata_qid=row[3])
+
def parse_crossref_dict(self, obj):
"""
obj is a python dict (parsed from json).
@@ -103,6 +125,9 @@ class FatcatCrossrefImporter(FatcatImporter):
'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None,
'alternative-id': obj.get('alternative-id', [])})
+ # external identifiers
+ extids = self.lookup_ext_ids(doi=obj['DOI'].lower())
+
re = fatcat_client.ReleaseEntity(
work_id=None,
title=obj['title'][0],
@@ -111,6 +136,10 @@ class FatcatCrossrefImporter(FatcatImporter):
container_id=container_id,
release_type=obj['type'],
doi=obj['DOI'].lower(),
+ core_id=extids['core_id'],
+ pmid=extids['pmid'],
+ pmcid=extids['pmcid'],
+ wikidata_qid=extids['wikidata_qid'],
release_date=obj['created']['date-time'],
issue=obj.get('issue'),
volume=obj.get('volume'),
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 2804a210..2f0c746f 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -10,7 +10,7 @@ from fatcat.issn_importer import FatcatIssnImporter
def run_import_crossref(args):
fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file,
- create_containers=(not args.no_create_containers))
+ args.extid_map_file, create_containers=(not args.no_create_containers))
fci.process_batch(args.json_file, size=args.batch_size)
def run_import_orcid(args):
@@ -46,7 +46,10 @@ def main():
default=sys.stdin, type=argparse.FileType('r'))
sub_import_crossref.add_argument('issn_map_file',
help="ISSN to ISSN-L mapping file",
- default=sys.stdin, type=argparse.FileType('r'))
+ default=None, type=argparse.FileType('r'))
+ sub_import_crossref.add_argument('extid_map_file',
+ help="DOI-to-other-identifiers sqlite3 database",
+ default=None, type=argparse.FileType('r'))
sub_import_crossref.add_argument('--no-create-containers',
action='store_true',
help="skip creation of new container entities based on ISSN")
diff --git a/python/tests/crossref.py b/python/tests/crossref.py
index 402195d7..e9814da2 100644
--- a/python/tests/crossref.py
+++ b/python/tests/crossref.py
@@ -6,7 +6,7 @@ from fatcat.crossref_importer import FatcatCrossrefImporter
@pytest.fixture(scope="function")
def crossref_importer():
with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
- yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file)
+ yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')
def test_crossref_importer_batch(crossref_importer):
with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:
diff --git a/python/tests/files/example_map.sqlite3 b/python/tests/files/example_map.sqlite3
new file mode 100644
index 00000000..e9aa1bbe
--- /dev/null
+++ b/python/tests/files/example_map.sqlite3
Binary files differ