diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-12 11:06:06 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-12 11:06:06 -0700 | 
| commit | 722e4321bfa6431136b1e03f0c9e4ac11903c4df (patch) | |
| tree | fb688bf672f209616b0f8049b4749cbe8da0d3fa | |
| parent | 1db3bbe11139cee2034cc3f632d13bcce2c03a89 (diff) | |
| download | fatcat-722e4321bfa6431136b1e03f0c9e4ac11903c4df.tar.gz fatcat-722e4321bfa6431136b1e03f0c9e4ac11903c4df.zip | |
extid support for crossref importer
| -rw-r--r-- | python/fatcat/crossref_importer.py | 31 | ||||
| -rwxr-xr-x | python/fatcat_import.py | 7 | ||||
| -rw-r--r-- | python/tests/crossref.py | 2 | ||||
| -rw-r--r-- | python/tests/files/example_map.sqlite3 | bin | 0 -> 72704 bytes | 
4 files changed, 36 insertions, 4 deletions
| diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py index a59d0a45..a2e14ed1 100644 --- a/python/fatcat/crossref_importer.py +++ b/python/fatcat/crossref_importer.py @@ -1,6 +1,7 @@  import sys  import json +import sqlite3  import itertools  import fatcat_client  from fatcat.importer_common import FatcatImporter @@ -8,10 +9,31 @@ from fatcat.importer_common import FatcatImporter  class FatcatCrossrefImporter(FatcatImporter): -    def __init__(self, host_url, issn_map_file, create_containers=True): +    def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True):          super().__init__(host_url, issn_map_file) +        self.extid_map_db = None +        if extid_map_file: +            db_uri = "file:{}?mode=ro".format(extid_map_file) +            print("Using external ID map: {}".format(db_uri)) +            self.extid_map_db = sqlite3.connect(db_uri, uri=True) +        else: +            print("Not using external ID map")          self.create_containers = create_containers +    def lookup_ext_ids(self, doi): +        if self.extid_map_db is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) +        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", +            [doi.lower()]).fetchone() +        if row is None: +            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) +        row = [str((cell or None)) for cell in row] +        return dict( +            core_id=row[0], +            pmid=row[1], +            pmcid=row[2], +            wikidata_qid=row[3]) +      def parse_crossref_dict(self, obj):          """          obj is a python dict (parsed from json). @@ -103,6 +125,9 @@ class FatcatCrossrefImporter(FatcatImporter):              'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None,              'alternative-id': obj.get('alternative-id', [])}) +        # external identifiers +        extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) +          re = fatcat_client.ReleaseEntity(              work_id=None,              title=obj['title'][0], @@ -111,6 +136,10 @@ class FatcatCrossrefImporter(FatcatImporter):              container_id=container_id,              release_type=obj['type'],              doi=obj['DOI'].lower(), +            core_id=extids['core_id'], +            pmid=extids['pmid'], +            pmcid=extids['pmcid'], +            wikidata_qid=extids['wikidata_qid'],              release_date=obj['created']['date-time'],              issue=obj.get('issue'),              volume=obj.get('volume'), diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 2804a210..2f0c746f 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -10,7 +10,7 @@ from fatcat.issn_importer import FatcatIssnImporter  def run_import_crossref(args):      fci = FatcatCrossrefImporter(args.host_url, args.issn_map_file, -        create_containers=(not args.no_create_containers)) +        args.extid_map_file, create_containers=(not args.no_create_containers))      fci.process_batch(args.json_file, size=args.batch_size)  def run_import_orcid(args): @@ -46,7 +46,10 @@ def main():          default=sys.stdin, type=argparse.FileType('r'))      sub_import_crossref.add_argument('issn_map_file',          help="ISSN to ISSN-L mapping file", -        default=sys.stdin, type=argparse.FileType('r')) +        default=None, type=argparse.FileType('r')) +    sub_import_crossref.add_argument('extid_map_file', +        help="DOI-to-other-identifiers sqlite3 database", +        default=None, type=argparse.FileType('r'))      sub_import_crossref.add_argument('--no-create-containers',          action='store_true',          help="skip creation of new container entities based on ISSN") diff --git a/python/tests/crossref.py b/python/tests/crossref.py index 402195d7..e9814da2 100644 --- a/python/tests/crossref.py +++ b/python/tests/crossref.py @@ -6,7 +6,7 @@ from fatcat.crossref_importer import FatcatCrossrefImporter  @pytest.fixture(scope="function")  def crossref_importer():      with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: -        yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file) +        yield FatcatCrossrefImporter("http://localhost:9411/v0", issn_file, 'tests/files/example_map.sqlite3')  def test_crossref_importer_batch(crossref_importer):      with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: diff --git a/python/tests/files/example_map.sqlite3 b/python/tests/files/example_map.sqlite3Binary files differ new file mode 100644 index 00000000..e9aa1bbe --- /dev/null +++ b/python/tests/files/example_map.sqlite3 | 
