From 21eb32b976f71738745115244c3c0be49faa8648 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 22 Sep 2018 18:39:12 -0700 Subject: changes to crossref importer (and tests) --- python/fatcat/crossref_importer.py | 125 +++++++++++++++++++++++++++---------- python/tests/crossref.py | 35 +++++++++++ 2 files changed, 127 insertions(+), 33 deletions(-) diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py index 3c6ba3ef..99fed8d5 100644 --- a/python/fatcat/crossref_importer.py +++ b/python/fatcat/crossref_importer.py @@ -50,24 +50,38 @@ class FatcatCrossrefImporter(FatcatImporter): return None # contribs - contribs = [] - for i, am in enumerate(obj['author']): - creator_id = None - if 'ORCID' in am.keys(): - creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) - # Sorry humans :( - if am.get('given') and am.get('family'): - raw_name = "{} {}".format(am['given'], am['family']) - elif am.get('family'): - raw_name = am['family'] - else: - # TODO: defaults back to a pseudo-null value - raw_name = am.get('given', '') - contribs.append(fatcat_client.ReleaseContrib( - creator_id=creator_id, - index=i+1, - raw_name=raw_name, - role="author")) + def do_contribs(obj_list, ctype): + contribs = [] + for i, am in enumerate(obj_list): + creator_id = None + if 'ORCID' in am.keys(): + creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) + # Sorry humans :( + if am.get('given') and am.get('family'): + raw_name = "{} {}".format(am['given'], am['family']) + elif am.get('family'): + raw_name = am['family'] + else: + # TODO: defaults back to a pseudo-null value + raw_name = am.get('given', '') + extra = None + if ctype == "author": + index = i + else: + index = None + if am.get('affiliation'): + # note: affiliation => affiliations + extra = dict(affiliations=am.get('affiliation')) + contribs.append(fatcat_client.ReleaseContrib( + creator_id=creator_id, + index=index, + raw_name=raw_name, + role=ctype, + extra=extra)) + return contribs + contribs = do_contribs(obj['author'], "author") + contribs.extend(do_contribs(obj.get('editor', []), "editor")) + contribs.extend(do_contribs(obj.get('translator', []), "translator")) # container issn = obj.get('ISSN', [None])[0] @@ -95,20 +109,39 @@ class FatcatCrossrefImporter(FatcatImporter): year = None except: year = None - extra = dict(crossref=rm) - if rm.get('DOI') != None: + extra = rm.copy() + if rm.get('DOI'): extra['doi'] = rm.get('DOI').lower() + key = rm.get('key') + if key and key.startswith(obj['DOI'].upper()): + key = key.replace(obj['DOI'].upper() + "-", '') + key = key.replace(obj['DOI'].upper(), '') + container_name = rm.get('volume-title') + if not container_name: + container_name = rm.get('journal-title') + extra.pop('DOI', None) + extra.pop('key', None) + extra.pop('year', None) + extra.pop('volume-name', None) + extra.pop('journal-title', None) + extra.pop('title', None) + extra.pop('first-page', None) + extra.pop('doi-asserted-by', None) + if extra: + extra = dict(crossref=extra) + else: + extra = None refs.append(fatcat_client.ReleaseRef( - index=i+1, + index=i, # doing lookups would be a second import pass target_release_id=None, - # unreliable for crossref: key=rm['key'].split('|')[-1], + key=key, year=year, - container_title=rm.get('volume-title'), + container_name=container_name, title=rm.get('title'), locator=rm.get('first-page'), # TODO: just dump JSON somewhere here? - extra=dict(crossref=rm))) + extra=extra)) # abstracts abstracts = [] @@ -117,14 +150,37 @@ class FatcatCrossrefImporter(FatcatImporter): mimetype="application/xml+jats", content=obj.get('abstract'))) - # release - extra = dict(crossref={ - # TODO: if exsits: group_title, subtitle, isPreprintOf - 'links': obj.get('link', []), - 'subject': obj.get('subject'), - 'type': obj['type'], - 'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None, - 'alternative-id': obj.get('alternative-id', [])}) + # extra fields + extra = dict() + for key in ('subject', 'type', 'license', 'alternative-id', + 'container-title', 'original-title', 'subtitle', 'archive', + 'funder', 'group-title'): + val = obj.get(key) + if val: + extra[key] = val + if 'license' in extra and extra['license']: + for i in range(len(extra['license'])): + if 'start' in extra['license'][i]: + extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if len(obj['title']) > 1: + extra['other-titles'] = obj['title'][1:] + extra['is_kept'] = len(obj.get('archive', [])) > 0 + + # ISBN + isbn13 = None + for raw in obj.get('ISBN', []): + # TODO: convert if not ISBN-13 format + if len(raw) == 17: + isbn13 = raw + break + + # release status + if obj['type'] in ('journal-article', 'conference-proceeding', 'book', + 'dissertation', 'book-chapter'): + release_status = "published" + else: + # unknown + release_status = None # external identifiers extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) @@ -135,8 +191,11 @@ class FatcatCrossrefImporter(FatcatImporter): contribs=contribs, refs=refs, container_id=container_id, + publisher=publisher, release_type=obj['type'], + release_status=release_status, doi=obj['DOI'].lower(), + isbn13=isbn13, core_id=extids['core_id'], pmid=extids['pmid'], pmcid=extids['pmcid'], @@ -146,7 +205,7 @@ class FatcatCrossrefImporter(FatcatImporter): volume=obj.get('volume'), pages=obj.get('page'), abstracts=abstracts, - extra=extra) + extra=dict(crossref=extra)) return (re, ce) def create_row(self, row, editgroup=None): diff --git a/python/tests/crossref.py b/python/tests/crossref.py index e9814da2..59be9886 100644 --- a/python/tests/crossref.py +++ b/python/tests/crossref.py @@ -1,4 +1,5 @@ +import json import pytest from fatcat.crossref_importer import FatcatCrossrefImporter @@ -20,3 +21,37 @@ def test_crossref_importer_create(crossref_importer): crossref_importer.create_containers = True with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f: crossref_importer.process_source(f) + +def test_crossref_dict_parse(crossref_importer): + with open('tests/files/crossref-works.single.json', 'r') as f: + # not a single line + raw = json.loads(f.read()) + (r, c) = crossref_importer.parse_crossref_dict(raw) + extra = r.extra['crossref'] + assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" + assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t" + assert r.publisher == "Wiley-Blackwell" + print(extra) + assert extra['container-title'] == ["International Journal of Quantum Chemistry"] + assert r.release_type == "journal-article" + assert r.release_status == "published" + assert r.isbn13 == "978-3-16-148410-0" + assert 'subtitle' not in extra + assert 'archive' not in extra + assert 'funder' not in extra + assert len(r.contribs) == 5 + assert r.contribs[0].raw_name == "Marcelo D. Radicioni" + assert r.contribs[0].index == 0 + assert r.contribs[1].extra['affiliations'] == ["Some University"] + assert r.contribs[1].role == "author" + assert r.contribs[3].role == "editor" + assert r.contribs[3].index is None + assert r.contribs[4].role == "translator" + assert r.contribs[4].index is None + assert len(r.refs) == 25 + assert r.refs[0].key == "BIB1" + assert r.refs[0].year == 1972 + assert r.refs[0].locator == "1734" + assert r.refs[0].container_name == "J. Chem. Phys." + assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"} + assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry" -- cgit v1.2.3