diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat/crossref_importer.py | 125 | ||||
| -rw-r--r-- | python/tests/crossref.py | 35 | 
2 files changed, 127 insertions, 33 deletions
| diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py index 3c6ba3ef..99fed8d5 100644 --- a/python/fatcat/crossref_importer.py +++ b/python/fatcat/crossref_importer.py @@ -50,24 +50,38 @@ class FatcatCrossrefImporter(FatcatImporter):              return None          # contribs -        contribs = [] -        for i, am in enumerate(obj['author']): -            creator_id = None -            if 'ORCID' in am.keys(): -                creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) -            # Sorry humans :( -            if am.get('given') and am.get('family'): -                raw_name = "{} {}".format(am['given'], am['family']) -            elif am.get('family'): -                raw_name = am['family'] -            else: -                # TODO: defaults back to a pseudo-null value -                raw_name = am.get('given', '<blank>') -            contribs.append(fatcat_client.ReleaseContrib( -                creator_id=creator_id, -                index=i+1, -                raw_name=raw_name, -                role="author")) +        def do_contribs(obj_list, ctype): +            contribs = [] +            for i, am in enumerate(obj_list): +                creator_id = None +                if 'ORCID' in am.keys(): +                    creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) +                # Sorry humans :( +                if am.get('given') and am.get('family'): +                    raw_name = "{} {}".format(am['given'], am['family']) +                elif am.get('family'): +                    raw_name = am['family'] +                else: +                    # TODO: defaults back to a pseudo-null value +                    raw_name = am.get('given', '<blank>') +                extra = None +                if ctype == "author": +                    index = i +                else: +                    index = None +                if am.get('affiliation'): +                    # note: affiliation => affiliations +                    extra = dict(affiliations=am.get('affiliation')) +                contribs.append(fatcat_client.ReleaseContrib( +                    creator_id=creator_id, +                    index=index, +                    raw_name=raw_name, +                    role=ctype, +                    extra=extra)) +            return contribs +        contribs = do_contribs(obj['author'], "author") +        contribs.extend(do_contribs(obj.get('editor', []), "editor")) +        contribs.extend(do_contribs(obj.get('translator', []), "translator"))          # container          issn = obj.get('ISSN', [None])[0] @@ -95,20 +109,39 @@ class FatcatCrossrefImporter(FatcatImporter):                      year = None              except:                  year = None -            extra = dict(crossref=rm) -            if rm.get('DOI') != None: +            extra = rm.copy() +            if rm.get('DOI'):                  extra['doi'] = rm.get('DOI').lower() +            key = rm.get('key') +            if key and key.startswith(obj['DOI'].upper()): +                key = key.replace(obj['DOI'].upper() + "-", '') +                key = key.replace(obj['DOI'].upper(), '') +            container_name = rm.get('volume-title') +            if not container_name: +                container_name = rm.get('journal-title') +            extra.pop('DOI', None) +            extra.pop('key', None) +            extra.pop('year', None) +            extra.pop('volume-name', None) +            extra.pop('journal-title', None) +            extra.pop('title', None) +            extra.pop('first-page', None) +            extra.pop('doi-asserted-by', None) +            if extra: +                extra = dict(crossref=extra) +            else: +                extra = None              refs.append(fatcat_client.ReleaseRef( -                index=i+1, +                index=i,                  # doing lookups would be a second import pass                  target_release_id=None, -                # unreliable for crossref: key=rm['key'].split('|')[-1], +                key=key,                  year=year, -                container_title=rm.get('volume-title'), +                container_name=container_name,                  title=rm.get('title'),                  locator=rm.get('first-page'),                  # TODO: just dump JSON somewhere here? -                extra=dict(crossref=rm))) +                extra=extra))          # abstracts          abstracts = [] @@ -117,14 +150,37 @@ class FatcatCrossrefImporter(FatcatImporter):                  mimetype="application/xml+jats",                  content=obj.get('abstract'))) -        # release -        extra = dict(crossref={ -            # TODO: if exsits: group_title, subtitle, isPreprintOf -            'links': obj.get('link', []), -            'subject': obj.get('subject'), -            'type': obj['type'], -            'license': obj.get('license', [dict(URL=None)])[0]['URL'] or None, -            'alternative-id': obj.get('alternative-id', [])}) +        # extra fields +        extra = dict() +        for key in ('subject', 'type', 'license', 'alternative-id', +                'container-title', 'original-title', 'subtitle', 'archive', +                'funder', 'group-title'): +            val = obj.get(key) +            if val: +                extra[key] = val +        if 'license' in extra and extra['license']: +            for i in range(len(extra['license'])): +                if 'start' in extra['license'][i]: +                    extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] +        if len(obj['title']) > 1: +            extra['other-titles'] = obj['title'][1:] +        extra['is_kept'] = len(obj.get('archive', [])) > 0 + +        # ISBN +        isbn13 = None +        for raw in obj.get('ISBN', []): +            # TODO: convert if not ISBN-13 format +            if len(raw) == 17: +                isbn13 = raw +                break + +        # release status +        if obj['type'] in ('journal-article', 'conference-proceeding', 'book', +                'dissertation', 'book-chapter'): +            release_status = "published" +        else: +            # unknown +            release_status = None          # external identifiers          extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) @@ -135,8 +191,11 @@ class FatcatCrossrefImporter(FatcatImporter):              contribs=contribs,              refs=refs,              container_id=container_id, +            publisher=publisher,              release_type=obj['type'], +            release_status=release_status,              doi=obj['DOI'].lower(), +            isbn13=isbn13,              core_id=extids['core_id'],              pmid=extids['pmid'],              pmcid=extids['pmcid'], @@ -146,7 +205,7 @@ class FatcatCrossrefImporter(FatcatImporter):              volume=obj.get('volume'),              pages=obj.get('page'),              abstracts=abstracts, -            extra=extra) +            extra=dict(crossref=extra))          return (re, ce)      def create_row(self, row, editgroup=None): diff --git a/python/tests/crossref.py b/python/tests/crossref.py index e9814da2..59be9886 100644 --- a/python/tests/crossref.py +++ b/python/tests/crossref.py @@ -1,4 +1,5 @@ +import json  import pytest  from fatcat.crossref_importer import FatcatCrossrefImporter @@ -20,3 +21,37 @@ def test_crossref_importer_create(crossref_importer):      crossref_importer.create_containers = True      with open('tests/files/crossref-works.2018-01-21.badsample.json', 'r') as f:          crossref_importer.process_source(f) + +def test_crossref_dict_parse(crossref_importer): +    with open('tests/files/crossref-works.single.json', 'r') as f: +        # not a single line +        raw = json.loads(f.read()) +        (r, c) = crossref_importer.parse_crossref_dict(raw) +        extra = r.extra['crossref'] +        assert r.title == "Renormalized perturbation theory by the moment method for degenerate states: Anharmonic oscillators" +        assert r.doi == "10.1002/(sici)1097-461x(1998)66:4<261::aid-qua1>3.0.co;2-t" +        assert r.publisher == "Wiley-Blackwell" +        print(extra) +        assert extra['container-title'] == ["International Journal of Quantum Chemistry"] +        assert r.release_type == "journal-article" +        assert r.release_status == "published" +        assert r.isbn13 == "978-3-16-148410-0" +        assert 'subtitle' not in extra +        assert 'archive' not in extra +        assert 'funder' not in extra +        assert len(r.contribs) == 5 +        assert r.contribs[0].raw_name == "Marcelo D. Radicioni" +        assert r.contribs[0].index == 0 +        assert r.contribs[1].extra['affiliations'] == ["Some University"] +        assert r.contribs[1].role == "author" +        assert r.contribs[3].role == "editor" +        assert r.contribs[3].index is None +        assert r.contribs[4].role == "translator" +        assert r.contribs[4].index is None +        assert len(r.refs) == 25 +        assert r.refs[0].key == "BIB1" +        assert r.refs[0].year == 1972 +        assert r.refs[0].locator == "1734" +        assert r.refs[0].container_name == "J. Chem. Phys." +        assert r.refs[0].extra['crossref'] == {"volume": "57", "author": "Swenson", "doi": "10.1063/1.1678462"} +        assert r.refs[3].container_name == "Large Order Perturbation Theory and Summation Methods in Quantum Mechanics, Lecture Notes in Chemistry" | 
