From 9ab88508ed710de9db06a27436042ac30a70676e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 22 Jan 2019 16:27:27 -0800
Subject: crossref importer updates

---
 python/fatcat_tools/importers/crossref.py     | 97 +++++++++++++++++++++------
 python/tests/api_releases.py                  |  2 +-
 python/tests/files/crossref-works.single.json |  2 +-
 python/tests/import_crossref.py               |  3 +-
 4 files changed, 82 insertions(+), 22 deletions(-)

diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 4b9199cd..8953dd82 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -32,6 +32,31 @@ CROSSREF_TYPE_MAP = {
     'standard': 'standard',
 }
 
+CONTAINER_TYPE_MAP = {
+    'article-journal': 'journal',
+    'paper-conference': 'conference',
+    'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+    "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # http://www.springer.com/tdm doesn't seem like a license
+}
+
 class CrossrefImporter(FatcatImporter):
     """
     Importer for Crossref metadata.
@@ -66,17 +91,21 @@ class CrossrefImporter(FatcatImporter):
 
     def lookup_ext_ids(self, doi):
         if self.extid_map_db is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
         row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
             [doi.lower()]).fetchone()
         if row is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
         row = [str(cell or '') or None for cell in row]
         return dict(
             core_id=row[0],
             pmid=row[1],
             pmcid=row[2],
-            wikidata_qid=row[3])
+            wikidata_qid=row[3],
+            # TODO:
+            arxiv_id=None,
+            jstor_id=None,
+        )
 
     def map_release_type(self, crossref_type):
         return CROSSREF_TYPE_MAP.get(crossref_type)
@@ -98,6 +127,8 @@ class CrossrefImporter(FatcatImporter):
                 'book-track', 'proceedings-series'):
             return None
 
+        release_type = self.map_release_type(obj['type'])
+
         # lookup existing DOI
         existing_release = None
         if self.check_existing:
@@ -132,9 +163,13 @@ class CrossrefImporter(FatcatImporter):
                 index = i
             else:
                 index = None
+            raw_affiliation = None
             if am.get('affiliation'):
-                # note: affiliation => affiliations
-                extra['affiliations'] = am.get('affiliation')
+                if len(am.get('affiliation')) > 0:
+                    raw_affiliation = am.get('affiliation')[0]['name']
+                if len(am.get('affiliation')) > 1:
+                    # note: affiliation => affiliations
+                    extra['affiliations'] = [a['name'] for a in am.get('affiliation')[1:]]
             if am.get('sequence') and am.get('sequence') != "additional":
                 extra['sequence'] = am.get('sequence')
             if not extra:
@@ -144,6 +179,7 @@ class CrossrefImporter(FatcatImporter):
                 creator_id=creator_id,
                 index=index,
                 raw_name=raw_name,
+                raw_affiliation=raw_affiliation,
                 role=ctype,
                 extra=extra))
         return contribs
@@ -165,8 +201,19 @@ class CrossrefImporter(FatcatImporter):
             ce = fatcat_client.ContainerEntity(
                 issnl=issnl,
                 publisher=publisher,
+                container_type=CONTAINER_TYPE_MAP.get(release_type),
                 name=obj['container-title'][0])
 
+        # license slug
+        license_slug = None
+        for l in obj.get('license', []):
+            if l['content-version'] not in ('vor', 'unspecified'):
+                continue
+            slug = LICENSE_SLUG_MAP.get(l['URL'])
+            if slug:
+                license_slug = slug
+                break
+
         # references
         refs = []
         for i, rm in enumerate(obj.get('reference', [])):
@@ -188,10 +235,14 @@ class CrossrefImporter(FatcatImporter):
             container_name = rm.get('volume-title')
             if not container_name:
                 container_name = rm.get('journal-title')
+            ref_locator = rm.get('first-page')
+            ref_title = rm.get('title')
+            if extra.get('DOI'):
+                extra['doi'] = extra['DOI']
             extra.pop('DOI', None)
             extra.pop('key', None)
             extra.pop('year', None)
-            extra.pop('volume-name', None)
+            extra.pop('volume-title', None)
             extra.pop('journal-title', None)
             extra.pop('title', None)
             extra.pop('first-page', None)
@@ -207,8 +258,8 @@ class CrossrefImporter(FatcatImporter):
                 key=key,
                 year=year,
                 container_name=container_name,
-                title=rm.get('title'),
-                locator=rm.get('first-page'),
+                title=ref_title,
+                locator=ref_locator,
                 # TODO: just dump JSON somewhere here?
                 extra=extra))
 
@@ -277,26 +328,32 @@ class CrossrefImporter(FatcatImporter):
 
         re = fatcat_client.ReleaseEntity(
             work_id=None,
-            title=obj.get('title', [None])[0],
-            contribs=contribs,
-            refs=refs,
             container_id=container_id,
-            publisher=publisher,
-            release_type=self.map_release_type(obj['type']),
+            title=obj.get('title', [None])[0],
+            original_title=obj.get('original-title', [None])[0],
+            release_type=release_type,
             release_status=release_status,
+            release_date=release_date,
+            release_year=release_year,
+            publisher=publisher,
             doi=obj['DOI'].lower(),
-            isbn13=isbn13,
-            core_id=extids['core_id'],
             pmid=extids['pmid'],
             pmcid=extids['pmcid'],
             wikidata_qid=extids['wikidata_qid'],
-            release_date=release_date,
-            release_year=release_year,
-            issue=obj.get('issue'),
+            isbn13=isbn13,
+            core_id=extids['core_id'],
+            arxiv_id=extids['arxiv_id'],
+            jstor_id=extids['jstor_id'],
             volume=obj.get('volume'),
+            issue=obj.get('issue'),
             pages=obj.get('page'),
+            language=None,  # crossref doesn't supply language info
+            license_slug=license_slug,
+            extra=dict(crossref=extra),
             abstracts=abstracts,
-            extra=dict(crossref=extra))
+            contribs=contribs,
+            refs=refs,
+        )
         return (re, ce)
 
     def create_row(self, row, editgroup_id=None):
@@ -304,6 +361,8 @@ class CrossrefImporter(FatcatImporter):
             return
         obj = json.loads(row)
         entities = self.parse_crossref_dict(obj)
+        # XXX:
+        print(entities)
         if entities is not None:
             (re, ce) = entities
             if ce is not None:
diff --git a/python/tests/api_releases.py b/python/tests/api_releases.py
index d5b31ad3..36774745 100644
--- a/python/tests/api_releases.py
+++ b/python/tests/api_releases.py
@@ -19,7 +19,7 @@ def test_release(api):
         original_title="оригинальное название",
         release_type="post-weblog",
         release_status="pre-print",
-        #release_date=datetime.datetime.utcnow(),
+        # XXX: release_date=datetime.datetime.utcnow(),
         release_year=2015,
         doi="10.5555/12345678",
         pmid="12345",
diff --git a/python/tests/files/crossref-works.single.json b/python/tests/files/crossref-works.single.json
index 2af2b358..e3d2e05c 100644
--- a/python/tests/files/crossref-works.single.json
+++ b/python/tests/files/crossref-works.single.json
@@ -84,7 +84,7 @@
       {
         "given": "Carlos G.",
         "family": "Diaz",
-        "affiliation": ["Some University"]
+        "affiliation": [{"name": "Some University"}, {"name": "Some Department"}]
       },
       {
         "given": "Francisco M.",
diff --git a/python/tests/import_crossref.py b/python/tests/import_crossref.py
index e2ca6122..89ce9fc9 100644
--- a/python/tests/import_crossref.py
+++ b/python/tests/import_crossref.py
@@ -61,7 +61,8 @@ def test_crossref_dict_parse(crossref_importer):
     assert len(r.contribs) == 5
     assert r.contribs[0].raw_name == "Marcelo D. Radicioni"
     assert r.contribs[0].index == 0
-    assert r.contribs[1].extra['affiliations'] == ["Some University"]
+    assert r.contribs[1].raw_affiliation == "Some University"
+    assert r.contribs[1].extra['affiliations'] == ["Some Department"]
     assert r.contribs[1].role == "author"
     assert r.contribs[3].role == "editor"
     assert r.contribs[3].index is None
--
cgit v1.2.3