Diffstat (limited to 'python/fatcat_tools/importers/crossref.py')
-rw-r--r--   python/fatcat_tools/importers/crossref.py   263
1 file changed, 154 insertions(+), 109 deletions(-)
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 6365e491..00c719f1 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -6,7 +6,7 @@ import datetime
 import itertools
 import subprocess
 import fatcat_client
-from .common import FatcatImporter
+from .common import EntityImporter, clean
 
 # The docs/guide should be the cannonical home for these mappings; update there
 
@@ -32,7 +32,32 @@ CROSSREF_TYPE_MAP = {
     'standard': 'standard',
 }
 
-class CrossrefImporter(FatcatImporter):
+CONTAINER_TYPE_MAP = {
+    'article-journal': 'journal',
+    'paper-conference': 'conference',
+    'book': 'book-series',
+}
+
+# TODO:
+LICENSE_SLUG_MAP = {
+    "http://creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "http://creativecommons.org/licenses/by-sa/3.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "http://creativecommons.org/licenses/by-nd/3.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "http://creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "http://creativecommons.org/licenses/by-nc-sa/3.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "http://creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "http://creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "http://www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0",
+    # http://onlinelibrary.wiley.com/termsAndConditions doesn't seem like a license
+    # http://www.springer.com/tdm doesn't seem like a license
+}
+
+class CrossrefImporter(EntityImporter):
     """
     Importer for Crossref metadata.
 
@@ -51,9 +76,9 @@ class CrossrefImporter(FatcatImporter):
             issn_map_file=issn_map_file,
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra)
+
+        self.create_containers = kwargs.get('create_containers')
         extid_map_file = kwargs.get('extid_map_file')
-        create_containers = kwargs.get('create_containers')
-        check_existing = kwargs.get('check_existing')
         self.extid_map_db = None
         if extid_map_file:
             db_uri = "file:{}?mode=ro".format(extid_map_file)
@@ -61,36 +86,46 @@ class CrossrefImporter(FatcatImporter):
             self.extid_map_db = sqlite3.connect(db_uri, uri=True)
         else:
             print("Not using external ID map")
-        self.create_containers = create_containers
-        self.check_existing = check_existing
+
+        self.read_issn_map_file(issn_map_file)
 
     def lookup_ext_ids(self, doi):
         if self.extid_map_db is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
         row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
             [doi.lower()]).fetchone()
         if row is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None)
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
         row = [str(cell or '') or None for cell in row]
         return dict(
             core_id=row[0],
             pmid=row[1],
             pmcid=row[2],
-            wikidata_qid=row[3])
+            wikidata_qid=row[3],
+            # TODO:
+            arxiv_id=None,
+            jstor_id=None,
+        )
 
     def map_release_type(self, crossref_type):
         return CROSSREF_TYPE_MAP.get(crossref_type)
 
-    def parse_crossref_dict(self, obj):
+    def map_container_type(self, crossref_type):
+        return CONTAINER_TYPE_MAP.get(crossref_type)
+
+    def want(self, obj):
+        if not obj.get('title'):
+            return False
+
+        # do most of these checks in-line below
+        return True
+
+    def parse_record(self, obj):
         """
         obj is a python dict (parsed from json).
         returns a ReleaseEntity
         """
-        # Do require the 'title' keys to exsit, as release entities do
-        if (not 'title' in obj) or (not obj['title']):
-            return None
-
         # Ways to be out of scope (provisionally)
         # journal-issue and journal-volume map to None, but allowed for now
         if obj.get('type') in (None, 'journal', 'proceedings',
@@ -98,20 +133,12 @@ class CrossrefImporter(FatcatImporter):
                 'book-track', 'proceedings-series'):
             return None
 
-        # lookup existing DOI
-        existing_release = None
-        if self.check_existing:
-            try:
-                existing_release = self.api.lookup_release(doi=obj['DOI'].lower())
-            except fatcat_client.rest.ApiException as err:
-                if err.status != 404:
-                    raise err
-
-        # eventually we'll want to support "updates", but for now just skip if
-        # entity already exists
-        if existing_release:
+        # Do require the 'title' keys to exsit, as release entities do
+        if (not 'title' in obj) or (not obj['title']):
             return None
 
+        release_type = self.map_release_type(obj['type'])
+
         # contribs
         def do_contribs(obj_list, ctype):
             contribs = []
@@ -132,18 +159,23 @@ class CrossrefImporter(FatcatImporter):
                     index = i
                 else:
                     index = None
+                raw_affiliation = None
                 if am.get('affiliation'):
-                    # note: affiliation => affiliations
-                    extra['affiliations'] = am.get('affiliation')
+                    if len(am.get('affiliation')) > 0:
+                        raw_affiliation = am.get('affiliation')[0]['name']
+                    if len(am.get('affiliation')) > 1:
+                        # note: affiliation => more_affiliations
+                        extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]]
                 if am.get('sequence') and am.get('sequence') != "additional":
-                    extra['sequence'] = am.get('sequence')
+                    extra['seq'] = clean(am.get('sequence'))
                 if not extra:
                     extra = None
                 assert ctype in ("author", "editor", "translator")
                 contribs.append(fatcat_client.ReleaseContrib(
                     creator_id=creator_id,
                     index=index,
-                    raw_name=raw_name,
+                    raw_name=clean(raw_name),
+                    raw_affiliation=clean(raw_affiliation),
                     role=ctype,
                     extra=extra))
             return contribs
@@ -159,28 +191,40 @@ class CrossrefImporter(FatcatImporter):
             container_id = self.lookup_issnl(issnl)
         publisher = obj.get('publisher')
 
-        ce = None
         if (container_id is None and self.create_containers and
                 (issnl is not None) and obj.get('container-title') and
                 len(obj['container-title']) > 0):
             ce = fatcat_client.ContainerEntity(
                 issnl=issnl,
-                publisher=publisher,
-                name=obj['container-title'][0])
+                publisher=clean(publisher),
+                container_type=self.map_container_type(release_type),
+                name=clean(obj['container-title'][0], force_xml=True))
+            ce_edit = self.create_container(ce)
+            container_id = ce_edit.ident
+
+        # license slug
+        license_slug = None
+        license_extra = []
+        for l in obj.get('license', []):
+            if l['content-version'] not in ('vor', 'unspecified'):
+                continue
+            slug = LICENSE_SLUG_MAP.get(l['URL'])
+            if slug:
+                license_slug = slug
+            if 'start' in l:
+                l['start'] = l['start']['date-time']
+            license_extra.append(l)
 
         # references
         refs = []
         for i, rm in enumerate(obj.get('reference', [])):
             try:
                 year = int(rm.get('year'))
-                # NOTE: will need to update/config in the future!
+                # TODO: will need to update/config in the future!
                 # NOTE: are there crossref works with year < 100?
                 if year > 2025 or year < 100:
                     year = None
             except:
                 year = None
-            extra = rm.copy()
-            if rm.get('DOI'):
-                extra['doi'] = rm.get('DOI').lower()
             key = rm.get('key')
             if key and key.startswith(obj['DOI'].upper()):
                 key = key.replace(obj['DOI'].upper() + "-", '')
@@ -188,14 +232,18 @@ class CrossrefImporter(FatcatImporter):
             container_name = rm.get('volume-title')
             if not container_name:
                 container_name = rm.get('journal-title')
-            extra.pop('DOI', None)
-            extra.pop('key', None)
-            extra.pop('year', None)
-            extra.pop('volume-name', None)
-            extra.pop('journal-title', None)
-            extra.pop('title', None)
-            extra.pop('first-page', None)
-            extra.pop('doi-asserted-by', None)
+            elif rm.get('journal-title'):
+                extra['journal-title'] = rm['journal-title']
+            extra = dict()
+            if rm.get('DOI'):
+                extra['doi'] = rm.get('DOI').lower()
+            # TODO: what fields here? CSL citation stuff
+            for k in ('author', 'editor', 'edition', 'authority', 'version',
+                    'genre', 'url', 'event', 'issue', 'volume', 'date',
+                    'accessed_date', 'issued', 'page', 'medium',
+                    'collection_title', 'chapter_number'):
+                if clean(rm.get(k)):
+                    extra[k] = clean(rm[k])
             if extra:
                 extra = dict(crossref=extra)
             else:
@@ -206,9 +254,9 @@ class CrossrefImporter(FatcatImporter):
                 target_release_id=None,
                 key=key,
                 year=year,
-                container_name=container_name,
-                title=rm.get('title'),
-                locator=rm.get('first-page'),
+                container_name=clean(container_name),
+                title=clean(rm.get('title')),
+                locator=clean(rm.get('first-page')),
                 # TODO: just dump JSON somewhere here?
                 extra=extra))
 
@@ -217,25 +265,24 @@ class CrossrefImporter(FatcatImporter):
         if obj.get('abstract') != None:
             abstracts.append(fatcat_client.ReleaseEntityAbstracts(
                 mimetype="application/xml+jats",
-                content=obj.get('abstract')))
+                content=clean(obj.get('abstract'))))
 
         # extra fields
         extra = dict()
-        for key in ('subject', 'type', 'license', 'alternative-id',
-                'container-title', 'original-title', 'subtitle', 'archive',
-                'funder', 'group-title'):
-            # TODO: unpack "container-title" array
+        for key in ('subject', 'type', 'alternative-id', 'container-title',
+                'subtitle', 'archive', 'funder', 'group-title'):
+            # TODO: unpack "container-title" array?
             val = obj.get(key)
             if val:
-                extra[key] = val
-        if 'license' in extra and extra['license']:
-            for i in range(len(extra['license'])):
-                if 'start' in extra['license'][i]:
-                    extra['license'][i]['start'] = extra['license'][i]['start']['date-time']
+                if type(val) == str:
+                    extra[key] = clean(val)
+                else:
+                    extra[key] = val
+        if license_extra:
+            extra['license'] = license_extra
+
         if len(obj['title']) > 1:
-            extra['other-titles'] = obj['title'][1:]
-        # TODO: this should be top-level
-        extra['is_kept'] = len(obj.get('archive', [])) > 0
+            extra['other-titles'] = [clean(t) for t in obj['title'][1:]]
 
         # ISBN
         isbn13 = None
@@ -277,59 +324,57 @@ class CrossrefImporter(FatcatImporter):
         re = fatcat_client.ReleaseEntity(
             work_id=None,
-            title=obj.get('title', [None])[0],
-            contribs=contribs,
-            refs=refs,
             container_id=container_id,
-            publisher=publisher,
-            release_type=self.map_release_type(obj['type']),
+            title=clean(obj.get('title', [None])[0], force_xml=True),
+            original_title=clean(obj.get('original-title', [None])[0]),
+            release_type=release_type,
             release_status=release_status,
+            release_date=release_date,
+            release_year=release_year,
+            publisher=clean(publisher),
             doi=obj['DOI'].lower(),
-            isbn13=isbn13,
-            core_id=extids['core_id'],
             pmid=extids['pmid'],
             pmcid=extids['pmcid'],
             wikidata_qid=extids['wikidata_qid'],
-            release_date=release_date,
-            release_year=release_year,
-            issue=obj.get('issue'),
-            volume=obj.get('volume'),
-            pages=obj.get('page'),
+            isbn13=isbn13,
+            core_id=extids['core_id'],
+            arxiv_id=extids['arxiv_id'],
+            jstor_id=extids['jstor_id'],
+            volume=clean(obj.get('volume')),
+            issue=clean(obj.get('issue')),
+            pages=clean(obj.get('page')),
+            language=None,  # crossref doesn't supply language info
+            license_slug=license_slug,
+            extra=dict(crossref=extra),
             abstracts=abstracts,
-            extra=dict(crossref=extra))
-        return (re, ce)
+            contribs=contribs,
+            refs=refs,
+        )
+        return re
+
+    def try_update(self, re):
+
+        # lookup existing DOI (don't need to try other ext idents for crossref)
+        existing = None
+        try:
+            existing = self.api.lookup_release(doi=re.doi)
+        except fatcat_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            # doesn't exist, need to update
+            return True
+
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if existing:
+            self.counts['exists'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        self.api.create_release_batch(batch,
+            autoaccept=True,
+            description=self.editgroup_description,
+            extra=json.dumps(self.editgroup_extra))
 
-    def create_row(self, row, editgroup_id=None):
-        if row is None:
-            return
-        obj = json.loads(row)
-        entities = self.parse_crossref_dict(obj)
-        if entities is not None:
-            (re, ce) = entities
-            if ce is not None:
-                container = self.api.create_container(ce, editgroup_id=editgroup_id)
-                re.container_id = container.ident
-                self._issnl_id_map[ce.issnl] = container.ident
-            self.api.create_release(re, editgroup_id=editgroup_id)
-            self.counts['insert'] += 1
-
-    def create_batch(self, batch):
-        """Current work/release pairing disallows batch creation of releases.
-        Could do batch work creation and then match against releases, but meh."""
-        release_batch = []
-        for row in batch:
-            if row is None:
-                continue
-            obj = json.loads(row)
-            entities = self.parse_crossref_dict(obj)
-            if entities is not None:
-                (re, ce) = entities
-                if ce is not None:
-                    ce_eg = self.api.create_editgroup(fatcat_client.Editgroup())
-                    container = self.api.create_container(ce, editgroup_id=ce_eg.editgroup_id)
-                    self.api.accept_editgroup(ce_eg.editgroup_id)
-                    re.container_id = container.ident
-                    self._issnl_id_map[ce.issnl] = container.ident
-                release_batch.append(re)
-        self.api.create_release_batch(release_batch, autoaccept="true")
-        self.counts['insert'] += len(release_batch)
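Orientation for reviewers: with this refactor, CrossrefImporter no longer drives the import loop itself (create_row/create_batch are removed) and instead implements the four EntityImporter hooks: want(), parse_record(), try_update(), and insert_batch(). The sketch below approximates the per-record flow those hooks imply; the driver function name and its batching details are illustrative assumptions, not the actual common.py implementation.

# Sketch only: approximates how an EntityImporter-style driver would call the
# four hooks for each Crossref JSON record. The function name and batch
# handling are assumptions for illustration; see
# python/fatcat_tools/importers/common.py for the real base class.
def push_record_sketch(importer, raw_record, batch, batch_size=50):
    # cheap filter first; e.g. Crossref records with no 'title' are skipped
    if not importer.want(raw_record):
        return
    # map the Crossref JSON dict to a fatcat ReleaseEntity (may return None
    # for out-of-scope record types)
    entity = importer.parse_record(raw_record)
    if entity is None:
        return
    # DOI lookup against the catalog; False means "already exists", skip it
    if not importer.try_update(entity):
        return
    # buffer entities and flush them as a single autoaccepted batch edit
    batch.append(entity)
    if len(batch) >= batch_size:
        importer.insert_batch(batch)
        batch.clear()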