From 31d1a6a713d177990609767d508209ced19ca396 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 2 Nov 2021 18:14:59 -0700 Subject: fmt (black): fatcat_tools/ --- python/fatcat_tools/importers/crossref.py | 413 ++++++++++++++++++------------ 1 file changed, 246 insertions(+), 167 deletions(-) (limited to 'python/fatcat_tools/importers/crossref.py') diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index fd6936a4..606d4bb1 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,4 +1,3 @@ - import datetime import sqlite3 from typing import Any, Dict, Optional @@ -13,30 +12,30 @@ from .common import EntityImporter, clean # Can get a list of Crossref types (with counts) via API: # https://api.crossref.org/works?rows=0&facet=type-name:* CROSSREF_TYPE_MAP: Dict[str, Optional[str]] = { - 'book': 'book', - 'book-chapter': 'chapter', - 'book-part': 'chapter', - 'book-section': 'chapter', - 'component': 'component', - 'dataset': 'dataset', - 'dissertation': 'thesis', - 'edited-book': 'book', - 'journal-article': 'article-journal', - 'monograph': 'book', - 'other': None, - 'peer-review': 'peer_review', - 'posted-content': 'post', - 'proceedings-article': 'paper-conference', - 'reference-book': 'book', - 'reference-entry': 'entry', - 'report': 'report', - 'standard': 'standard', + "book": "book", + "book-chapter": "chapter", + "book-part": "chapter", + "book-section": "chapter", + "component": "component", + "dataset": "dataset", + "dissertation": "thesis", + "edited-book": "book", + "journal-article": "article-journal", + "monograph": "book", + "other": None, + "peer-review": "peer_review", + "posted-content": "post", + "proceedings-article": "paper-conference", + "reference-book": "book", + "reference-entry": "entry", + "report": "report", + "standard": "standard", } CONTAINER_TYPE_MAP: Dict[str, str] = { - 'article-journal': 'journal', - 'paper-conference': 'conference', - 'book': 'book-series', + "article-journal": "journal", + "paper-conference": "conference", + "book": "book-series", } # These are based, informally, on sorting the most popular licenses found in @@ -90,29 +89,41 @@ LICENSE_SLUG_MAP: Dict[str, str] = { "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", } + def lookup_license_slug(raw: str) -> Optional[str]: if not raw: return None - raw = raw.strip().replace('http://', '//').replace('https://', '//') - if 'creativecommons.org' in raw.lower(): + raw = raw.strip().replace("http://", "//").replace("https://", "//") + if "creativecommons.org" in raw.lower(): raw = raw.lower() - raw = raw.replace('/legalcode', '/').replace('/uk', '') - if not raw.endswith('/'): - raw = raw + '/' + raw = raw.replace("/legalcode", "/").replace("/uk", "") + if not raw.endswith("/"): + raw = raw + "/" return LICENSE_SLUG_MAP.get(raw) + def test_lookup_license_slug(): assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" - assert lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") == "CC-BY" - assert lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") == "CC-0" + assert ( + lookup_license_slug("http://creativecommons.org/licenses/by/2.0/uk/legalcode") + == "CC-BY" + ) + assert ( + lookup_license_slug("https://creativecommons.org/publicdomain/zero/1.0/legalcode") + == "CC-0" + ) assert lookup_license_slug("http://creativecommons.org/licenses/by/4.0") == "CC-BY" - assert 
lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") == "CC-BY-NC-SA" + assert ( + lookup_license_slug("https://creativecommons.org/licenses/by-nc-sa/4.0/") + == "CC-BY-NC-SA" + ) assert lookup_license_slug("https://www.ametsoc.org/PUBSReuseLicenses") == "AMETSOC" assert lookup_license_slug("https://www.amec.org/PUBSReuseLicenses") is None assert lookup_license_slug("") is None assert lookup_license_slug(None) is None + class CrossrefImporter(EntityImporter): """ Importer for Crossref metadata. @@ -124,18 +135,22 @@ class CrossrefImporter(EntityImporter): def __init__(self, api, issn_map_file, **kwargs): - eg_desc: Optional[str] = kwargs.get('editgroup_description', - "Automated import of Crossref DOI metadata, harvested from REST API") - eg_extra: Optional[dict] = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.CrossrefImporter') - super().__init__(api, + eg_desc: Optional[str] = kwargs.get( + "editgroup_description", + "Automated import of Crossref DOI metadata, harvested from REST API", + ) + eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") + super().__init__( + api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, - **kwargs) + **kwargs + ) - self.create_containers: bool = kwargs.get('create_containers', True) - extid_map_file = kwargs.get('extid_map_file') + self.create_containers: bool = kwargs.get("create_containers", True) + extid_map_file = kwargs.get("extid_map_file") self.extid_map_db: Optional[Any] = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) @@ -148,12 +163,27 @@ class CrossrefImporter(EntityImporter): def lookup_ext_ids(self, doi: str) -> Optional[Any]: if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", [doi.lower()] + ).fetchone() if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) - row = [str(cell or '') or None for cell in row] + return dict( + core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None, + ) + row = [str(cell or "") or None for cell in row] return dict( core_id=row[0], pmid=row[1], @@ -173,17 +203,17 @@ class CrossrefImporter(EntityImporter): return CONTAINER_TYPE_MAP.get(crossref_type) def want(self, obj: Dict[str, Any]) -> bool: - if not obj.get('title'): - self.counts['skip-blank-title'] += 1 + if not obj.get("title"): + self.counts["skip-blank-title"] += 1 return False # these are pre-registered DOIs before the actual record is ready # title is a list of titles - titles = obj.get('title') + titles = obj.get("title") if titles is not None and titles[0].strip().lower() in [ - "OUP accepted manuscript".lower(), - ]: - self.counts['skip-stub-title'] += 1 + "OUP accepted manuscript".lower(), + ]: + self.counts["skip-stub-title"] += 1 return False # do most of these checks in-line below @@ -197,86 +227,105 @@ class CrossrefImporter(EntityImporter): # Ways to be out of scope (provisionally) # journal-issue and journal-volume map to None, but allowed for now - if obj.get('type') in (None, 'journal', 'proceedings', - 'standard-series', 'report-series', 'book-series', 'book-set', - 'book-track', 'proceedings-series'): - self.counts['skip-release-type'] += 1 + if obj.get("type") in ( + None, + "journal", + "proceedings", + "standard-series", + "report-series", + "book-series", + "book-set", + "book-track", + "proceedings-series", + ): + self.counts["skip-release-type"] += 1 return None # Do require the 'title' keys to exist, as release entities do - if ('title' not in obj) or (not obj['title']): - self.counts['skip-blank-title'] += 1 + if ("title" not in obj) or (not obj["title"]): + self.counts["skip-blank-title"] += 1 return None - release_type = self.map_release_type(obj['type']) + release_type = self.map_release_type(obj["type"]) # contribs def do_contribs(obj_list, ctype): contribs = [] for i, am in enumerate(obj_list): creator_id = None - if 'ORCID' in am.keys(): - creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) + if "ORCID" in am.keys(): + creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1]) # Sorry humans :( - if am.get('given') and am.get('family'): - raw_name = "{} {}".format(am['given'], am['family']) - elif am.get('family'): - raw_name = am['family'] + if am.get("given") and am.get("family"): + raw_name = "{} {}".format(am["given"], am["family"]) + elif am.get("family"): + raw_name = am["family"] else: # TODO: can end up empty - raw_name = am.get('name') or am.get('given') + raw_name = am.get("name") or am.get("given") extra = dict() if ctype == "author": index = i else: index = None raw_affiliation = None - if am.get('affiliation'): - if len(am.get('affiliation')) > 0: - raw_affiliation = am.get('affiliation')[0]['name'] - if len(am.get('affiliation')) > 1: + if am.get("affiliation"): + if len(am.get("affiliation")) > 0: + raw_affiliation = am.get("affiliation")[0]["name"] + if len(am.get("affiliation")) > 1: # note: affiliation => more_affiliations - extra['more_affiliations'] = [clean(a['name']) for a in am.get('affiliation')[1:]] - if am.get('sequence') and am.get('sequence') != "additional": - extra['seq'] = clean(am.get('sequence')) + extra["more_affiliations"] = [ + clean(a["name"]) for a in am.get("affiliation")[1:] + ] + if 
am.get("sequence") and am.get("sequence") != "additional": + extra["seq"] = clean(am.get("sequence")) if not extra: extra = None assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) - contribs.append(fatcat_openapi_client.ReleaseContrib( - creator_id=creator_id, - index=index, - raw_name=raw_name, - given_name=clean(am.get('given')), - surname=clean(am.get('family')), - raw_affiliation=clean(raw_affiliation), - role=ctype, - extra=extra)) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=index, + raw_name=raw_name, + given_name=clean(am.get("given")), + surname=clean(am.get("family")), + raw_affiliation=clean(raw_affiliation), + role=ctype, + extra=extra, + ) + ) return contribs - contribs = do_contribs(obj.get('author', []), "author") - contribs.extend(do_contribs(obj.get('editor', []), "editor")) - contribs.extend(do_contribs(obj.get('translator', []), "translator")) + + contribs = do_contribs(obj.get("author", []), "author") + contribs.extend(do_contribs(obj.get("editor", []), "editor")) + contribs.extend(do_contribs(obj.get("translator", []), "translator")) # container - issn = obj.get('ISSN', [None])[0] + issn = obj.get("ISSN", [None])[0] issnl = self.issn2issnl(issn) container_id = None if issnl: container_id = self.lookup_issnl(issnl) - publisher = clean(obj.get('publisher')) + publisher = clean(obj.get("publisher")) - container_name = obj.get('container-title') + container_name = obj.get("container-title") if container_name: container_name = clean(container_name[0], force_xml=True) if not container_name: container_name = None - if (container_id is None and self.create_containers and (issnl is not None) - and container_name): + if ( + container_id is None + and self.create_containers + and (issnl is not None) + and container_name + ): ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, publisher=publisher, container_type=self.map_container_type(release_type), - name=container_name) + name=container_name, + ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id @@ -284,21 +333,21 @@ class CrossrefImporter(EntityImporter): # license slug license_slug = None license_extra = [] - for lic in obj.get('license', []): - if lic['content-version'] not in ('vor', 'unspecified'): + for lic in obj.get("license", []): + if lic["content-version"] not in ("vor", "unspecified"): continue - slug = lookup_license_slug(lic['URL']) + slug = lookup_license_slug(lic["URL"]) if slug: license_slug = slug - if 'start' in lic: - lic['start'] = lic['start']['date-time'] + if "start" in lic: + lic["start"] = lic["start"]["date-time"] license_extra.append(lic) # references refs = [] - for i, rm in enumerate(obj.get('reference', [])): + for i, rm in enumerate(obj.get("reference", [])): try: - year: Optional[int] = int(rm.get('year')) + year: Optional[int] = int(rm.get("year")) # TODO: will need to update/config in the future! # NOTE: are there crossref works with year < 100? 
if year is not None: @@ -307,56 +356,78 @@ class CrossrefImporter(EntityImporter): except (TypeError, ValueError): year = None ref_extra: Dict[str, Any] = dict() - key = rm.get('key') - if key and key.startswith(obj['DOI'].upper()): - key = key.replace(obj['DOI'].upper() + "-", '') - key = key.replace(obj['DOI'].upper(), '') - ref_container_name = rm.get('volume-title') + key = rm.get("key") + if key and key.startswith(obj["DOI"].upper()): + key = key.replace(obj["DOI"].upper() + "-", "") + key = key.replace(obj["DOI"].upper(), "") + ref_container_name = rm.get("volume-title") if not ref_container_name: - ref_container_name = rm.get('journal-title') - elif rm.get('journal-title'): - ref_extra['journal-title'] = rm['journal-title'] - if rm.get('DOI'): - ref_extra['doi'] = rm.get('DOI').lower() - author = clean(rm.get('author')) + ref_container_name = rm.get("journal-title") + elif rm.get("journal-title"): + ref_extra["journal-title"] = rm["journal-title"] + if rm.get("DOI"): + ref_extra["doi"] = rm.get("DOI").lower() + author = clean(rm.get("author")) if author: - ref_extra['authors'] = [author] - for k in ('editor', 'edition', 'authority', 'version', 'genre', - 'url', 'event', 'issue', 'volume', 'date', 'accessed_date', - 'issued', 'page', 'medium', 'collection_title', 'chapter_number', - 'unstructured', 'series-title', 'volume-title'): + ref_extra["authors"] = [author] + for k in ( + "editor", + "edition", + "authority", + "version", + "genre", + "url", + "event", + "issue", + "volume", + "date", + "accessed_date", + "issued", + "page", + "medium", + "collection_title", + "chapter_number", + "unstructured", + "series-title", + "volume-title", + ): if clean(rm.get(k)): ref_extra[k] = clean(rm[k]) if not ref_extra: ref_extra = None - refs.append(fatcat_openapi_client.ReleaseRef( - index=i, - # doing lookups would be a second import pass - target_release_id=None, - key=key, - year=year, - container_name=clean(ref_container_name), - title=clean(rm.get('article-title')), - locator=clean(rm.get('first-page')), - # TODO: just dump JSON somewhere here? - extra=ref_extra)) + refs.append( + fatcat_openapi_client.ReleaseRef( + index=i, + # doing lookups would be a second import pass + target_release_id=None, + key=key, + year=year, + container_name=clean(ref_container_name), + title=clean(rm.get("article-title")), + locator=clean(rm.get("first-page")), + # TODO: just dump JSON somewhere here? 
+ extra=ref_extra, + ) + ) # abstracts abstracts = [] - abstract = clean(obj.get('abstract')) + abstract = clean(obj.get("abstract")) if abstract and len(abstract) > 10: - abstracts.append(fatcat_openapi_client.ReleaseAbstract( - mimetype="application/xml+jats", - content=abstract)) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="application/xml+jats", content=abstract + ) + ) # extra fields extra = dict() extra_crossref = dict() # top-level extra keys if not container_id: - if obj.get('container-title'): - extra['container_name'] = container_name - for key in ('group-title'): + if obj.get("container-title"): + extra["container_name"] = container_name + for key in "group-title": val = obj.get(key) if val: if type(val) == list: @@ -368,7 +439,7 @@ class CrossrefImporter(EntityImporter): else: extra[key] = val # crossref-nested extra keys - for key in ('subject', 'type', 'alternative-id', 'archive', 'funder'): + for key in ("subject", "type", "alternative-id", "archive", "funder"): val = obj.get(key) if val: if type(val) == str: @@ -376,46 +447,51 @@ class CrossrefImporter(EntityImporter): else: extra_crossref[key] = val if license_extra: - extra_crossref['license'] = license_extra + extra_crossref["license"] = license_extra - if len(obj['title']) > 1: - aliases = [clean(t) for t in obj['title'][1:]] + if len(obj["title"]) > 1: + aliases = [clean(t) for t in obj["title"][1:]] aliases = [t for t in aliases if t] if aliases: - extra['aliases'] = aliases + extra["aliases"] = aliases # ISBN isbn13 = None - for raw in obj.get('ISBN', []): + for raw in obj.get("ISBN", []): # TODO: convert if not ISBN-13 format if len(raw) == 17: isbn13 = raw break # release status - if obj['type'] in ('journal-article', 'conference-proceeding', 'book', - 'dissertation', 'book-chapter'): + if obj["type"] in ( + "journal-article", + "conference-proceeding", + "book", + "dissertation", + "book-chapter", + ): release_stage = "published" else: # unknown release_stage = None # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj['DOI'].lower()) + extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) # filter out unreasonably huge releases if len(abstracts) > 100: - self.counts['skip-huge-abstracts'] += 1 + self.counts["skip-huge-abstracts"] += 1 return None if len(contribs) > 2000: - self.counts['skip-huge-contribs'] += 1 + self.counts["skip-huge-contribs"] += 1 return None if len(refs) > 5000: - self.counts['skip-huge-refs'] += 1 + self.counts["skip-huge-refs"] += 1 return None # release date parsing is amazingly complex - raw_date = obj['issued']['date-parts'][0] + raw_date = obj["issued"]["date-parts"][0] if not raw_date or not raw_date[0]: # got some NoneType, even though at least year is supposed to be set release_year = None @@ -429,28 +505,28 @@ class CrossrefImporter(EntityImporter): release_date = None original_title: Optional[str] = None - if obj.get('original-title'): - ot = obj.get('original-title') + if obj.get("original-title"): + ot = obj.get("original-title") if ot is not None: original_title = clean(ot[0], force_xml=True) title: Optional[str] = None - if obj.get('title'): - title = clean(obj.get('title')[0], force_xml=True) + if obj.get("title"): + title = clean(obj.get("title")[0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character - self.counts['skip-blank-title'] += 1 + self.counts["skip-blank-title"] += 1 return None subtitle = None - if obj.get('subtitle'): - subtitle = 
clean(obj.get('subtitle')[0], force_xml=True) + if obj.get("subtitle"): + subtitle = clean(obj.get("subtitle")[0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None if extra_crossref: - extra['crossref'] = extra_crossref + extra["crossref"] = extra_crossref if not extra: extra = None @@ -466,19 +542,19 @@ class CrossrefImporter(EntityImporter): release_year=release_year, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=obj['DOI'].lower(), - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], + doi=obj["DOI"].lower(), + pmid=extids["pmid"], + pmcid=extids["pmcid"], + wikidata_qid=extids["wikidata_qid"], isbn13=isbn13, - core=extids['core_id'], - arxiv=extids['arxiv_id'], - jstor=extids['jstor_id'], + core=extids["core_id"], + arxiv=extids["arxiv_id"], + jstor=extids["jstor_id"], ), - volume=clean(obj.get('volume')), - issue=clean(obj.get('issue')), - pages=clean(obj.get('page')), - language=clean(obj.get('language')), + volume=clean(obj.get("volume")), + issue=clean(obj.get("issue")), + pages=clean(obj.get("page")), + language=clean(obj.get("language")), license_slug=license_slug, extra=extra, abstracts=abstracts, @@ -500,14 +576,17 @@ class CrossrefImporter(EntityImporter): # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False return True def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) -- cgit v1.2.3