From 31d1a6a713d177990609767d508209ced19ca396 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 2 Nov 2021 18:14:59 -0700 Subject: fmt (black): fatcat_tools/ --- python/fatcat_tools/importers/doaj_article.py | 178 ++++++++++++++------------ 1 file changed, 96 insertions(+), 82 deletions(-) (limited to 'python/fatcat_tools/importers/doaj_article.py') diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 1831c4cd..cd063337 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048 class DoajArticleImporter(EntityImporter): - - def __init__(self, - api, - issn_map_file, - **kwargs): + def __init__(self, api, issn_map_file, **kwargs): eg_desc = kwargs.get( - 'editgroup_description', - "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps" + "editgroup_description", + "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps", ) - eg_extra = kwargs.get('editgroup_extra', dict()) - eg_extra['agent'] = eg_extra.get('agent', - 'fatcat_tools.DoajArticleImporter') + eg_extra = kwargs.get("editgroup_extra", dict()) + eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter") # ensure default is to not do updates with this worker (override super() default) - kwargs['do_updates'] = kwargs.get("do_updates", False) - super().__init__(api, - issn_map_file=issn_map_file, - editgroup_description=eg_desc, - editgroup_extra=eg_extra, - **kwargs) + kwargs["do_updates"] = kwargs.get("do_updates", False) + super().__init__( + api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs, + ) self.this_year = datetime.datetime.now().year self.read_issn_map_file(issn_map_file) @@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter): } """ - if not obj or not isinstance(obj, dict) or 'bibjson' not in obj: - self.counts['skip-empty'] += 1 + if not obj or not isinstance(obj, dict) or "bibjson" not in obj: + self.counts["skip-empty"] += 1 return None - bibjson = obj['bibjson'] + bibjson = obj["bibjson"] - title = clean_str(bibjson.get('title'), force_xml=True) + title = clean_str(bibjson.get("title"), force_xml=True) if not title: - self.counts['skip-title'] += 1 + self.counts["skip-title"] += 1 return False - container_name = clean_str(bibjson['journal']['title']) + container_name = clean_str(bibjson["journal"]["title"]) container_id = None # NOTE: 'issns' not documented in API schema - for issn in bibjson['journal']['issns']: + for issn in bibjson["journal"]["issns"]: issnl = self.issn2issnl(issn) if issnl: container_id = self.lookup_issnl(self.issn2issnl(issn)) @@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter): container_name = None break - volume = clean_str(bibjson['journal'].get('volume')) + volume = clean_str(bibjson["journal"].get("volume")) # NOTE: this schema seems to use "number" as "issue number" - issue = clean_str(bibjson['journal'].get('number')) - publisher = clean_str(bibjson['journal'].get('publisher')) + issue = clean_str(bibjson["journal"].get("number")) + publisher = clean_str(bibjson["journal"].get("publisher")) try: - release_year = int(bibjson.get('year')) + release_year = int(bibjson.get("year")) except (TypeError, ValueError): release_year = None - release_month = parse_month(clean_str(bibjson.get('month'))) + release_month = parse_month(clean_str(bibjson.get("month"))) # block bogus far-future years/dates - if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): + if release_year is not None and ( + release_year > (self.this_year + 5) or release_year < 1000 + ): release_month = None release_year = None - license_slug = self.doaj_license_slug(bibjson['journal'].get('license')) - country = parse_country_name(bibjson['journal'].get('country')) + license_slug = self.doaj_license_slug(bibjson["journal"].get("license")) + country = parse_country_name(bibjson["journal"].get("country")) language = None - for raw in bibjson['journal'].get('language') or []: + for raw in bibjson["journal"].get("language") or []: language = parse_lang_name(raw) if language: break # pages # NOTE: error in API docs? seems like start_page not under 'journal' object - start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page')) - end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page')) + start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str( + bibjson.get("start_page") + ) + end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str( + bibjson.get("end_page") + ) pages: Optional[str] = None if start_page and end_page: pages = f"{start_page}-{end_page}" elif start_page: pages = start_page - doaj_article_id = obj['id'].lower() - ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id) + doaj_article_id = obj["id"].lower() + ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id) abstracts = self.doaj_abstracts(bibjson) - contribs = self.doaj_contribs(bibjson.get('author') or []) + contribs = self.doaj_contribs(bibjson.get("author") or []) # DOAJ-specific extra doaj_extra = dict() - if bibjson.get('subject'): - doaj_extra['subject'] = bibjson.get('subject') - if bibjson.get('keywords'): - doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k] + if bibjson.get("subject"): + doaj_extra["subject"] = bibjson.get("subject") + if bibjson.get("keywords"): + doaj_extra["keywords"] = [ + k for k in [clean_str(s) for s in bibjson.get("keywords")] if k + ] # generic extra extra = dict() if country: - extra['country'] = country + extra["country"] = country if not container_id and container_name: - extra['container_name'] = container_name + extra["container_name"] = container_name if release_year and release_month: # TODO: schema migration - extra['release_month'] = release_month + extra["release_month"] = release_month if doaj_extra: - extra['doaj'] = doaj_extra + extra["doaj"] = doaj_extra if not extra: extra = None re = fatcat_openapi_client.ReleaseEntity( work_id=None, container_id=container_id, - release_type='article-journal', - release_stage='published', + release_type="article-journal", + release_stage="published", title=title, release_year=release_year, - #release_date, + # release_date, publisher=publisher, ext_ids=ext_ids, contribs=contribs, @@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter): # then try other ext_id lookups if not existing: - for extid_type in ('doi', 'pmid', 'pmcid'): + for extid_type in ("doi", "pmid", "pmcid"): extid_val = getattr(re.ext_ids, extid_type) if not extid_val: continue - #print(f" lookup release type: {extid_type} val: {extid_val}") + # print(f" lookup release type: {extid_type} val: {extid_val}") try: existing = self.api.lookup_release(**{extid_type: extid_val}) except fatcat_openapi_client.rest.ApiException as err: @@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter): # other logic could go here about skipping updates if not self.do_updates or existing.ext_ids.doaj: - self.counts['exists'] += 1 + self.counts["exists"] += 1 return False # fields to copy over for update @@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter): existing.release_stage = existing.release_stage or re.release_stage existing.container_id = existing.container_id or re.container_id existing.abstracts = existing.abstracts or re.abstracts - existing.extra['doaj'] = re.extra['doaj'] + existing.extra["doaj"] = re.extra["doaj"] existing.volume = existing.volume or re.volume existing.issue = existing.issue or re.issue existing.pages = existing.pages or re.pages @@ -258,13 +263,13 @@ class DoajArticleImporter(EntityImporter): try: self.api.update_release(self.get_editgroup_id(), existing.ident, existing) - self.counts['update'] += 1 + self.counts["update"] += 1 except fatcat_openapi_client.rest.ApiException as err: # there is a code path where we try to update the same release # twice in a row; if that happens, just skip # NOTE: API behavior might change in the future? if "release_edit_editgroup_id_ident_id_key" in err.body: - self.counts['skip-update-conflict'] += 1 + self.counts["skip-update-conflict"] += 1 return False else: raise err @@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter): return False def insert_batch(self, batch): - self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( - editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), - entity_list=batch)) + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, extra=self.editgroup_extra + ), + entity_list=batch, + ) + ) def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]: - text = clean_str(bibjson.get('abstract')) + text = clean_str(bibjson.get("abstract")) if not text or len(text) < 10: return [] if len(text) > MAX_ABSTRACT_LENGTH: @@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter): lang=lang, ) - return [abstract,] + return [ + abstract, + ] def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: """ @@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter): contribs = [] index = 0 for author in authors: - if not author.get('name'): + if not author.get("name"): continue creator_id = None - orcid = clean_orcid(author.get('orcid_id')) + orcid = clean_orcid(author.get("orcid_id")) if orcid: creator_id = self.lookup_orcid(orcid) - contribs.append(fatcat_openapi_client.ReleaseContrib( - raw_name=author.get('name'), - role='author', - index=index, - creator_id=creator_id, - raw_affiliation=clean_str(author.get('affiliation')), - )) + contribs.append( + fatcat_openapi_client.ReleaseContrib( + raw_name=author.get("name"), + role="author", + index=index, + creator_id=creator_id, + raw_affiliation=clean_str(author.get("affiliation")), + ) + ) index += 1 return contribs - def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds: + def doaj_ext_ids( + self, identifiers: List[dict], doaj_article_id: str + ) -> fatcat_openapi_client.ReleaseExtIds: """ bibjson.identifier { id (string), @@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter): pmid: Optional[str] = None pmcid: Optional[str] = None for id_obj in identifiers: - if not id_obj.get('id'): + if not id_obj.get("id"): continue - if id_obj['type'].lower() == 'doi': - doi = clean_doi(id_obj['id']) - elif id_obj['type'].lower() == 'pmid': - pmid = clean_pmid(id_obj['id']) - elif id_obj['type'].lower() == 'pmcid': - pmcid = clean_pmcid(id_obj['id']) + if id_obj["type"].lower() == "doi": + doi = clean_doi(id_obj["id"]) + elif id_obj["type"].lower() == "pmid": + pmid = clean_pmid(id_obj["id"]) + elif id_obj["type"].lower() == "pmcid": + pmcid = clean_pmcid(id_obj["id"]) return fatcat_openapi_client.ReleaseExtIds( doaj=doaj_article_id, @@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter): if not license_list: return None for license in license_list: - if not license.get('open_access'): + if not license.get("open_access"): continue - slug = license.get('type') - if slug.startswith('CC '): - slug = slug.replace('CC ', 'cc-').lower() + slug = license.get("type") + if slug.startswith("CC "): + slug = slug.replace("CC ", "cc-").lower() return slug return None -- cgit v1.2.3