Diffstat (limited to 'python/fatcat_tools/importers/doaj_article.py')
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py  178
1 file changed, 96 insertions(+), 82 deletions(-)
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 1831c4cd..cd063337 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -28,26 +28,23 @@ MAX_ABSTRACT_LENGTH = 2048
class DoajArticleImporter(EntityImporter):
-
- def __init__(self,
- api,
- issn_map_file,
- **kwargs):
+ def __init__(self, api, issn_map_file, **kwargs):
eg_desc = kwargs.get(
- 'editgroup_description',
- "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps"
+ "editgroup_description",
+ "Automated import of DOAJ article metadata, harvested from REST API or bulk dumps",
)
- eg_extra = kwargs.get('editgroup_extra', dict())
- eg_extra['agent'] = eg_extra.get('agent',
- 'fatcat_tools.DoajArticleImporter')
+ eg_extra = kwargs.get("editgroup_extra", dict())
+ eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.DoajArticleImporter")
# ensure default is to not do updates with this worker (override super() default)
- kwargs['do_updates'] = kwargs.get("do_updates", False)
- super().__init__(api,
- issn_map_file=issn_map_file,
- editgroup_description=eg_desc,
- editgroup_extra=eg_extra,
- **kwargs)
+ kwargs["do_updates"] = kwargs.get("do_updates", False)
+ super().__init__(
+ api,
+ issn_map_file=issn_map_file,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs,
+ )
self.this_year = datetime.datetime.now().year
self.read_issn_map_file(issn_map_file)
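For orientation, a minimal usage sketch of the constructor as reformatted above; the API client setup and the ISSN map path are hypothetical placeholders, not part of this commit:

    import fatcat_openapi_client

    # hypothetical client setup; host and auth configuration elided
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient())
    importer = DoajArticleImporter(api, "/path/to/ISSN-to-ISSN-L.txt")
    # do_updates defaults to False for this worker unless a caller overrides it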
@@ -82,21 +79,21 @@ class DoajArticleImporter(EntityImporter):
}
"""
- if not obj or not isinstance(obj, dict) or 'bibjson' not in obj:
- self.counts['skip-empty'] += 1
+ if not obj or not isinstance(obj, dict) or "bibjson" not in obj:
+ self.counts["skip-empty"] += 1
return None
- bibjson = obj['bibjson']
+ bibjson = obj["bibjson"]
- title = clean_str(bibjson.get('title'), force_xml=True)
+ title = clean_str(bibjson.get("title"), force_xml=True)
if not title:
- self.counts['skip-title'] += 1
+ self.counts["skip-title"] += 1
return False
- container_name = clean_str(bibjson['journal']['title'])
+ container_name = clean_str(bibjson["journal"]["title"])
container_id = None
# NOTE: 'issns' not documented in API schema
- for issn in bibjson['journal']['issns']:
+ for issn in bibjson["journal"]["issns"]:
issnl = self.issn2issnl(issn)
if issnl:
container_id = self.lookup_issnl(self.issn2issnl(issn))
@@ -105,75 +102,83 @@ class DoajArticleImporter(EntityImporter):
container_name = None
break
- volume = clean_str(bibjson['journal'].get('volume'))
+ volume = clean_str(bibjson["journal"].get("volume"))
# NOTE: this schema seems to use "number" as "issue number"
- issue = clean_str(bibjson['journal'].get('number'))
- publisher = clean_str(bibjson['journal'].get('publisher'))
+ issue = clean_str(bibjson["journal"].get("number"))
+ publisher = clean_str(bibjson["journal"].get("publisher"))
try:
- release_year = int(bibjson.get('year'))
+ release_year = int(bibjson.get("year"))
except (TypeError, ValueError):
release_year = None
- release_month = parse_month(clean_str(bibjson.get('month')))
+ release_month = parse_month(clean_str(bibjson.get("month")))
# block bogus far-future years/dates
- if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000):
+ if release_year is not None and (
+ release_year > (self.this_year + 5) or release_year < 1000
+ ):
release_month = None
release_year = None
- license_slug = self.doaj_license_slug(bibjson['journal'].get('license'))
- country = parse_country_name(bibjson['journal'].get('country'))
+ license_slug = self.doaj_license_slug(bibjson["journal"].get("license"))
+ country = parse_country_name(bibjson["journal"].get("country"))
language = None
- for raw in bibjson['journal'].get('language') or []:
+ for raw in bibjson["journal"].get("language") or []:
language = parse_lang_name(raw)
if language:
break
# pages
# NOTE: error in API docs? seems like start_page not under 'journal' object
- start_page = clean_str(bibjson['journal'].get('start_page')) or clean_str(bibjson.get('start_page'))
- end_page = clean_str(bibjson['journal'].get('end_page')) or clean_str(bibjson.get('end_page'))
+ start_page = clean_str(bibjson["journal"].get("start_page")) or clean_str(
+ bibjson.get("start_page")
+ )
+ end_page = clean_str(bibjson["journal"].get("end_page")) or clean_str(
+ bibjson.get("end_page")
+ )
pages: Optional[str] = None
if start_page and end_page:
pages = f"{start_page}-{end_page}"
elif start_page:
pages = start_page
- doaj_article_id = obj['id'].lower()
- ext_ids = self.doaj_ext_ids(bibjson['identifier'], doaj_article_id)
+ doaj_article_id = obj["id"].lower()
+ ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)
abstracts = self.doaj_abstracts(bibjson)
- contribs = self.doaj_contribs(bibjson.get('author') or [])
+ contribs = self.doaj_contribs(bibjson.get("author") or [])
# DOAJ-specific extra
doaj_extra = dict()
- if bibjson.get('subject'):
- doaj_extra['subject'] = bibjson.get('subject')
- if bibjson.get('keywords'):
- doaj_extra['keywords'] = [k for k in [clean_str(s) for s in bibjson.get('keywords')] if k]
+ if bibjson.get("subject"):
+ doaj_extra["subject"] = bibjson.get("subject")
+ if bibjson.get("keywords"):
+ doaj_extra["keywords"] = [
+ k for k in [clean_str(s) for s in bibjson.get("keywords")] if k
+ ]
# generic extra
extra = dict()
if country:
- extra['country'] = country
+ extra["country"] = country
if not container_id and container_name:
- extra['container_name'] = container_name
+ extra["container_name"] = container_name
if release_year and release_month:
# TODO: schema migration
- extra['release_month'] = release_month
+ extra["release_month"] = release_month
if doaj_extra:
- extra['doaj'] = doaj_extra
+ extra["doaj"] = doaj_extra
if not extra:
extra = None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- release_type='article-journal',
- release_stage='published',
+ release_type="article-journal",
+ release_stage="published",
title=title,
release_year=release_year,
- #release_date,
+ # release_date,
publisher=publisher,
ext_ids=ext_ids,
contribs=contribs,
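The year/month sanity check above can be read as this standalone sketch (sample values hypothetical):

    import datetime

    this_year = datetime.datetime.now().year
    release_year, release_month = 3035, 6  # hypothetical bogus far-future metadata
    if release_year is not None and (
        release_year > (this_year + 5) or release_year < 1000
    ):
        # drop both fields rather than store an implausible date
        release_month = None
        release_year = None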
@@ -208,11 +213,11 @@ class DoajArticleImporter(EntityImporter):
# then try other ext_id lookups
if not existing:
- for extid_type in ('doi', 'pmid', 'pmcid'):
+ for extid_type in ("doi", "pmid", "pmcid"):
extid_val = getattr(re.ext_ids, extid_type)
if not extid_val:
continue
- #print(f" lookup release type: {extid_type} val: {extid_val}")
+ # print(f" lookup release type: {extid_type} val: {extid_val}")
try:
existing = self.api.lookup_release(**{extid_type: extid_val})
except fatcat_openapi_client.rest.ApiException as err:
@@ -241,7 +246,7 @@ class DoajArticleImporter(EntityImporter):
# other logic could go here about skipping updates
if not self.do_updates or existing.ext_ids.doaj:
- self.counts['exists'] += 1
+ self.counts["exists"] += 1
return False
# fields to copy over for update
@@ -250,7 +255,7 @@ class DoajArticleImporter(EntityImporter):
existing.release_stage = existing.release_stage or re.release_stage
existing.container_id = existing.container_id or re.container_id
existing.abstracts = existing.abstracts or re.abstracts
- existing.extra['doaj'] = re.extra['doaj']
+ existing.extra["doaj"] = re.extra["doaj"]
existing.volume = existing.volume or re.volume
existing.issue = existing.issue or re.issue
existing.pages = existing.pages or re.pages
@@ -258,13 +263,13 @@ class DoajArticleImporter(EntityImporter):
try:
self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
- self.counts['update'] += 1
+ self.counts["update"] += 1
except fatcat_openapi_client.rest.ApiException as err:
# there is a code path where we try to update the same release
# twice in a row; if that happens, just skip
# NOTE: API behavior might change in the future?
if "release_edit_editgroup_id_ident_id_key" in err.body:
- self.counts['skip-update-conflict'] += 1
+ self.counts["skip-update-conflict"] += 1
return False
else:
raise err
@@ -272,14 +277,17 @@ class DoajArticleImporter(EntityImporter):
return False
def insert_batch(self, batch):
- self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
- editgroup=fatcat_openapi_client.Editgroup(
- description=self.editgroup_description,
- extra=self.editgroup_extra),
- entity_list=batch))
+ self.api.create_release_auto_batch(
+ fatcat_openapi_client.ReleaseAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description, extra=self.editgroup_extra
+ ),
+ entity_list=batch,
+ )
+ )
def doaj_abstracts(self, bibjson: dict) -> List[fatcat_openapi_client.ReleaseAbstract]:
- text = clean_str(bibjson.get('abstract'))
+ text = clean_str(bibjson.get("abstract"))
if not text or len(text) < 10:
return []
if len(text) > MAX_ABSTRACT_LENGTH:
@@ -293,7 +301,9 @@ class DoajArticleImporter(EntityImporter):
lang=lang,
)
- return [abstract,]
+ return [
+ abstract,
+ ]
def doaj_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
@@ -306,23 +316,27 @@ class DoajArticleImporter(EntityImporter):
contribs = []
index = 0
for author in authors:
- if not author.get('name'):
+ if not author.get("name"):
continue
creator_id = None
- orcid = clean_orcid(author.get('orcid_id'))
+ orcid = clean_orcid(author.get("orcid_id"))
if orcid:
creator_id = self.lookup_orcid(orcid)
- contribs.append(fatcat_openapi_client.ReleaseContrib(
- raw_name=author.get('name'),
- role='author',
- index=index,
- creator_id=creator_id,
- raw_affiliation=clean_str(author.get('affiliation')),
- ))
+ contribs.append(
+ fatcat_openapi_client.ReleaseContrib(
+ raw_name=author.get("name"),
+ role="author",
+ index=index,
+ creator_id=creator_id,
+ raw_affiliation=clean_str(author.get("affiliation")),
+ )
+ )
index += 1
return contribs
- def doaj_ext_ids(self, identifiers: List[dict], doaj_article_id: str) -> fatcat_openapi_client.ReleaseExtIds:
+ def doaj_ext_ids(
+ self, identifiers: List[dict], doaj_article_id: str
+ ) -> fatcat_openapi_client.ReleaseExtIds:
"""
bibjson.identifier {
id (string),
@@ -336,14 +350,14 @@ class DoajArticleImporter(EntityImporter):
pmid: Optional[str] = None
pmcid: Optional[str] = None
for id_obj in identifiers:
- if not id_obj.get('id'):
+ if not id_obj.get("id"):
continue
- if id_obj['type'].lower() == 'doi':
- doi = clean_doi(id_obj['id'])
- elif id_obj['type'].lower() == 'pmid':
- pmid = clean_pmid(id_obj['id'])
- elif id_obj['type'].lower() == 'pmcid':
- pmcid = clean_pmcid(id_obj['id'])
+ if id_obj["type"].lower() == "doi":
+ doi = clean_doi(id_obj["id"])
+ elif id_obj["type"].lower() == "pmid":
+ pmid = clean_pmid(id_obj["id"])
+ elif id_obj["type"].lower() == "pmcid":
+ pmcid = clean_pmcid(id_obj["id"])
return fatcat_openapi_client.ReleaseExtIds(
doaj=doaj_article_id,
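The identifier dispatch above matches 'type' case-insensitively; a hypothetical input and outcome (importer as in the earlier sketch, article id made up for illustration):

    identifiers = [
        {"type": "DOI", "id": "10.1234/example"},  # matches the "doi" branch
        {"type": "eissn", "id": "1234-5678"},      # no matching branch; ignored
    ]
    ext_ids = importer.doaj_ext_ids(identifiers, "hypothetical-doaj-article-id")
    # ext_ids.doi should come back as "10.1234/example"; pmid and pmcid stay None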
@@ -365,10 +379,10 @@ class DoajArticleImporter(EntityImporter):
if not license_list:
return None
for license in license_list:
- if not license.get('open_access'):
+ if not license.get("open_access"):
continue
- slug = license.get('type')
- if slug.startswith('CC '):
- slug = slug.replace('CC ', 'cc-').lower()
+ slug = license.get("type")
+ if slug.startswith("CC "):
+ slug = slug.replace("CC ", "cc-").lower()
return slug
return None
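A worked example of the slug mapping above, assuming DOAJ license 'type' values of the form "CC BY-SA" (input hypothetical):

    license_list = [{"type": "CC BY-SA", "open_access": True}]
    slug = None
    for license in license_list:
        if not license.get("open_access"):
            continue
        slug = license.get("type")
        if slug.startswith("CC "):
            slug = slug.replace("CC ", "cc-").lower()
        break
    assert slug == "cc-by-sa"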