diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 26 | ||||
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 21 | ||||
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 2 |
3 files changed, 27 insertions, 22 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 606d4bb1..d0017002 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -393,8 +393,6 @@ class CrossrefImporter(EntityImporter): ): if clean(rm.get(k)): ref_extra[k] = clean(rm[k]) - if not ref_extra: - ref_extra = None refs.append( fatcat_openapi_client.ReleaseRef( index=i, @@ -406,7 +404,7 @@ class CrossrefImporter(EntityImporter): title=clean(rm.get("article-title")), locator=clean(rm.get("first-page")), # TODO: just dump JSON somewhere here? - extra=ref_extra, + extra=ref_extra or None, ) ) @@ -421,8 +419,8 @@ class CrossrefImporter(EntityImporter): ) # extra fields - extra = dict() - extra_crossref = dict() + extra: Dict[str, Any] = dict() + extra_crossref: Dict[str, Any] = dict() # top-level extra keys if not container_id: if obj.get("container-title"): @@ -471,13 +469,13 @@ class CrossrefImporter(EntityImporter): "dissertation", "book-chapter", ): - release_stage = "published" + release_stage: Optional[str] = "published" else: # unknown release_stage = None # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) + extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} # filter out unreasonably huge releases if len(abstracts) > 100: @@ -512,7 +510,7 @@ class CrossrefImporter(EntityImporter): title: Optional[str] = None if obj.get("title"): - title = clean(obj.get("title")[0], force_xml=True) + title = clean(obj["title"][0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character self.counts["skip-blank-title"] += 1 @@ -520,15 +518,13 @@ class CrossrefImporter(EntityImporter): subtitle = None if obj.get("subtitle"): - subtitle = clean(obj.get("subtitle")[0], force_xml=True) + subtitle = clean(obj["subtitle"][0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None if extra_crossref: extra["crossref"] = extra_crossref - if not extra: - extra = None re = ReleaseEntity( work_id=None, @@ -556,10 +552,10 @@ class CrossrefImporter(EntityImporter): pages=clean(obj.get("page")), language=clean(obj.get("language")), license_slug=license_slug, - extra=extra, - abstracts=abstracts, - contribs=contribs, - refs=refs, + extra=extra or None, + abstracts=abstracts or None, + contribs=contribs or None, + refs=refs or None, ) return re diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 5baa6cd6..e73e5f33 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -26,6 +26,7 @@ import sys # noqa: F401 import warnings from typing import Any, List, Optional +import bs4 import fatcat_openapi_client from fatcat_tools.importers.common import EntityImporter @@ -420,7 +421,9 @@ class DblpReleaseImporter(EntityImporter): ) ) - def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: + def dblp_contribs( + self, elem: bs4.element.Tag + ) -> List[fatcat_openapi_client.ReleaseContrib]: """ - author (multiple; each a single string) => may have HTML entities @@ -431,21 +434,23 @@ class DblpReleaseImporter(EntityImporter): """ contribs = [] index = 0 - for elem in authors.find_all("author"): + for elem in elem.find_all("author"): contrib = self.dblp_contrib_single(elem) contrib.role = "author" contrib.index = index contribs.append(contrib) index += 1 - for elem in authors.find_all("editor"): + for elem in elem.find_all("editor"): contrib = self.dblp_contrib_single(elem) contrib.role = "editor" contribs.append(contrib) return contribs - def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib: + def dblp_contrib_single( + self, elem: bs4.element.Tag + ) -> fatcat_openapi_client.ReleaseContrib: """ In the future, might try to implement creator key-ificiation and lookup here. @@ -461,11 +466,15 @@ class DblpReleaseImporter(EntityImporter): raw_name = clean_str(elem.text) # remove number in author name, if present - if raw_name.split()[-1].isdigit(): + if raw_name and raw_name.split()[-1].isdigit(): raw_name = " ".join(raw_name.split()[:-1]) if elem.get("orcid"): - orcid = clean_orcid(elem["orcid"]) + orcid_val = elem["orcid"] + if isinstance(orcid_val, list): + orcid = clean_orcid(orcid_val[0]) + else: + orcid = clean_orcid(orcid_val) if orcid: creator_id = self.lookup_orcid(orcid) if not creator_id: diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index cd063337..56045ea7 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -382,7 +382,7 @@ class DoajArticleImporter(EntityImporter): if not license.get("open_access"): continue slug = license.get("type") - if slug.startswith("CC "): + if slug and slug.startswith("CC "): slug = slug.replace("CC ", "cc-").lower() return slug return None |