aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-02 19:51:48 -0700
committerBryan Newbold <bnewbold@robocracy.org>2021-11-02 19:51:51 -0700
commit4c77bdb8d92523935454f1c406c954913f923c01 (patch)
tree2b2a1221cc78683afb9f18a87ccfd10ef0afbc64 /python/fatcat_tools/importers
parent3da07382d682a0c474ddc79f748a50ad2cc758cd (diff)
downloadfatcat-4c77bdb8d92523935454f1c406c954913f923c01.tar.gz
fatcat-4c77bdb8d92523935454f1c406c954913f923c01.zip
lint: resolve existing mypy type errors
Adds annotations and re-workes dataflow to satisfy existing mypy issues, without adding any additional type annotations to, eg, function signatures. There will probably be many more type errors when annotations are all added.
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/crossref.py26
-rw-r--r--python/fatcat_tools/importers/dblp_release.py21
-rw-r--r--python/fatcat_tools/importers/doaj_article.py2
3 files changed, 27 insertions, 22 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 606d4bb1..d0017002 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -393,8 +393,6 @@ class CrossrefImporter(EntityImporter):
):
if clean(rm.get(k)):
ref_extra[k] = clean(rm[k])
- if not ref_extra:
- ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
index=i,
@@ -406,7 +404,7 @@ class CrossrefImporter(EntityImporter):
title=clean(rm.get("article-title")),
locator=clean(rm.get("first-page")),
# TODO: just dump JSON somewhere here?
- extra=ref_extra,
+ extra=ref_extra or None,
)
)
@@ -421,8 +419,8 @@ class CrossrefImporter(EntityImporter):
)
# extra fields
- extra = dict()
- extra_crossref = dict()
+ extra: Dict[str, Any] = dict()
+ extra_crossref: Dict[str, Any] = dict()
# top-level extra keys
if not container_id:
if obj.get("container-title"):
@@ -471,13 +469,13 @@ class CrossrefImporter(EntityImporter):
"dissertation",
"book-chapter",
):
- release_stage = "published"
+ release_stage: Optional[str] = "published"
else:
# unknown
release_stage = None
# external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
# filter out unreasonably huge releases
if len(abstracts) > 100:
@@ -512,7 +510,7 @@ class CrossrefImporter(EntityImporter):
title: Optional[str] = None
if obj.get("title"):
- title = clean(obj.get("title")[0], force_xml=True)
+ title = clean(obj["title"][0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
self.counts["skip-blank-title"] += 1
@@ -520,15 +518,13 @@ class CrossrefImporter(EntityImporter):
subtitle = None
if obj.get("subtitle"):
- subtitle = clean(obj.get("subtitle")[0], force_xml=True)
+ subtitle = clean(obj["subtitle"][0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
if extra_crossref:
extra["crossref"] = extra_crossref
- if not extra:
- extra = None
re = ReleaseEntity(
work_id=None,
@@ -556,10 +552,10 @@ class CrossrefImporter(EntityImporter):
pages=clean(obj.get("page")),
language=clean(obj.get("language")),
license_slug=license_slug,
- extra=extra,
- abstracts=abstracts,
- contribs=contribs,
- refs=refs,
+ extra=extra or None,
+ abstracts=abstracts or None,
+ contribs=contribs or None,
+ refs=refs or None,
)
return re
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5baa6cd6..e73e5f33 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -26,6 +26,7 @@ import sys # noqa: F401
import warnings
from typing import Any, List, Optional
+import bs4
import fatcat_openapi_client
from fatcat_tools.importers.common import EntityImporter
@@ -420,7 +421,9 @@ class DblpReleaseImporter(EntityImporter):
)
)
- def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
+ def dblp_contribs(
+ self, elem: bs4.element.Tag
+ ) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
- author (multiple; each a single string)
=> may have HTML entities
@@ -431,21 +434,23 @@ class DblpReleaseImporter(EntityImporter):
"""
contribs = []
index = 0
- for elem in authors.find_all("author"):
+ for elem in elem.find_all("author"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "author"
contrib.index = index
contribs.append(contrib)
index += 1
- for elem in authors.find_all("editor"):
+ for elem in elem.find_all("editor"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "editor"
contribs.append(contrib)
return contribs
- def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
+ def dblp_contrib_single(
+ self, elem: bs4.element.Tag
+ ) -> fatcat_openapi_client.ReleaseContrib:
"""
In the future, might try to implement creator key-ificiation and lookup here.
@@ -461,11 +466,15 @@ class DblpReleaseImporter(EntityImporter):
raw_name = clean_str(elem.text)
# remove number in author name, if present
- if raw_name.split()[-1].isdigit():
+ if raw_name and raw_name.split()[-1].isdigit():
raw_name = " ".join(raw_name.split()[:-1])
if elem.get("orcid"):
- orcid = clean_orcid(elem["orcid"])
+ orcid_val = elem["orcid"]
+ if isinstance(orcid_val, list):
+ orcid = clean_orcid(orcid_val[0])
+ else:
+ orcid = clean_orcid(orcid_val)
if orcid:
creator_id = self.lookup_orcid(orcid)
if not creator_id:
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index cd063337..56045ea7 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -382,7 +382,7 @@ class DoajArticleImporter(EntityImporter):
if not license.get("open_access"):
continue
slug = license.get("type")
- if slug.startswith("CC "):
+ if slug and slug.startswith("CC "):
slug = slug.replace("CC ", "cc-").lower()
return slug
return None