author    Bryan Newbold <bnewbold@robocracy.org>  2021-11-03 14:01:33 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2021-11-03 16:46:07 -0700
commit    36cedfde374a2643396b070d3116e4b568500e14 (patch)
tree      b199868b325897ea5dc2b065192a7eba2daf9c6b /python/fatcat_tools
parent    5d29d1336afc90d3575a0379a9e9d9bdac8d1856 (diff)
more involved type wrangling and fixes for importers
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/crossref.py        | 11
-rw-r--r--  python/fatcat_tools/importers/datacite.py         |  5
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py   | 10
3 files changed, 14 insertions(+), 12 deletions(-)
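
The common thread in the changes below is narrowing Optional values returned by dict.get() before indexing or comparing them, so the importers pass stricter type checking. A minimal standalone sketch of the idiom, using a made-up record rather than real Crossref data:

    from typing import Any, Dict, List, Optional

    # Illustrative author record shaped like a Crossref "author" entry (not real data).
    am: Dict[str, Any] = {"affiliation": [{"name": "Example Univ"}, {"name": "Example Lab"}]}

    # am.get("affiliation") is Optional; coalescing with `or []` yields a concrete
    # list, so the indexing and slicing below are safe for both mypy and runtime.
    affiliation_list: List[Dict[str, Any]] = am.get("affiliation") or []

    raw_affiliation: Optional[str] = None
    if affiliation_list:
        raw_affiliation = affiliation_list[0]["name"]
    more_affiliations = [a["name"] for a in affiliation_list[1:]]

    print(raw_affiliation, more_affiliations)  # Example Univ ['Example Lab']
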
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 816f6ab6..a41e2bf5 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -269,18 +269,19 @@ class CrossrefImporter(EntityImporter):
                 else:
                     index = None
                 raw_affiliation = None
-                if am.get("affiliation"):
-                    if len(am.get("affiliation")) > 0:
-                        raw_affiliation = am.get("affiliation")[0]["name"]
-                    if len(am.get("affiliation")) > 1:
+                affiliation_list = am.get("affiliation") or []
+                if affiliation_list and len(affiliation_list) > 0:
+                    raw_affiliation = affiliation_list[0]["name"]
+                    if len(affiliation_list) > 1:
                         # note: affiliation => more_affiliations
                         extra["more_affiliations"] = [
-                            clean(a["name"]) for a in am.get("affiliation")[1:]
+                            clean(a["name"]) for a in affiliation_list[1:]
                         ]
                 if am.get("sequence") and am.get("sequence") != "additional":
                     extra["seq"] = clean(am.get("sequence"))
                 assert ctype in ("author", "editor", "translator")
                 raw_name = clean(raw_name)
+                # TODO: what if 'raw_name' is None?
                 contribs.append(
                     ReleaseContrib(
                         creator_id=creator_id,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 997f8dc8..6eed8991 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -376,10 +376,11 @@ class DataciteImporter(EntityImporter):
         # check for blocklisted "spam", e.g. "FULL MOVIE"
         for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
             seen = set()
-            for token in rule.get("tokens", []):
+            token_list: List[str] = rule.get("tokens") or []
+            for token in token_list:
                 if token in title.lower():
                     seen.add(token)
-            if len(seen) >= rule.get("min"):
+            if len(seen) >= rule["min"]:
                 print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
                 return False
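
A note on the rule["min"] change above: dict.get() returns an Optional, so len(seen) >= rule.get("min") is an int-vs-None comparison as far as the type checker is concerned (and would raise TypeError at runtime if the key were ever missing); plain indexing declares the key as required. A small self-contained sketch with a hypothetical rule, not the project's actual DATACITE_TITLE_SPAM_WORDGROUPS entries:

    from typing import Any, Dict, List

    # Hypothetical spam wordgroup rule: tokens to look for plus a minimum hit count.
    rule: Dict[str, Any] = {"tokens": ["full", "movie", "watch", "online"], "min": 3}

    title = "Watch Full Movie Online Free"
    token_list: List[str] = rule.get("tokens") or []
    seen = {token for token in token_list if token in title.lower()}

    # rule["min"] keeps the comparison int >= int; rule.get("min") could be None.
    print(len(seen) >= rule["min"])  # True for this made-up title
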
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 3c619b14..5caed2c7 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -113,15 +113,15 @@ def lookup_cdx(
     hit = resp.content.decode("utf-8").split("\n")[0]
     if cdx_output:
         cdx_output.write(hit + "\n")
-    cdx = hit.split(" ")
-    cdx = [x if (x and x != "-") else None for x in cdx]
+    cdx_chunks = hit.split(" ")
+    cdx = [x if (x and x != "-") else None for x in cdx_chunks]
     webcapture_cdx = WebcaptureCdxLine(
         surt=cdx[0],
-        timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
+        timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
         url=cdx[2],
         mimetype=cdx[3],
-        status_code=(cdx[4] and int(cdx[4])) or None,
-        sha1=b32_hex(cdx[5]),
+        status_code=int(cdx[4] or ""),
+        sha1=b32_hex(cdx[5] or ""),
         sha256=None,
     )
     if verify_hashes:
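
One behavioral consequence of the wayback_static change worth flagging: the old expression (cdx[4] and int(cdx[4])) or None silently mapped a missing status code to None, while int(cdx[4] or "") raises ValueError when the CDX field is absent (the sha1 line likewise now passes an empty string rather than None into b32_hex). A tiny sketch of that difference, not part of the commit itself:

    from typing import Optional

    def old_status(field: Optional[str]) -> Optional[int]:
        # Pre-change behavior: a missing field becomes None.
        return (field and int(field)) or None

    def new_status(field: Optional[str]) -> int:
        # Post-change behavior: a missing field raises ValueError via int("").
        return int(field or "")

    print(old_status("200"), old_status(None))  # 200 None
    try:
        new_status(None)
    except ValueError:
        print("missing status code now raises")
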