author      Bryan Newbold <bnewbold@robocracy.org>    2021-11-03 14:01:33 -0700
committer   Bryan Newbold <bnewbold@robocracy.org>    2021-11-03 16:46:07 -0700
commit      36cedfde374a2643396b070d3116e4b568500e14 (patch)
tree        b199868b325897ea5dc2b065192a7eba2daf9c6b
parent      5d29d1336afc90d3575a0379a9e9d9bdac8d1856 (diff)
download    fatcat-36cedfde374a2643396b070d3116e4b568500e14.tar.gz
            fatcat-36cedfde374a2643396b070d3116e4b568500e14.zip
more involved type wrangling and fixes for importers
-rw-r--r--   python/fatcat_tools/importers/crossref.py        | 11
-rw-r--r--   python/fatcat_tools/importers/datacite.py         |  5
-rwxr-xr-x   python/fatcat_tools/importers/wayback_static.py   | 10
3 files changed, 14 insertions, 12 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 816f6ab6..a41e2bf5 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -269,18 +269,19 @@ class CrossrefImporter(EntityImporter):
                 else:
                     index = None
                 raw_affiliation = None
-                if am.get("affiliation"):
-                    if len(am.get("affiliation")) > 0:
-                        raw_affiliation = am.get("affiliation")[0]["name"]
-                    if len(am.get("affiliation")) > 1:
+                affiliation_list = am.get("affiliation") or []
+                if affiliation_list and len(affiliation_list) > 0:
+                    raw_affiliation = affiliation_list[0]["name"]
+                    if len(affiliation_list) > 1:
                         # note: affiliation => more_affiliations
                         extra["more_affiliations"] = [
-                            clean(a["name"]) for a in am.get("affiliation")[1:]
+                            clean(a["name"]) for a in affiliation_list[1:]
                         ]
                 if am.get("sequence") and am.get("sequence") != "additional":
                     extra["seq"] = clean(am.get("sequence"))
                 assert ctype in ("author", "editor", "translator")
                 raw_name = clean(raw_name)
+                # TODO: what if 'raw_name' is None?
                 contribs.append(
                     ReleaseContrib(
                         creator_id=creator_id,
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 997f8dc8..6eed8991 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -376,10 +376,11 @@ class DataciteImporter(EntityImporter):
         # check for blocklisted "spam", e.g. "FULL MOVIE"
         for rule in DATACITE_TITLE_SPAM_WORDGROUPS:
             seen = set()
-            for token in rule.get("tokens", []):
+            token_list: List[str] = rule.get("tokens") or []
+            for token in token_list:
                 if token in title.lower():
                     seen.add(token)
-            if len(seen) >= rule.get("min"):
+            if len(seen) >= rule["min"]:
                 print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr)
                 return False
 
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index 3c619b14..5caed2c7 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -113,15 +113,15 @@ def lookup_cdx(
         hit = resp.content.decode("utf-8").split("\n")[0]
         if cdx_output:
             cdx_output.write(hit + "\n")
-        cdx = hit.split(" ")
-        cdx = [x if (x and x != "-") else None for x in cdx]
+        cdx_chunks = hit.split(" ")
+        cdx = [x if (x and x != "-") else None for x in cdx_chunks]
         webcapture_cdx = WebcaptureCdxLine(
             surt=cdx[0],
-            timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z",
+            timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z",
             url=cdx[2],
             mimetype=cdx[3],
-            status_code=(cdx[4] and int(cdx[4])) or None,
-            sha1=b32_hex(cdx[5]),
+            status_code=int(cdx[4] or ""),
+            sha1=b32_hex(cdx[5] or ""),
             sha256=None,
         )
         if verify_hashes:
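
The crossref and datacite hunks above share one type-wrangling pattern: the value returned by dict.get() is Optional, so it is first pinned to a concrete (possibly empty) list with "or []", and required keys are then read with ["..."] rather than .get() so the other operand is not Optional either. A minimal standalone sketch of that idea, with hypothetical names and data rather than code from the repo:

    from typing import Any, Dict, List

    def first_affiliation(contrib: Dict[str, Any]) -> str:
        # contrib.get("affiliation") is Optional; "or []" yields a plain list,
        # so the indexing below type-checks and never touches None.
        affiliation_list: List[Dict[str, str]] = contrib.get("affiliation") or []
        if affiliation_list:
            return affiliation_list[0]["name"]
        return ""

    def is_spammy(title: str, rule: Dict[str, Any]) -> bool:
        # Same trick for the token list; rule["min"] (instead of rule.get("min"))
        # keeps the right-hand side of the comparison a non-Optional int.
        token_list: List[str] = rule.get("tokens") or []
        seen = {token for token in token_list if token in title.lower()}
        return len(seen) >= rule["min"]

    print(first_affiliation({"affiliation": [{"name": "Internet Archive"}]}))
    print(is_spammy("FULL MOVIE free download", {"tokens": ["full", "movie"], "min": 2}))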
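
In the wayback_static hunk, a CDX hit is one space-separated line whose missing fields are written as "-" and normalized to None, which is why the later accesses add "or ..." fallbacks before passing the values to functions that expect a plain str. A standalone sketch of that parsing step, with the field order taken from the hunk (surt, timestamp, url, mimetype, status code, sha1); the helper name and sample line are illustrative only:

    from typing import List, Optional

    def parse_cdx_hit(hit: str) -> dict:
        # Split the raw CDX line and map "-" placeholders to None.
        chunks: List[Optional[str]] = [
            x if (x and x != "-") else None for x in hit.split(" ")
        ]
        # Field order as used in the hunk above. Note that the patched importer
        # calls int(cdx[4] or ""), which raises ValueError when the status field
        # is missing; this sketch keeps None in that case instead.
        status = int(chunks[4]) if chunks[4] is not None else None
        return {
            "surt": chunks[0],
            "timestamp": chunks[1] or "",
            "url": chunks[2],
            "mimetype": chunks[3],
            "status_code": status,
            "sha1": chunks[5] or "",
        }

    example = "org,example)/ 20211103140133 http://example.org/ text/html 200 ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
    print(parse_cdx_hit(example))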