From 36cedfde374a2643396b070d3116e4b568500e14 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 3 Nov 2021 14:01:33 -0700 Subject: more involved type wrangling and fixes for importers --- python/fatcat_tools/importers/crossref.py | 11 ++++++----- python/fatcat_tools/importers/datacite.py | 5 +++-- python/fatcat_tools/importers/wayback_static.py | 10 +++++----- 3 files changed, 14 insertions(+), 12 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 816f6ab6..a41e2bf5 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -269,18 +269,19 @@ class CrossrefImporter(EntityImporter): else: index = None raw_affiliation = None - if am.get("affiliation"): - if len(am.get("affiliation")) > 0: - raw_affiliation = am.get("affiliation")[0]["name"] - if len(am.get("affiliation")) > 1: + affiliation_list = am.get("affiliation") or [] + if affiliation_list and len(affiliation_list) > 0: + raw_affiliation = affiliation_list[0]["name"] + if len(affiliation_list) > 1: # note: affiliation => more_affiliations extra["more_affiliations"] = [ - clean(a["name"]) for a in am.get("affiliation")[1:] + clean(a["name"]) for a in affiliation_list[1:] ] if am.get("sequence") and am.get("sequence") != "additional": extra["seq"] = clean(am.get("sequence")) assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) + # TODO: what if 'raw_name' is None? contribs.append( ReleaseContrib( creator_id=creator_id, diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 997f8dc8..6eed8991 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -376,10 +376,11 @@ class DataciteImporter(EntityImporter): # check for blocklisted "spam", e.g. "FULL MOVIE" for rule in DATACITE_TITLE_SPAM_WORDGROUPS: seen = set() - for token in rule.get("tokens", []): + token_list: List[str] = rule.get("tokens") or [] + for token in token_list: if token in title.lower(): seen.add(token) - if len(seen) >= rule.get("min"): + if len(seen) >= rule["min"]: print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr) return False diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 3c619b14..5caed2c7 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -113,15 +113,15 @@ def lookup_cdx( hit = resp.content.decode("utf-8").split("\n")[0] if cdx_output: cdx_output.write(hit + "\n") - cdx = hit.split(" ") - cdx = [x if (x and x != "-") else None for x in cdx] + cdx_chunks = hit.split(" ") + cdx = [x if (x and x != "-") else None for x in cdx_chunks] webcapture_cdx = WebcaptureCdxLine( surt=cdx[0], - timestamp=parse_wbm_timestamp(cdx[1]).isoformat() + "Z", + timestamp=parse_wbm_timestamp(cdx[1] or "").isoformat() + "Z", url=cdx[2], mimetype=cdx[3], - status_code=(cdx[4] and int(cdx[4])) or None, - sha1=b32_hex(cdx[5]), + status_code=int(cdx[4] or ""), + sha1=b32_hex(cdx[5] or ""), sha256=None, ) if verify_hashes: -- cgit v1.2.3