Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--  python/fatcat_tools/importers/arabesque.py          |  4
-rwxr-xr-x  python/fatcat_tools/importers/cdl_dash_dat.py       |  4
-rw-r--r--  python/fatcat_tools/importers/crossref.py           |  4
-rw-r--r--  python/fatcat_tools/importers/datacite.py           | 18
-rw-r--r--  python/fatcat_tools/importers/dblp_release.py       | 14
-rw-r--r--  python/fatcat_tools/importers/doaj_article.py       | 14
-rw-r--r--  python/fatcat_tools/importers/fileset_generic.py    |  3
-rw-r--r--  python/fatcat_tools/importers/grobid_metadata.py    | 13
-rw-r--r--  python/fatcat_tools/importers/ingest.py             |  7
-rw-r--r--  python/fatcat_tools/importers/jalc.py               | 14
-rw-r--r--  python/fatcat_tools/importers/jstor.py              | 13
-rw-r--r--  python/fatcat_tools/importers/matched.py            | 14
-rw-r--r--  python/fatcat_tools/importers/pubmed.py             | 24
-rwxr-xr-x  python/fatcat_tools/importers/wayback_static.py     | 23
14 files changed, 82 insertions(+), 87 deletions(-)
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 2fb7be55..b4a4d9ed 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -176,11 +176,11 @@ class ArabesqueMatchImporter(EntityImporter):
]
if len(existing.urls) > SANE_MAX_URLS:
self.counts["skip-update-too-many-url"] += 1
- return None
+ return False
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES:
self.counts["skip-update-too-many-url"] += 1
- return None
+ return False
existing.mimetype = existing.mimetype or fe.mimetype
edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
self._edits_inflight.append(edit)
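The `return None` → `return False` changes above line up with `try_update()` presumably being annotated to return `bool` in the `EntityImporter` base class: `False` signals "skip this record", and returning `None` would fail a strict mypy check against that signature. A minimal sketch of the convention, with a simplified stand-in for the real base class (the limit value is illustrative):

    from typing import Any, Dict

    class EntityImporter:
        def __init__(self) -> None:
            self.counts: Dict[str, int] = {}

        def try_update(self, entity: Any) -> bool:
            # Subclasses return True to proceed with an update, False to skip.
            raise NotImplementedError

    class FileImporter(EntityImporter):
        SANE_MAX_URLS = 100  # illustrative, not the real constant's value

        def try_update(self, existing: Any) -> bool:
            if len(existing.urls) > self.SANE_MAX_URLS:
                self.counts["skip-update-too-many-url"] = (
                    self.counts.get("skip-update-too-many-url", 0) + 1
                )
                return False  # not None: the signature promises a bool
            return True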
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index b88117e0..1a4114a0 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -96,8 +96,6 @@ def cdl_dash_release(
ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
)
# print(abstracts)
- if not abstracts:
- abstracts = None
contribs = []
for creator in meta["creator"]:
@@ -123,7 +121,7 @@ def cdl_dash_release(
release_type="dataset",
license_slug=license_slug,
contribs=contribs,
- abstracts=abstracts,
+ abstracts=abstracts or None,
extra=extra,
)
return r
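Instead of mutating `abstracts` from a list to `None` (which would give the variable the type `Optional[List[...]]` for the rest of the function), the coercion now happens once, at the point where the entity constructor actually wants `None` to mean "no abstracts". The idiom in isolation (names here are illustrative, not the fatcat API):

    from typing import List, Optional

    def build_abstracts(raw: List[str]) -> Optional[List[str]]:
        # Accumulate into a plain list; the variable keeps one type throughout.
        abstracts = [text for text in raw if text]
        # "x or None" maps an empty list to None only at the boundary
        # where None is the expected sentinel.
        return abstracts or None

    assert build_abstracts([]) is None
    assert build_abstracts(["a", ""]) == ["a"]

The same `extra or None` / `contribs or None` coercion recurs in the crossref, dblp, doaj, jstor, and pubmed hunks below.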
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 689989d2..816f6ab6 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -279,8 +279,6 @@ class CrossrefImporter(EntityImporter):
]
if am.get("sequence") and am.get("sequence") != "additional":
extra["seq"] = clean(am.get("sequence"))
- if not extra:
- extra = None
assert ctype in ("author", "editor", "translator")
raw_name = clean(raw_name)
contribs.append(
@@ -292,7 +290,7 @@ class CrossrefImporter(EntityImporter):
surname=clean(am.get("family")),
raw_affiliation=clean(raw_affiliation),
role=ctype,
- extra=extra,
+ extra=extra or None,
)
)
return contribs
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7cc5fa20..997f8dc8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -314,7 +314,7 @@ class DataciteImporter(EntityImporter):
if not doi:
print("skipping record without a DOI", file=sys.stderr)
- return
+ return None
if not str.isascii(doi):
print("[{}] skipping non-ascii doi for now".format(doi))
@@ -455,9 +455,11 @@ class DataciteImporter(EntityImporter):
container_type = CONTAINER_TYPE_MAP.get(container["type"])
if container.get("identifier") and container.get("identifierType") == "ISSN":
issn = container.get("identifier")
- if len(issn) == 8:
+ if issn and len(issn) == 8:
issn = issn[:4] + "-" + issn[4:]
- issnl = self.issn2issnl(issn)
+ issnl = self.issn2issnl(issn)
+ else:
+ issnl = None
if issnl is not None:
container_id = self.lookup_issnl(issnl)
@@ -620,12 +622,10 @@ class DataciteImporter(EntityImporter):
ref_extra = dict()
if rel.get("relatedIdentifierType", "") == "DOI":
ref_extra["doi"] = rel.get("relatedIdentifier")
- if not ref_extra:
- ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
index=ref_index,
- extra=ref_extra,
+ extra=ref_extra or None,
)
)
ref_index += 1
@@ -651,7 +651,7 @@ class DataciteImporter(EntityImporter):
extra_datacite["metadataVersion"] = metadata_version
# Include resource types.
- types = attributes.get("types", {}) or {}
+ types = attributes.get("types") or {}
resource_type = types.get("resourceType", "") or ""
resource_type_general = types.get("resourceTypeGeneral", "") or ""
@@ -1296,7 +1296,9 @@ def parse_datacite_dates(
if release_date is None and release_year is None:
continue
- if release_year < 1000 or release_year > today.year + 5:
+ if release_year is not None and (
+ release_year < 1000 or release_year > today.year + 5
+ ):
# Skip possibly bogus dates.
release_year = None
continue
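In the ISSN hunk above, the preceding `if container.get("identifier")` check does not help mypy: two separate `.get()` calls are not connected by the type checker, so `issn` is still `Optional[str]` and needs its own guard before `len()`. The narrowing pattern, reduced to a sketch:

    from typing import Dict, Optional

    def normalize_issn(container: Dict[str, str]) -> Optional[str]:
        issn = container.get("identifier")
        # Even after a prior `if container.get("identifier")` condition,
        # mypy treats this .get() result as Optional[str]; the explicit
        # truthiness check narrows it before len() and slicing.
        if issn and len(issn) == 8:
            return issn[:4] + "-" + issn[4:]
        return issn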
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index cb56432a..5f78ca3a 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -227,9 +227,9 @@ class DblpReleaseImporter(EntityImporter):
volume = clean_str(xml_elem.volume and xml_elem.volume.text)
issue = clean_str(xml_elem.number and xml_elem.number.text)
pages = clean_str(xml_elem.pages and xml_elem.pages.text)
- release_year = clean_str(xml_elem.year and xml_elem.year.text)
- if release_year and release_year.isdigit():
- release_year = int(release_year)
+ release_year_str = clean_str(xml_elem.year and xml_elem.year.text)
+ if release_year_str and release_year_str.isdigit():
+ release_year: Optional[int] = int(release_year_str)
else:
release_year = None
release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
@@ -243,7 +243,7 @@ class DblpReleaseImporter(EntityImporter):
release_month = None
release_year = None
- contribs = self.dblp_contribs(xml_elem or [])
+ contribs = self.dblp_contribs(xml_elem)
ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
if isbn:
ext_ids.isbn13 = isbn
@@ -281,8 +281,6 @@ class DblpReleaseImporter(EntityImporter):
if dblp_extra:
extra["dblp"] = dblp_extra
- if not extra:
- extra = None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -295,11 +293,11 @@ class DblpReleaseImporter(EntityImporter):
# release_date,
publisher=publisher,
ext_ids=ext_ids,
- contribs=contribs,
+ contribs=contribs or None,
volume=volume,
issue=issue,
pages=pages,
- extra=extra,
+ extra=extra or None,
)
re = self.biblio_hacks(re)
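Reusing `release_year` first for the cleaned string and then for the parsed integer gives one name two types, which mypy rejects; the fix introduces `release_year_str` and annotates the result as `Optional[int]`. The same one-name-one-type discipline drives the `urls`/`urls_set` and `release_date`/`release_date_date` renames in the matched, jalc, and pubmed hunks below. Reduced to a sketch:

    from typing import Optional

    def parse_year(raw: Optional[str]) -> Optional[int]:
        # One name per type: the string form and the parsed integer live
        # in separate variables, so each has a single, stable type.
        year_str = raw.strip() if raw else None
        year: Optional[int] = (
            int(year_str) if year_str and year_str.isdigit() else None
        )
        return year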
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 9ff4f3fb..f5c886a2 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -97,7 +97,7 @@ class DoajArticleImporter(EntityImporter):
for issn in bibjson["journal"]["issns"]:
issnl = self.issn2issnl(issn)
if issnl:
- container_id = self.lookup_issnl(self.issn2issnl(issn))
+ container_id = self.lookup_issnl(issnl)
if container_id:
# don't store container_name when we have an exact match
container_name = None
@@ -145,8 +145,8 @@ class DoajArticleImporter(EntityImporter):
doaj_article_id = obj["id"].lower()
ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)
- abstracts = self.doaj_abstracts(bibjson)
- contribs = self.doaj_contribs(bibjson.get("author") or [])
+ abstracts = self.doaj_abstracts(bibjson) or []
+ contribs = self.doaj_contribs(bibjson.get("author") or []) or []
# DOAJ-specific extra
doaj_extra: Dict[str, Any] = dict()
@@ -169,8 +169,6 @@ class DoajArticleImporter(EntityImporter):
if doaj_extra:
extra["doaj"] = doaj_extra
- if not extra:
- extra = None
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
@@ -182,13 +180,13 @@ class DoajArticleImporter(EntityImporter):
# release_date,
publisher=publisher,
ext_ids=ext_ids,
- contribs=contribs,
+ contribs=contribs or None,
volume=volume,
issue=issue,
pages=pages,
language=language,
- abstracts=abstracts,
- extra=extra,
+ abstracts=abstracts or None,
+ extra=extra or None,
license_slug=license_slug,
)
re = self.biblio_hacks(re)
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
index 2207b938..d0c8b221 100644
--- a/python/fatcat_tools/importers/fileset_generic.py
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -43,7 +43,8 @@ class FilesetImporter(EntityImporter):
self.counts["skip-no-files"] += 1
return False
- for f in row.get("manifest"):
+ manifest: List[Dict[str, Any]] = row.get("manifest") or []
+ for f in manifest:
for k in ("sha1", "md5"):
if not f.get(k):
self.counts["skip-missing-file-field"] += 1
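`row.get("manifest")` types as an Optional value, so iterating over it directly is a mypy error. Binding it once to an annotated local, with `or []` to cover a missing key or an explicit null, gives the loop a concrete list type. A self-contained sketch:

    from typing import Any, Dict, List

    def valid_manifest(row: Dict[str, Any]) -> bool:
        # Coalescing with `or []` gives the loop a non-Optional List type.
        manifest: List[Dict[str, Any]] = row.get("manifest") or []
        for f in manifest:
            if not f.get("sha1") or not f.get("md5"):
                return False
        return bool(manifest)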
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 830c9bbb..e36e1b48 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -88,7 +88,7 @@ class GrobidMetadataImporter(EntityImporter):
)
abstracts = [abobj]
else:
- abstracts = None
+ abstracts = []
contribs = []
for i, a in enumerate(obj.get("authors", [])):
@@ -118,14 +118,12 @@ class GrobidMetadataImporter(EntityImporter):
if raw.get("authors"):
cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
- if not cite_extra:
- cite_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
key=clean(raw.get("id")),
year=year,
title=clean(raw["title"]),
- extra=cite_extra,
+ extra=cite_extra or None,
)
)
@@ -147,12 +145,11 @@ class GrobidMetadataImporter(EntityImporter):
extra["grobid"] = extra_grobid
if self.longtail_oa:
extra["longtail_oa"] = True
- if not extra:
- extra = None
- title = clean(obj["title"], force_xml=True)
- if not title or len(title) < 2:
+ clean_title = clean(obj["title"], force_xml=True)
+ if not clean_title or len(clean_title) < 2:
return None
+ title = clean_title
re = fatcat_openapi_client.ReleaseEntity(
title=title,
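`clean()` returns an Optional string, so the old code reassigned `title` to a possibly-None value before length-checking it. Binding the cleaned value to a fresh name, guarding it, and only then assigning `title` lets mypy treat `title` as a definite `str` afterwards. The shape of the pattern, with a toy `clean()` stand-in:

    from typing import Optional

    def clean(raw: str) -> Optional[str]:
        # Stand-in for fatcat's clean() helper: None for unusable input.
        s = raw.strip()
        return s or None

    def parse_title(obj: dict) -> Optional[str]:
        clean_title = clean(obj["title"])
        if not clean_title or len(clean_title) < 2:
            return None
        title = clean_title  # mypy now sees str, not Optional[str]
        return title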
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index e13ce4bd..4f1cc3c4 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -642,15 +642,16 @@ class IngestFilesetResultImporter(IngestFileResultImporter):
def want_fileset(self, row: Dict[str, Any]) -> bool:
- if not row.get("manifest") or len(row.get("manifest")) == 0:
+ manifest: Optional[List[Any]] = row.get("manifest")
+ if not manifest or len(manifest) == 0:
self.counts["skip-empty-manifest"] += 1
return False
- if len(row.get("manifest")) == 1:
+ if len(manifest) == 1:
self.counts["skip-single-file"] += 1
return False
- if len(row.get("manifest")) > self.max_file_count:
+ if len(manifest) > self.max_file_count:
self.counts["skip-too-many-files"] += 1
return False
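This variant keeps the Optional rather than coalescing it: the manifest is fetched once, annotated, and guarded, after which mypy narrows `manifest` to a real list for the later `len()` calls, and the repeated `row.get("manifest")` lookups disappear. A sketch (the default limit here is illustrative):

    from typing import Any, Dict, List, Optional

    def want_fileset(row: Dict[str, Any], max_file_count: int = 300) -> bool:
        manifest: Optional[List[Any]] = row.get("manifest")
        if not manifest:
            return False  # covers a missing key, None, and an empty list
        # From here on, mypy treats manifest as List[Any], not Optional.
        if len(manifest) == 1 or len(manifest) > max_file_count:
            return False
        return True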
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index f540c264..2f10e533 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -193,6 +193,9 @@ class JalcImporter(EntityImporter):
doi = None
if record.doi:
doi = clean_doi(record.doi.string.strip().lower())
+ # TODO: following code is redundant with clean_doi()
+ if not doi:
+ return None
if doi.startswith("http://dx.doi.org/"):
doi = doi.replace("http://dx.doi.org/", "")
elif doi.startswith("https://dx.doi.org/"):
@@ -220,11 +223,11 @@ class JalcImporter(EntityImporter):
if date:
date = date.string
if len(date) == 10:
- release_date = datetime.datetime.strptime(
+ release_date_date = datetime.datetime.strptime(
date["completed-date"], DATE_FMT
).date()
- release_year = release_date.year
- release_date = release_date.isoformat()
+ release_year = release_date_date.year
+ release_date = release_date_date.isoformat()
elif len(date) == 4 and date.isdigit():
release_year = int(date)
@@ -252,7 +255,10 @@ class JalcImporter(EntityImporter):
# if we wanted the other ISSNs, would also need to uniq the list.
# But we only need one to lookup ISSN-L/container
issn = issn_list[0].string
- issnl = self.issn2issnl(issn)
+ if issn:
+ issnl = self.issn2issnl(issn)
+ else:
+ issnl = None
container_id = None
if issnl:
container_id = self.lookup_issnl(issnl)
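`clean_doi()` returns `Optional[str]`, so the `startswith`/`replace` calls that follow need a narrowed value; the added `if not doi: return None` guard provides that (redundantly with validation inside `clean_doi()` itself, as the new TODO notes). The guard in isolation, with a toy `clean_doi()` stand-in:

    from typing import Optional

    def clean_doi(raw: str) -> Optional[str]:
        # Stand-in for fatcat's clean_doi(): None when the value is unusable.
        raw = raw.strip().lower()
        return raw if raw.startswith("10.") or "doi.org/" in raw else None

    def normalize_doi(raw: str) -> Optional[str]:
        doi = clean_doi(raw)
        if not doi:
            return None  # early return narrows doi to str below
        for prefix in ("http://dx.doi.org/", "https://dx.doi.org/"):
            if doi.startswith(prefix):
                doi = doi.replace(prefix, "")
        return doi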
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 0a6eec65..2c8aa0a4 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -52,6 +52,8 @@ class JstorImporter(EntityImporter):
self.read_issn_map_file(issn_map_file)
def map_container_type(self, crossref_type: Optional[str]) -> Optional[str]:
+ if not crossref_type:
+ return None
return CONTAINER_TYPE_MAP.get(crossref_type)
def want(self, raw_record: Any) -> bool:
@@ -75,7 +77,12 @@ class JstorImporter(EntityImporter):
elif title and not title.get_text():
title = None
- if not title and release_type.startswith("review") and article_meta.product.source:
+ if (
+ not title
+ and release_type
+ and release_type.startswith("review")
+ and article_meta.product.source
+ ):
title = "Review: {}".format(
article_meta.product.source.replace("\n", " ").get_text()
)
@@ -240,8 +247,6 @@ class JstorImporter(EntityImporter):
# pubmed: retraction refs
if extra_jstor:
extra["jstor"] = extra_jstor
- if not extra:
- extra = None
re = fatcat_openapi_client.ReleaseEntity(
# work_id
@@ -270,7 +275,7 @@ class JstorImporter(EntityImporter):
# name, type, publisher, issnl
# extra: issnp, issne, original_name, languages, country
container_id=container_id,
- extra=extra,
+ extra=extra or None,
)
return re
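`map_container_type()` now admits `Optional[str]` honestly: the early return disposes of the None case, so the str-keyed dict lookup only ever sees a real string. Self-contained:

    from typing import Dict, Optional

    # Illustrative mapping; the real CONTAINER_TYPE_MAP lives in jstor.py.
    CONTAINER_TYPE_MAP: Dict[str, str] = {"journal": "journal"}

    def map_container_type(crossref_type: Optional[str]) -> Optional[str]:
        if not crossref_type:
            return None  # narrow away the None before the str-keyed lookup
        return CONTAINER_TYPE_MAP.get(crossref_type)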
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 9c80dd72..70290d81 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -102,20 +102,20 @@ class MatchedImporter(EntityImporter):
return None
# parse URLs and CDX
- urls = set()
+ urls_set = set()
for url in obj.get("urls", []):
url = make_rel_url(url, default_link_rel=self.default_link_rel)
if url is not None:
- urls.add(url)
+ urls_set.add(url)
for cdx in obj.get("cdx", []):
original = cdx["url"]
if cdx.get("dt"):
wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
- urls.add(("webarchive", wayback))
+ urls_set.add(("webarchive", wayback))
url = make_rel_url(original, default_link_rel=self.default_link_rel)
if url is not None:
- urls.add(url)
- urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+ urls_set.add(url)
+ urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls_set]
if len(urls) == 0:
self.counts["skip-no-urls"] += 1
return None
@@ -195,11 +195,11 @@ class MatchedImporter(EntityImporter):
if len(existing.urls) > SANE_MAX_URLS:
self.counts["skip-update-too-many-url"] += 1
- return None
+ return False
existing.release_ids = list(set(fe.release_ids + existing.release_ids))
if len(existing.release_ids) > SANE_MAX_RELEASES:
self.counts["skip-update-too-many-releases"] += 1
- return None
+ return False
existing.mimetype = existing.mimetype or fe.mimetype
existing.size = existing.size or fe.size
existing.md5 = existing.md5 or fe.md5
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 41268925..d32fcefa 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -495,12 +495,12 @@ class PubmedImporter(EntityImporter):
release_year = int(pub_date.Year.string)
if pub_date.find("Day") and pub_date.find("Month"):
try:
- release_date = datetime.date(
+ release_date_date = datetime.date(
release_year,
MONTH_ABBR_MAP[pub_date.Month.string],
int(pub_date.Day.string),
)
- release_date = release_date.isoformat()
+ release_date = release_date_date.isoformat()
except ValueError as ve:
print("bad date, skipping: {}".format(ve), file=sys.stderr)
release_date = None
@@ -595,8 +595,6 @@ class PubmedImporter(EntityImporter):
)
if abst.content:
abstracts.append(abst)
- if not abstracts:
- abstracts = None
### Contribs
contribs = []
@@ -663,8 +661,6 @@ class PubmedImporter(EntityImporter):
for i, contrib in enumerate(contribs):
if contrib.raw_name != "et al.":
contrib.index = i
- if not contribs:
- contribs = None
### References
refs = []
@@ -692,16 +688,12 @@ class PubmedImporter(EntityImporter):
ref_raw = ref.Citation
if ref_raw:
ref_extra["unstructured"] = ref_raw.get_text()
- if not ref_extra:
- ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
target_release_id=ref_release_id,
- extra=ref_extra,
+ extra=ref_extra or None,
)
)
- if not refs:
- refs = None
# extra:
# translation_of
@@ -711,8 +703,6 @@ class PubmedImporter(EntityImporter):
# pubmed: retraction refs
if extra_pubmed:
extra["pubmed"] = extra_pubmed
- if not extra:
- extra = None
title = clean(title)
if not title:
@@ -739,11 +729,11 @@ class PubmedImporter(EntityImporter):
# publisher # not included?
language=language,
# license_slug # not in MEDLINE
- abstracts=abstracts,
- contribs=contribs,
- refs=refs,
+ abstracts=abstracts or None,
+ contribs=contribs or None,
+ refs=refs or None,
container_id=container_id,
- extra=extra,
+ extra=extra or None,
)
return re
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index f9ee29c9..3c619b14 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -89,22 +89,23 @@ def lookup_cdx(
) -> Optional[WebcaptureCdxLine]:
sys.stderr.write(embed_url + "\n")
assert embed_url.startswith("/web/")
- embed_url = embed_url.split("/")
- timestamp = embed_url[2]
+ embed_url_segments = embed_url.split("/")
+ timestamp = embed_url_segments[2]
if timestamp.endswith("_"):
timestamp = timestamp[:-3]
- url = "/".join(embed_url[3:])
+ url = "/".join(embed_url_segments[3:])
# print((timestamp, url))
+ params: Dict = dict(
+ url=url,
+ closest=timestamp,
+ sort="closest",
+ resolveRevisits="true",
+ matchType="exact",
+ limit=1,
+ )
resp = REQ_SESSION.get(
CDX_API_BASE,
- params=dict(
- url=url,
- closest=timestamp,
- sort="closest",
- resolveRevisits="true",
- matchType="exact",
- limit=1,
- ),
+ params=params,
)
resp.raise_for_status()
# print(resp.url)
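Hoisting the query parameters into a local annotated as a bare `Dict` leaves the mixed str/int values typed as `Any`, which keeps mypy happy where the inline `dict(...)` literal's inferred value type can trip strict checks; that is plausibly the motivation for the change above. A standalone sketch against the same Wayback CDX endpoint the importer queries:

    from typing import Dict

    import requests

    CDX_API_BASE = "https://web.archive.org/cdx/search/cdx"

    def lookup_closest(url: str, timestamp: str) -> str:
        # Bare Dict annotation: values stay Any, so mixing str and int
        # does not trigger a strict mypy complaint here.
        params: Dict = dict(
            url=url,
            closest=timestamp,
            sort="closest",
            resolveRevisits="true",
            matchType="exact",
            limit=1,
        )
        resp = requests.get(CDX_API_BASE, params=params)
        resp.raise_for_status()
        return resp.text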