Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/arabesque.py | 4
-rwxr-xr-x | python/fatcat_tools/importers/cdl_dash_dat.py | 4
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 4
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 18
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 14
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 14
-rw-r--r-- | python/fatcat_tools/importers/fileset_generic.py | 3
-rw-r--r-- | python/fatcat_tools/importers/grobid_metadata.py | 13
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 7
-rw-r--r-- | python/fatcat_tools/importers/jalc.py | 14
-rw-r--r-- | python/fatcat_tools/importers/jstor.py | 13
-rw-r--r-- | python/fatcat_tools/importers/matched.py | 14
-rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 24
-rwxr-xr-x | python/fatcat_tools/importers/wayback_static.py | 23
14 files changed, 82 insertions, 87 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py
index 2fb7be55..b4a4d9ed 100644
--- a/python/fatcat_tools/importers/arabesque.py
+++ b/python/fatcat_tools/importers/arabesque.py
@@ -176,11 +176,11 @@ class ArabesqueMatchImporter(EntityImporter):
         ]
         if len(existing.urls) > SANE_MAX_URLS:
             self.counts["skip-update-too-many-url"] += 1
-            return None
+            return False
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
         if len(existing.release_ids) > SANE_MAX_RELEASES:
             self.counts["skip-update-too-many-url"] += 1
-            return None
+            return False
         existing.mimetype = existing.mimetype or fe.mimetype
         edit = self.api.update_file(self.get_editgroup_id(), existing.ident, existing)
         self._edits_inflight.append(edit)
diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py
index b88117e0..1a4114a0 100755
--- a/python/fatcat_tools/importers/cdl_dash_dat.py
+++ b/python/fatcat_tools/importers/cdl_dash_dat.py
@@ -96,8 +96,6 @@ def cdl_dash_release(
                 ReleaseAbstract(mimetype="text/html", content=clean(desc["value"]))
             )
             # print(abstracts)
-    if not abstracts:
-        abstracts = None

     contribs = []
     for creator in meta["creator"]:
@@ -123,7 +121,7 @@ def cdl_dash_release(
         release_type="dataset",
         license_slug=license_slug,
         contribs=contribs,
-        abstracts=abstracts,
+        abstracts=abstracts or None,
         extra=extra,
     )
     return r
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 689989d2..816f6ab6 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -279,8 +279,6 @@ class CrossrefImporter(EntityImporter):
                 ]
             if am.get("sequence") and am.get("sequence") != "additional":
                 extra["seq"] = clean(am.get("sequence"))
-            if not extra:
-                extra = None
             assert ctype in ("author", "editor", "translator")
             raw_name = clean(raw_name)
             contribs.append(
@@ -292,7 +290,7 @@ class CrossrefImporter(EntityImporter):
                     surname=clean(am.get("family")),
                     raw_affiliation=clean(raw_affiliation),
                     role=ctype,
-                    extra=extra,
+                    extra=extra or None,
                 )
             )
         return contribs
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 7cc5fa20..997f8dc8 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -314,7 +314,7 @@ class DataciteImporter(EntityImporter):

         if not doi:
             print("skipping record without a DOI", file=sys.stderr)
-            return
+            return None

         if not str.isascii(doi):
             print("[{}] skipping non-ascii doi for now".format(doi))
@@ -455,9 +455,11 @@ class DataciteImporter(EntityImporter):
             container_type = CONTAINER_TYPE_MAP.get(container["type"])
             if container.get("identifier") and container.get("identifierType") == "ISSN":
                 issn = container.get("identifier")
-                if len(issn) == 8:
+                if issn and len(issn) == 8:
                     issn = issn[:4] + "-" + issn[4:]
-                issnl = self.issn2issnl(issn)
+                    issnl = self.issn2issnl(issn)
+                else:
+                    issnl = None

                 if issnl is not None:
                     container_id = self.lookup_issnl(issnl)
@@ -620,12 +622,10 @@ class DataciteImporter(EntityImporter):
                 ref_extra = dict()
                 if rel.get("relatedIdentifierType", "") == "DOI":
                     ref_extra["doi"] = rel.get("relatedIdentifier")
-                if not ref_extra:
-                    ref_extra = None
                 refs.append(
                     fatcat_openapi_client.ReleaseRef(
                         index=ref_index,
-                        extra=ref_extra,
+                        extra=ref_extra or None,
                     )
                 )
                 ref_index += 1
@@ -651,7 +651,7 @@ class DataciteImporter(EntityImporter):
             extra_datacite["metadataVersion"] = metadata_version

         # Include resource types.
-        types = attributes.get("types", {}) or {}
+        types = attributes.get("types") or {}
         resource_type = types.get("resourceType", "") or ""
         resource_type_general = types.get("resourceTypeGeneral", "") or ""
@@ -1296,7 +1296,9 @@ def parse_datacite_dates(
         if release_date is None and release_year is None:
             continue

-        if release_year < 1000 or release_year > today.year + 5:
+        if release_year is not None and (
+            release_year < 1000 or release_year > today.year + 5
+        ):
             # Skip possibly bogus dates.
             release_year = None
             continue
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index cb56432a..5f78ca3a 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -227,9 +227,9 @@ class DblpReleaseImporter(EntityImporter):
         volume = clean_str(xml_elem.volume and xml_elem.volume.text)
         issue = clean_str(xml_elem.number and xml_elem.number.text)
         pages = clean_str(xml_elem.pages and xml_elem.pages.text)
-        release_year = clean_str(xml_elem.year and xml_elem.year.text)
-        if release_year and release_year.isdigit():
-            release_year = int(release_year)
+        release_year_str = clean_str(xml_elem.year and xml_elem.year.text)
+        if release_year_str and release_year_str.isdigit():
+            release_year: Optional[int] = int(release_year_str)
         else:
             release_year = None
         release_month = parse_month(clean_str(xml_elem.month and xml_elem.month.text))
@@ -243,7 +243,7 @@ class DblpReleaseImporter(EntityImporter):
             release_month = None
             release_year = None

-        contribs = self.dblp_contribs(xml_elem or [])
+        contribs = self.dblp_contribs(xml_elem)
         ext_ids = self.dblp_ext_ids(xml_elem, dblp_key)
         if isbn:
             ext_ids.isbn13 = isbn
@@ -281,8 +281,6 @@ class DblpReleaseImporter(EntityImporter):

         if dblp_extra:
             extra["dblp"] = dblp_extra
-        if not extra:
-            extra = None

         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
@@ -295,11 +293,11 @@ class DblpReleaseImporter(EntityImporter):
             # release_date,
             publisher=publisher,
             ext_ids=ext_ids,
-            contribs=contribs,
+            contribs=contribs or None,
             volume=volume,
             issue=issue,
             pages=pages,
-            extra=extra,
+            extra=extra or None,
         )

         re = self.biblio_hacks(re)
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index 9ff4f3fb..f5c886a2 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -97,7 +97,7 @@ class DoajArticleImporter(EntityImporter):
         for issn in bibjson["journal"]["issns"]:
             issnl = self.issn2issnl(issn)
             if issnl:
-                container_id = self.lookup_issnl(self.issn2issnl(issn))
+                container_id = self.lookup_issnl(issnl)
             if container_id:
                 # don't store container_name when we have an exact match
                 container_name = None
@@ -145,8 +145,8 @@ class DoajArticleImporter(EntityImporter):
         doaj_article_id = obj["id"].lower()
         ext_ids = self.doaj_ext_ids(bibjson["identifier"], doaj_article_id)
-        abstracts = self.doaj_abstracts(bibjson)
-        contribs = self.doaj_contribs(bibjson.get("author") or [])
+        abstracts = self.doaj_abstracts(bibjson) or []
+        contribs = self.doaj_contribs(bibjson.get("author") or []) or []

         # DOAJ-specific extra
         doaj_extra: Dict[str, Any] = dict()
@@ -169,8 +169,6 @@ class DoajArticleImporter(EntityImporter):

         if doaj_extra:
             extra["doaj"] = doaj_extra
-        if not extra:
-            extra = None

         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
@@ -182,13 +180,13 @@ class DoajArticleImporter(EntityImporter):
             # release_date,
             publisher=publisher,
             ext_ids=ext_ids,
-            contribs=contribs,
+            contribs=contribs or None,
             volume=volume,
             issue=issue,
             pages=pages,
             language=language,
-            abstracts=abstracts,
-            extra=extra,
+            abstracts=abstracts or None,
+            extra=extra or None,
             license_slug=license_slug,
         )
         re = self.biblio_hacks(re)
diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py
index 2207b938..d0c8b221 100644
--- a/python/fatcat_tools/importers/fileset_generic.py
+++ b/python/fatcat_tools/importers/fileset_generic.py
@@ -43,7 +43,8 @@ class FilesetImporter(EntityImporter):
             self.counts["skip-no-files"] += 1
             return False

-        for f in row.get("manifest"):
+        manifest: List[Dict[str, Any]] = row.get("manifest") or []
+        for f in manifest:
             for k in ("sha1", "md5"):
                 if not f.get(k):
                     self.counts["skip-missing-file-field"] += 1
diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py
index 830c9bbb..e36e1b48 100644
--- a/python/fatcat_tools/importers/grobid_metadata.py
+++ b/python/fatcat_tools/importers/grobid_metadata.py
@@ -88,7 +88,7 @@ class GrobidMetadataImporter(EntityImporter):
             )
             abstracts = [abobj]
         else:
-            abstracts = None
+            abstracts = []

         contribs = []
         for i, a in enumerate(obj.get("authors", [])):
@@ -118,14 +118,12 @@ class GrobidMetadataImporter(EntityImporter):

             if raw.get("authors"):
                 cite_extra["authors"] = [clean(a["name"]) for a in raw["authors"]]
-            if not cite_extra:
-                cite_extra = None

             refs.append(
                 fatcat_openapi_client.ReleaseRef(
                     key=clean(raw.get("id")),
                     year=year,
                     title=clean(raw["title"]),
-                    extra=cite_extra,
+                    extra=cite_extra or None,
                 )
             )
@@ -147,12 +145,11 @@ class GrobidMetadataImporter(EntityImporter):
             extra["grobid"] = extra_grobid
         if self.longtail_oa:
             extra["longtail_oa"] = True
-        if not extra:
-            extra = None

-        title = clean(obj["title"], force_xml=True)
-        if not title or len(title) < 2:
+        clean_title = clean(obj["title"], force_xml=True)
+        if not clean_title or len(clean_title) < 2:
             return None
+        title = clean_title

         re = fatcat_openapi_client.ReleaseEntity(
             title=title,
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index e13ce4bd..4f1cc3c4 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -642,15 +642,16 @@ class IngestFilesetResultImporter(IngestFileResultImporter):

     def want_fileset(self, row: Dict[str, Any]) -> bool:

-        if not row.get("manifest") or len(row.get("manifest")) == 0:
+        manifest: Optional[List[Any]] = row.get("manifest")
+        if not manifest or len(manifest) == 0:
             self.counts["skip-empty-manifest"] += 1
             return False

-        if len(row.get("manifest")) == 1:
+        if len(manifest) == 1:
             self.counts["skip-single-file"] += 1
             return False

-        if len(row.get("manifest")) > self.max_file_count:
+        if len(manifest) > self.max_file_count:
             self.counts["skip-too-many-files"] += 1
             return False
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index f540c264..2f10e533 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -193,6 +193,9 @@ class JalcImporter(EntityImporter):
         doi = None
         if record.doi:
             doi = clean_doi(record.doi.string.strip().lower())
+            # TODO: following code is redundant with clean_doi()
+            if not doi:
+                return None
             if doi.startswith("http://dx.doi.org/"):
                 doi = doi.replace("http://dx.doi.org/", "")
             elif doi.startswith("https://dx.doi.org/"):
@@ -220,11 +223,11 @@ class JalcImporter(EntityImporter):
             if date:
                 date = date.string
                 if len(date) == 10:
-                    release_date = datetime.datetime.strptime(
+                    release_date_date = datetime.datetime.strptime(
                         date["completed-date"], DATE_FMT
                     ).date()
-                    release_year = release_date.year
-                    release_date = release_date.isoformat()
+                    release_year = release_date_date.year
+                    release_date = release_date_date.isoformat()
                 elif len(date) == 4 and date.isdigit():
                     release_year = int(date)
@@ -252,7 +255,10 @@ class JalcImporter(EntityImporter):
            # if we wanted the other ISSNs, would also need to uniq the list.
             # But we only need one to lookup ISSN-L/container
             issn = issn_list[0].string
-            issnl = self.issn2issnl(issn)
+            if issn:
+                issnl = self.issn2issnl(issn)
+            else:
+                issnl = None
             container_id = None
             if issnl:
                 container_id = self.lookup_issnl(issnl)
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 0a6eec65..2c8aa0a4 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -52,6 +52,8 @@ class JstorImporter(EntityImporter):
         self.read_issn_map_file(issn_map_file)

     def map_container_type(self, crossref_type: Optional[str]) -> Optional[str]:
+        if not crossref_type:
+            return None
         return CONTAINER_TYPE_MAP.get(crossref_type)

     def want(self, raw_record: Any) -> bool:
@@ -75,7 +77,12 @@ class JstorImporter(EntityImporter):
         elif title and not title.get_text():
             title = None

-        if not title and release_type.startswith("review") and article_meta.product.source:
+        if (
+            not title
+            and release_type
+            and release_type.startswith("review")
+            and article_meta.product.source
+        ):
             title = "Review: {}".format(
                 article_meta.product.source.replace("\n", " ").get_text()
             )
@@ -240,8 +247,6 @@ class JstorImporter(EntityImporter):
         #   pubmed: retraction refs
         if extra_jstor:
             extra["jstor"] = extra_jstor
-        if not extra:
-            extra = None

         re = fatcat_openapi_client.ReleaseEntity(
             # work_id
@@ -270,7 +275,7 @@ class JstorImporter(EntityImporter):
             #   name, type, publisher, issnl
             #   extra: issnp, issne, original_name, languages, country
             container_id=container_id,
-            extra=extra,
+            extra=extra or None,
         )

         return re
diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py
index 9c80dd72..70290d81 100644
--- a/python/fatcat_tools/importers/matched.py
+++ b/python/fatcat_tools/importers/matched.py
@@ -102,20 +102,20 @@ class MatchedImporter(EntityImporter):
             return None

         # parse URLs and CDX
-        urls = set()
+        urls_set = set()
         for url in obj.get("urls", []):
             url = make_rel_url(url, default_link_rel=self.default_link_rel)
             if url is not None:
-                urls.add(url)
+                urls_set.add(url)
         for cdx in obj.get("cdx", []):
             original = cdx["url"]
             if cdx.get("dt"):
                 wayback = "https://web.archive.org/web/{}/{}".format(cdx["dt"], original)
-                urls.add(("webarchive", wayback))
+                urls_set.add(("webarchive", wayback))
             url = make_rel_url(original, default_link_rel=self.default_link_rel)
             if url is not None:
-                urls.add(url)
-        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls]
+                urls_set.add(url)
+        urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls_set]
         if len(urls) == 0:
             self.counts["skip-no-urls"] += 1
             return None
@@ -195,11 +195,11 @@ class MatchedImporter(EntityImporter):

         if len(existing.urls) > SANE_MAX_URLS:
             self.counts["skip-update-too-many-url"] += 1
-            return None
+            return False
         existing.release_ids = list(set(fe.release_ids + existing.release_ids))
         if len(existing.release_ids) > SANE_MAX_RELEASES:
             self.counts["skip-update-too-many-releases"] += 1
-            return None
+            return False
         existing.mimetype = existing.mimetype or fe.mimetype
         existing.size = existing.size or fe.size
         existing.md5 = existing.md5 or fe.md5
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 41268925..d32fcefa 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -495,12 +495,12 @@ class PubmedImporter(EntityImporter):
             release_year = int(pub_date.Year.string)
             if pub_date.find("Day") and pub_date.find("Month"):
                 try:
-                    release_date = datetime.date(
+                    release_date_date = datetime.date(
                         release_year,
                         MONTH_ABBR_MAP[pub_date.Month.string],
                         int(pub_date.Day.string),
                     )
-                    release_date = release_date.isoformat()
+                    release_date = release_date_date.isoformat()
                 except ValueError as ve:
                     print("bad date, skipping: {}".format(ve), file=sys.stderr)
                     release_date = None
@@ -595,8 +595,6 @@ class PubmedImporter(EntityImporter):
                 )
                 if abst.content:
                     abstracts.append(abst)
-        if not abstracts:
-            abstracts = None

         ### Contribs
         contribs = []
@@ -663,8 +661,6 @@ class PubmedImporter(EntityImporter):
             for i, contrib in enumerate(contribs):
                 if contrib.raw_name != "et al.":
                     contrib.index = i
-        if not contribs:
-            contribs = None

         ### References
         refs = []
@@ -692,16 +688,12 @@ class PubmedImporter(EntityImporter):
                 ref_raw = ref.Citation
                 if ref_raw:
                     ref_extra["unstructured"] = ref_raw.get_text()
-                if not ref_extra:
-                    ref_extra = None
                 refs.append(
                     fatcat_openapi_client.ReleaseRef(
                         target_release_id=ref_release_id,
-                        extra=ref_extra,
+                        extra=ref_extra or None,
                     )
                 )
-        if not refs:
-            refs = None

         # extra:
         #   translation_of
@@ -711,8 +703,6 @@ class PubmedImporter(EntityImporter):
         #   pubmed: retraction refs
         if extra_pubmed:
             extra["pubmed"] = extra_pubmed
-        if not extra:
-            extra = None

         title = clean(title)
         if not title:
@@ -739,11 +729,11 @@ class PubmedImporter(EntityImporter):
             # publisher  # not included?
             language=language,
             # license_slug  # not in MEDLINE
-            abstracts=abstracts,
-            contribs=contribs,
-            refs=refs,
+            abstracts=abstracts or None,
+            contribs=contribs or None,
+            refs=refs or None,
             container_id=container_id,
-            extra=extra,
+            extra=extra or None,
         )

         return re
diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py
index f9ee29c9..3c619b14 100755
--- a/python/fatcat_tools/importers/wayback_static.py
+++ b/python/fatcat_tools/importers/wayback_static.py
@@ -89,22 +89,23 @@ def lookup_cdx(
 ) -> Optional[WebcaptureCdxLine]:
     sys.stderr.write(embed_url + "\n")
     assert embed_url.startswith("/web/")
-    embed_url = embed_url.split("/")
-    timestamp = embed_url[2]
+    embed_url_segments = embed_url.split("/")
+    timestamp = embed_url_segments[2]
     if timestamp.endswith("_"):
         timestamp = timestamp[:-3]
-    url = "/".join(embed_url[3:])
+    url = "/".join(embed_url_segments[3:])
     # print((timestamp, url))
+    params: Dict = dict(
+        url=url,
+        closest=timestamp,
+        sort="closest",
+        resolveRevisits="true",
+        matchType="exact",
+        limit=1,
+    )
     resp = REQ_SESSION.get(
         CDX_API_BASE,
-        params=dict(
-            url=url,
-            closest=timestamp,
-            sort="closest",
-            resolveRevisits="true",
-            matchType="exact",
-            limit=1,
-        ),
+        params=params,
     )
     resp.raise_for_status()
     # print(resp.url)