diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/crossref.py | 26 | ||||
-rw-r--r-- | python/fatcat_tools/importers/dblp_release.py | 21 | ||||
-rw-r--r-- | python/fatcat_tools/importers/doaj_article.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/normal.py | 22 | ||||
-rw-r--r-- | python/fatcat_tools/references.py | 48 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/access.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 9 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/entities.py | 6 | ||||
-rw-r--r-- | python/fatcat_web/editing_routes.py | 3 | ||||
-rw-r--r-- | python/fatcat_web/graphics.py | 18 | ||||
-rw-r--r-- | python/fatcat_web/ref_routes.py | 33 | ||||
-rw-r--r-- | python/fatcat_web/search.py | 16 |
12 files changed, 125 insertions, 81 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index 606d4bb1..d0017002 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -393,8 +393,6 @@ class CrossrefImporter(EntityImporter): ): if clean(rm.get(k)): ref_extra[k] = clean(rm[k]) - if not ref_extra: - ref_extra = None refs.append( fatcat_openapi_client.ReleaseRef( index=i, @@ -406,7 +404,7 @@ class CrossrefImporter(EntityImporter): title=clean(rm.get("article-title")), locator=clean(rm.get("first-page")), # TODO: just dump JSON somewhere here? - extra=ref_extra, + extra=ref_extra or None, ) ) @@ -421,8 +419,8 @@ class CrossrefImporter(EntityImporter): ) # extra fields - extra = dict() - extra_crossref = dict() + extra: Dict[str, Any] = dict() + extra_crossref: Dict[str, Any] = dict() # top-level extra keys if not container_id: if obj.get("container-title"): @@ -471,13 +469,13 @@ class CrossrefImporter(EntityImporter): "dissertation", "book-chapter", ): - release_stage = "published" + release_stage: Optional[str] = "published" else: # unknown release_stage = None # external identifiers - extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) + extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {} # filter out unreasonably huge releases if len(abstracts) > 100: @@ -512,7 +510,7 @@ class CrossrefImporter(EntityImporter): title: Optional[str] = None if obj.get("title"): - title = clean(obj.get("title")[0], force_xml=True) + title = clean(obj["title"][0], force_xml=True) if not title or len(title) <= 1: # title can't be just a single character self.counts["skip-blank-title"] += 1 @@ -520,15 +518,13 @@ class CrossrefImporter(EntityImporter): subtitle = None if obj.get("subtitle"): - subtitle = clean(obj.get("subtitle")[0], force_xml=True) + subtitle = clean(obj["subtitle"][0], force_xml=True) if not subtitle or len(subtitle) <= 1: # subtitle can't be just a single character subtitle = None if extra_crossref: extra["crossref"] = extra_crossref - if not extra: - extra = None re = ReleaseEntity( work_id=None, @@ -556,10 +552,10 @@ class CrossrefImporter(EntityImporter): pages=clean(obj.get("page")), language=clean(obj.get("language")), license_slug=license_slug, - extra=extra, - abstracts=abstracts, - contribs=contribs, - refs=refs, + extra=extra or None, + abstracts=abstracts or None, + contribs=contribs or None, + refs=refs or None, ) return re diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 5baa6cd6..e73e5f33 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -26,6 +26,7 @@ import sys # noqa: F401 import warnings from typing import Any, List, Optional +import bs4 import fatcat_openapi_client from fatcat_tools.importers.common import EntityImporter @@ -420,7 +421,9 @@ class DblpReleaseImporter(EntityImporter): ) ) - def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]: + def dblp_contribs( + self, elem: bs4.element.Tag + ) -> List[fatcat_openapi_client.ReleaseContrib]: """ - author (multiple; each a single string) => may have HTML entities @@ -431,21 +434,23 @@ class DblpReleaseImporter(EntityImporter): """ contribs = [] index = 0 - for elem in authors.find_all("author"): + for elem in elem.find_all("author"): contrib = self.dblp_contrib_single(elem) contrib.role = "author" contrib.index = index contribs.append(contrib) index += 1 - for elem in authors.find_all("editor"): + for elem in elem.find_all("editor"): contrib = self.dblp_contrib_single(elem) contrib.role = "editor" contribs.append(contrib) return contribs - def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib: + def dblp_contrib_single( + self, elem: bs4.element.Tag + ) -> fatcat_openapi_client.ReleaseContrib: """ In the future, might try to implement creator key-ificiation and lookup here. @@ -461,11 +466,15 @@ class DblpReleaseImporter(EntityImporter): raw_name = clean_str(elem.text) # remove number in author name, if present - if raw_name.split()[-1].isdigit(): + if raw_name and raw_name.split()[-1].isdigit(): raw_name = " ".join(raw_name.split()[:-1]) if elem.get("orcid"): - orcid = clean_orcid(elem["orcid"]) + orcid_val = elem["orcid"] + if isinstance(orcid_val, list): + orcid = clean_orcid(orcid_val[0]) + else: + orcid = clean_orcid(orcid_val) if orcid: creator_id = self.lookup_orcid(orcid) if not creator_id: diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index cd063337..56045ea7 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -382,7 +382,7 @@ class DoajArticleImporter(EntityImporter): if not license.get("open_access"): continue slug = license.get("type") - if slug.startswith("CC "): + if slug and slug.startswith("CC "): slug = slug.replace("CC ", "cc-").lower() return slug return None diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index 12c58829..daf47ded 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -15,7 +15,7 @@ import pycountry DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$") -def clean_doi(raw: str) -> Optional[str]: +def clean_doi(raw: Optional[str]) -> Optional[str]: """ Removes any: - padding whitespace @@ -95,7 +95,7 @@ def test_clean_doi(): ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$") -def clean_arxiv_id(raw: str) -> Optional[str]: +def clean_arxiv_id(raw: Optional[str]) -> Optional[str]: """ Removes any: - 'arxiv:' prefix @@ -170,7 +170,7 @@ def test_clean_wikidata_qid(): assert clean_wikidata_qid("") is None -def clean_pmid(raw: str) -> Optional[str]: +def clean_pmid(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip() @@ -189,7 +189,7 @@ def test_clean_pmid(): assert clean_pmid("") is None -def clean_pmcid(raw: str) -> Optional[str]: +def clean_pmcid(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip() @@ -200,7 +200,7 @@ def clean_pmcid(raw: str) -> Optional[str]: return None -def clean_sha1(raw: str) -> Optional[str]: +def clean_sha1(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip().lower() @@ -228,7 +228,9 @@ def test_clean_sha1(): assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None -def clean_sha256(raw: str) -> Optional[str]: +def clean_sha256(raw: Optional[str]) -> Optional[str]: + if not raw: + return None raw = raw.strip().lower() if len(raw.split()) != 1: return None @@ -251,7 +253,7 @@ def test_clean_sha256(): ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$") -def clean_issn(raw: str) -> Optional[str]: +def clean_issn(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip().upper() @@ -272,7 +274,7 @@ def test_clean_issn(): ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$") -def clean_isbn13(raw: str) -> Optional[str]: +def clean_isbn13(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip() @@ -291,7 +293,7 @@ def test_clean_isbn13(): ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$") -def clean_orcid(raw: str) -> Optional[str]: +def clean_orcid(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip() @@ -472,7 +474,7 @@ def test_parse_month() -> None: assert parse_month("September") == 9 -def detect_text_lang(raw: str) -> Optional[str]: +def detect_text_lang(raw: Optional[str]) -> Optional[str]: """ Tries to determine language of, eg, an abstract. diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 6fd9ca49..624020b5 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -124,7 +124,33 @@ class RefHits(BaseModel): limit: int query_time_ms: int query_wall_time_ms: int - result_refs: List[Union[BiblioRef, EnrichedBiblioRef]] + result_refs: List[BiblioRef] + + class Config: + json_encoders = { + ReleaseEntity: entity_to_dict, + } + + def as_enriched(self, enriched_refs: List[EnrichedBiblioRef]) -> "RefHitsEnriched": + return RefHitsEnriched( + count_returned=self.count_returned, + count_total=self.count_total, + offset=self.offset, + limit=self.limit, + query_time_ms=self.query_time_ms, + query_wall_time_ms=self.query_wall_time_ms, + result_refs=enriched_refs, + ) + + +class RefHitsEnriched(BaseModel): + count_returned: int + count_total: int + offset: int + limit: int + query_time_ms: int + query_wall_time_ms: int + result_refs: List[EnrichedBiblioRef] class Config: json_encoders = { @@ -221,7 +247,7 @@ def get_inbound_refs( limit: int = 25, offset: Optional[int] = None, es_index: str = "fatcat_ref", -) -> List[BiblioRef]: +) -> RefHits: search = Search(using=es_client, index=es_index) @@ -398,16 +424,16 @@ def run_ref_query(args) -> None: enriched = enrich_outbound_refs( hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client ) - for ref in enriched: - if ref.release: + for eref in enriched: + if eref.release: print( - f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}" + f"{eref.ref.ref_index or '-'}\trelease_{eref.release.ident}\t{eref.ref.match_provenance}/{eref.ref.match_status}\t{eref.release.release_year or '-'}\t{eref.release.title}\t{eref.release.ext_ids.pmid or eref.release.ext_ids.doi or '-'}" ) else: - print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") + print(f"{eref.ref.ref_index or '-'}\trelease_{eref.ref.target_release_ident}") else: for ref in hits.result_refs: - print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}") + print(f"{ref.ref_index or '-'}\trelease_{ref.target_release_ident}") print() print("## Inbound References") @@ -423,13 +449,13 @@ def run_ref_query(args) -> None: enriched = enrich_inbound_refs( hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client ) - for ref in enriched: - if ref.release: + for eref in enriched: + if eref.release: print( - f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}" + f"release_{eref.release.ident}\t{eref.ref.match_provenance}/{eref.ref.match_status}\t{eref.release.release_year or '-'}\t{eref.release.title}\t{eref.release.ext_ids.pmid or eref.release.ext_ids.doi or '-'}" ) else: - print(f"release_{ref.target_release_ident}") + print(f"release_{eref.ref.target_release_ident}") else: for ref in hits.result_refs: print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}") diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py index 34212a6a..e3228d30 100644 --- a/python/fatcat_tools/transforms/access.py +++ b/python/fatcat_tools/transforms/access.py @@ -39,7 +39,7 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]: TODO: proper implementation and filtering, instead of just returning first option found """ - options = [] + options: List[AccessOption] = [] for f in release.files or []: thumbnail_url = None if f.mimetype == "application/pdf" and f.sha1 and f.urls: diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index e39e9ea4..d4962205 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -7,6 +7,7 @@ from fatcat_openapi_client import ( ContainerEntity, EntityEdit, FileEntity, + FileUrl, ReleaseEntity, ) @@ -355,7 +356,7 @@ def _rte_content_helper(release: ReleaseEntity) -> dict: - other webarchive or repository URLs - any other URL """ - t = dict( + t: Dict[str, Any] = dict( file_count=len(release.files or []), fileset_count=len(release.filesets or []), webcapture_count=len(release.webcaptures or []), @@ -403,7 +404,7 @@ def _rte_content_helper(release: ReleaseEntity) -> dict: return t -def _rte_url_helper(url_obj) -> dict: +def _rte_url_helper(url_obj: FileUrl) -> Dict[str, Any]: """ Takes a location URL ('url' and 'rel' keys) and returns generic preservation status. @@ -427,7 +428,9 @@ def _rte_url_helper(url_obj) -> dict: return t -def container_to_elasticsearch(entity, force_bool=True, stats=None): +def container_to_elasticsearch( + entity: Any, force_bool: bool = True, stats: Optional[Dict[str, Any]] = None +) -> Dict[str, Any]: """ Converts from an entity model/schema to elasticsearch oriented schema. diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py index 799d5d6c..ee4017d8 100644 --- a/python/fatcat_tools/transforms/entities.py +++ b/python/fatcat_tools/transforms/entities.py @@ -1,6 +1,6 @@ import collections import json -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Mapping, Optional import toml from fatcat_openapi_client import ApiClient @@ -31,12 +31,12 @@ def entity_from_json( """ if not api_client: api_client = ApiClient() - thing = collections.namedtuple("Thing", ["data"]) + thing = collections.namedtuple("thing", ["data"]) thing.data = json_str return api_client.deserialize(thing, entity_type) -def entity_from_dict(obj: dict, entity_type, api_client=None): +def entity_from_dict(obj: Mapping[str, Any], entity_type, api_client=None): json_str = json.dumps(obj) return entity_from_json(json_str, entity_type, api_client=api_client) diff --git a/python/fatcat_web/editing_routes.py b/python/fatcat_web/editing_routes.py index 6dafd2f1..03668e1e 100644 --- a/python/fatcat_web/editing_routes.py +++ b/python/fatcat_web/editing_routes.py @@ -87,7 +87,7 @@ def generic_entity_delete_edit( def generic_entity_delete_entity( user_api, entity_type: str, editgroup_id: str, entity_ident: str -) -> None: +) -> EntityEdit: try: if entity_type == "container": edit = user_api.delete_container(editgroup_id, entity_ident) @@ -491,7 +491,6 @@ def generic_entity_delete(editgroup_id: Optional[str], entity_type: str, existin abort(400) # fetch entity (if set) or 404 - existing = None existing_edit = None if editgroup and existing_ident: existing, existing_edit = generic_get_editgroup_entity( diff --git a/python/fatcat_web/graphics.py b/python/fatcat_web/graphics.py index c76408cd..82a0a577 100644 --- a/python/fatcat_web/graphics.py +++ b/python/fatcat_web/graphics.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Tuple +from typing import Any, Dict, List, Tuple import pygal from pygal.style import CleanStyle @@ -12,17 +12,17 @@ def ia_coverage_histogram(rows: List[Tuple]) -> pygal.Graph: """ raw_years = [int(r[0]) for r in rows] - years = dict() + years_dict = dict() if raw_years: for y in range(min(raw_years), max(raw_years) + 1): - years[int(y)] = dict(year=int(y), available=0, missing=0) + years_dict[int(y)] = dict(year=int(y), available=0, missing=0) for r in rows: if r[1]: - years[int(r[0])]["available"] = r[2] + years_dict[int(r[0])]["available"] = r[2] else: - years[int(r[0])]["missing"] = r[2] + years_dict[int(r[0])]["missing"] = r[2] - years = sorted(years.values(), key=lambda x: x["year"]) + years: List[Dict[str, Any]] = sorted(years_dict.values(), key=lambda x: x["year"]) CleanStyle.colors = ("green", "purple") label_count = len(years) @@ -39,9 +39,9 @@ def ia_coverage_histogram(rows: List[Tuple]) -> pygal.Graph: # chart.title = "Perpetual Access Coverage" chart.x_title = "Year" # chart.y_title = "Releases" - chart.x_labels = [str(y["year"]) for y in years] - chart.add("via Fatcat", [y["available"] for y in years]) - chart.add("Missing", [y["missing"] for y in years]) + chart.x_labels = [str(v["year"]) for v in years] + chart.add("via Fatcat", [v["available"] for v in years]) + chart.add("Missing", [v["missing"] for v in years]) return chart diff --git a/python/fatcat_web/ref_routes.py b/python/fatcat_web/ref_routes.py index 6a5eb064..b45edf78 100644 --- a/python/fatcat_web/ref_routes.py +++ b/python/fatcat_web/ref_routes.py @@ -15,6 +15,7 @@ from fuzzycat.simple import close_fuzzy_biblio_matches, close_fuzzy_release_matc from fatcat_tools.references import ( RefHits, + RefHitsEnriched, enrich_inbound_refs, enrich_outbound_refs, get_inbound_refs, @@ -30,11 +31,11 @@ from fatcat_web.forms import ReferenceMatchForm def _refs_web( direction, release_ident=None, work_ident=None, openlibrary_id=None, wikipedia_article=None -) -> RefHits: - offset = request.args.get("offset", "0") - offset = max(0, int(offset)) if offset.isnumeric() else 0 - limit = request.args.get("limit", "30") - limit = min(max(0, int(limit)), 100) if limit.isnumeric() else 30 +) -> RefHitsEnriched: + offset_arg = request.args.get("offset", "0") + offset: int = max(0, int(offset_arg)) if offset_arg.isnumeric() else 0 + limit_arg = request.args.get("limit", "30") + limit: int = min(max(0, int(limit_arg)), 100) if limit_arg.isnumeric() else 30 if direction == "in": hits = get_inbound_refs( release_ident=release_ident, @@ -44,10 +45,12 @@ def _refs_web( offset=offset, limit=limit, ) - hits.result_refs = enrich_inbound_refs( - hits.result_refs, - fatcat_api_client=api, - expand="container,files,webcaptures", + enriched_hits = hits.as_enriched( + enrich_inbound_refs( + hits.result_refs, + fatcat_api_client=api, + expand="container,files,webcaptures", + ) ) elif direction == "out": hits = get_outbound_refs( @@ -58,14 +61,16 @@ def _refs_web( offset=offset, limit=limit, ) - hits.result_refs = enrich_outbound_refs( - hits.result_refs, - fatcat_api_client=api, - expand="container,files,webcaptures", + enriched_hits = hits.as_enriched( + enrich_outbound_refs( + hits.result_refs, + fatcat_api_client=api, + expand="container,files,webcaptures", + ) ) else: raise ValueError() - return hits + return enriched_hits @app.route("/release/<string(length=26):ident>/refs-in", methods=["GET"]) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py index 5fc3f614..5e758fd0 100644 --- a/python/fatcat_web/search.py +++ b/python/fatcat_web/search.py @@ -6,7 +6,7 @@ the formal API) import datetime import sys from dataclasses import dataclass -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional import elasticsearch import elasticsearch_dsl.response @@ -135,18 +135,22 @@ def wrap_es_execution(search: Search) -> Any: # this is a "user" error print("elasticsearch 400: " + str(e.info), file=sys.stderr) description = None + assert isinstance(e.info, dict) if e.info.get("error", {}).get("root_cause", {}): description = str(e.info["error"]["root_cause"][0].get("reason")) - raise FatcatSearchError(e.status_code, str(e.error), description) + raise FatcatSearchError(int(e.status_code), str(e.error), description) except elasticsearch.exceptions.ConnectionError as e: - raise FatcatSearchError(e.status_code, "ConnectionError: search engine not available") + raise FatcatSearchError( + int(e.status_code), "ConnectionError: search engine not available" + ) except elasticsearch.exceptions.TransportError as e: # all other errors print("elasticsearch non-200 status code: {}".format(e.info), file=sys.stderr) description = None + assert isinstance(e.info, dict) if e.info and e.info.get("error", {}).get("root_cause", {}): description = str(e.info["error"]["root_cause"][0].get("reason")) - raise FatcatSearchError(e.status_code, str(e.error), description) + raise FatcatSearchError(int(e.status_code), str(e.error), description) return resp @@ -285,7 +289,7 @@ def do_release_search(query: ReleaseQuery, deep_page_limit: int = 2000) -> Searc ) -def get_elastic_container_random_releases(ident: str, limit=5) -> dict: +def get_elastic_container_random_releases(ident: str, limit=5) -> List[Dict[str, Any]]: """ Returns a list of releases from the container. """ @@ -750,7 +754,7 @@ def get_elastic_preservation_by_date(query) -> List[dict]: resp = wrap_es_execution(search) buckets = resp.aggregations.date_preservation.buckets - date_dicts = dict() + date_dicts: Dict[str, Dict[str, Any]] = dict() this_date = start_date while this_date <= end_date: date_dicts[str(this_date)] = dict( |