aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--python/fatcat_tools/importers/crossref.py26
-rw-r--r--python/fatcat_tools/importers/dblp_release.py21
-rw-r--r--python/fatcat_tools/importers/doaj_article.py2
-rw-r--r--python/fatcat_tools/normal.py22
-rw-r--r--python/fatcat_tools/references.py48
-rw-r--r--python/fatcat_tools/transforms/access.py2
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py9
-rw-r--r--python/fatcat_tools/transforms/entities.py6
8 files changed, 86 insertions, 50 deletions
diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py
index 606d4bb1..d0017002 100644
--- a/python/fatcat_tools/importers/crossref.py
+++ b/python/fatcat_tools/importers/crossref.py
@@ -393,8 +393,6 @@ class CrossrefImporter(EntityImporter):
):
if clean(rm.get(k)):
ref_extra[k] = clean(rm[k])
- if not ref_extra:
- ref_extra = None
refs.append(
fatcat_openapi_client.ReleaseRef(
index=i,
@@ -406,7 +404,7 @@ class CrossrefImporter(EntityImporter):
title=clean(rm.get("article-title")),
locator=clean(rm.get("first-page")),
# TODO: just dump JSON somewhere here?
- extra=ref_extra,
+ extra=ref_extra or None,
)
)
@@ -421,8 +419,8 @@ class CrossrefImporter(EntityImporter):
)
# extra fields
- extra = dict()
- extra_crossref = dict()
+ extra: Dict[str, Any] = dict()
+ extra_crossref: Dict[str, Any] = dict()
# top-level extra keys
if not container_id:
if obj.get("container-title"):
@@ -471,13 +469,13 @@ class CrossrefImporter(EntityImporter):
"dissertation",
"book-chapter",
):
- release_stage = "published"
+ release_stage: Optional[str] = "published"
else:
# unknown
release_stage = None
# external identifiers
- extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower())
+ extids: Dict[str, Any] = self.lookup_ext_ids(doi=obj["DOI"].lower()) or {}
# filter out unreasonably huge releases
if len(abstracts) > 100:
@@ -512,7 +510,7 @@ class CrossrefImporter(EntityImporter):
title: Optional[str] = None
if obj.get("title"):
- title = clean(obj.get("title")[0], force_xml=True)
+ title = clean(obj["title"][0], force_xml=True)
if not title or len(title) <= 1:
# title can't be just a single character
self.counts["skip-blank-title"] += 1
@@ -520,15 +518,13 @@ class CrossrefImporter(EntityImporter):
subtitle = None
if obj.get("subtitle"):
- subtitle = clean(obj.get("subtitle")[0], force_xml=True)
+ subtitle = clean(obj["subtitle"][0], force_xml=True)
if not subtitle or len(subtitle) <= 1:
# subtitle can't be just a single character
subtitle = None
if extra_crossref:
extra["crossref"] = extra_crossref
- if not extra:
- extra = None
re = ReleaseEntity(
work_id=None,
@@ -556,10 +552,10 @@ class CrossrefImporter(EntityImporter):
pages=clean(obj.get("page")),
language=clean(obj.get("language")),
license_slug=license_slug,
- extra=extra,
- abstracts=abstracts,
- contribs=contribs,
- refs=refs,
+ extra=extra or None,
+ abstracts=abstracts or None,
+ contribs=contribs or None,
+ refs=refs or None,
)
return re
diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py
index 5baa6cd6..e73e5f33 100644
--- a/python/fatcat_tools/importers/dblp_release.py
+++ b/python/fatcat_tools/importers/dblp_release.py
@@ -26,6 +26,7 @@ import sys # noqa: F401
import warnings
from typing import Any, List, Optional
+import bs4
import fatcat_openapi_client
from fatcat_tools.importers.common import EntityImporter
@@ -420,7 +421,9 @@ class DblpReleaseImporter(EntityImporter):
)
)
- def dblp_contribs(self, authors: List[dict]) -> List[fatcat_openapi_client.ReleaseContrib]:
+ def dblp_contribs(
+ self, elem: bs4.element.Tag
+ ) -> List[fatcat_openapi_client.ReleaseContrib]:
"""
- author (multiple; each a single string)
=> may have HTML entities
@@ -431,21 +434,23 @@ class DblpReleaseImporter(EntityImporter):
"""
contribs = []
index = 0
- for elem in authors.find_all("author"):
+ for elem in elem.find_all("author"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "author"
contrib.index = index
contribs.append(contrib)
index += 1
- for elem in authors.find_all("editor"):
+ for elem in elem.find_all("editor"):
contrib = self.dblp_contrib_single(elem)
contrib.role = "editor"
contribs.append(contrib)
return contribs
- def dblp_contrib_single(self, elem: Any) -> fatcat_openapi_client.ReleaseContrib:
+ def dblp_contrib_single(
+ self, elem: bs4.element.Tag
+ ) -> fatcat_openapi_client.ReleaseContrib:
"""
In the future, might try to implement creator key-ificiation and lookup here.
@@ -461,11 +466,15 @@ class DblpReleaseImporter(EntityImporter):
raw_name = clean_str(elem.text)
# remove number in author name, if present
- if raw_name.split()[-1].isdigit():
+ if raw_name and raw_name.split()[-1].isdigit():
raw_name = " ".join(raw_name.split()[:-1])
if elem.get("orcid"):
- orcid = clean_orcid(elem["orcid"])
+ orcid_val = elem["orcid"]
+ if isinstance(orcid_val, list):
+ orcid = clean_orcid(orcid_val[0])
+ else:
+ orcid = clean_orcid(orcid_val)
if orcid:
creator_id = self.lookup_orcid(orcid)
if not creator_id:
diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py
index cd063337..56045ea7 100644
--- a/python/fatcat_tools/importers/doaj_article.py
+++ b/python/fatcat_tools/importers/doaj_article.py
@@ -382,7 +382,7 @@ class DoajArticleImporter(EntityImporter):
if not license.get("open_access"):
continue
slug = license.get("type")
- if slug.startswith("CC "):
+ if slug and slug.startswith("CC "):
slug = slug.replace("CC ", "cc-").lower()
return slug
return None
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 12c58829..daf47ded 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -15,7 +15,7 @@ import pycountry
DOI_REGEX = re.compile(r"^10.\d{3,6}/\S+$")
-def clean_doi(raw: str) -> Optional[str]:
+def clean_doi(raw: Optional[str]) -> Optional[str]:
"""
Removes any:
- padding whitespace
@@ -95,7 +95,7 @@ def test_clean_doi():
ARXIV_ID_REGEX = re.compile(r"^(\d{4}.\d{4,5}|[a-z\-]+(\.[A-Z]{2})?/\d{7})(v\d+)?$")
-def clean_arxiv_id(raw: str) -> Optional[str]:
+def clean_arxiv_id(raw: Optional[str]) -> Optional[str]:
"""
Removes any:
- 'arxiv:' prefix
@@ -170,7 +170,7 @@ def test_clean_wikidata_qid():
assert clean_wikidata_qid("") is None
-def clean_pmid(raw: str) -> Optional[str]:
+def clean_pmid(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -189,7 +189,7 @@ def test_clean_pmid():
assert clean_pmid("") is None
-def clean_pmcid(raw: str) -> Optional[str]:
+def clean_pmcid(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -200,7 +200,7 @@ def clean_pmcid(raw: str) -> Optional[str]:
return None
-def clean_sha1(raw: str) -> Optional[str]:
+def clean_sha1(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip().lower()
@@ -228,7 +228,9 @@ def test_clean_sha1():
assert clean_sha1("0fba3fb a0e1937aa0297de3836b768b5dfb23d7b") is None
-def clean_sha256(raw: str) -> Optional[str]:
+def clean_sha256(raw: Optional[str]) -> Optional[str]:
+ if not raw:
+ return None
raw = raw.strip().lower()
if len(raw.split()) != 1:
return None
@@ -251,7 +253,7 @@ def test_clean_sha256():
ISSN_REGEX = re.compile(r"^\d{4}-\d{3}[0-9X]$")
-def clean_issn(raw: str) -> Optional[str]:
+def clean_issn(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip().upper()
@@ -272,7 +274,7 @@ def test_clean_issn():
ISBN13_REGEX = re.compile(r"^97(?:8|9)-\d{1,5}-\d{1,7}-\d{1,6}-\d$")
-def clean_isbn13(raw: str) -> Optional[str]:
+def clean_isbn13(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -291,7 +293,7 @@ def test_clean_isbn13():
ORCID_REGEX = re.compile(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$")
-def clean_orcid(raw: str) -> Optional[str]:
+def clean_orcid(raw: Optional[str]) -> Optional[str]:
if not raw:
return None
raw = raw.strip()
@@ -472,7 +474,7 @@ def test_parse_month() -> None:
assert parse_month("September") == 9
-def detect_text_lang(raw: str) -> Optional[str]:
+def detect_text_lang(raw: Optional[str]) -> Optional[str]:
"""
Tries to determine language of, eg, an abstract.
diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py
index 6fd9ca49..624020b5 100644
--- a/python/fatcat_tools/references.py
+++ b/python/fatcat_tools/references.py
@@ -124,7 +124,33 @@ class RefHits(BaseModel):
limit: int
query_time_ms: int
query_wall_time_ms: int
- result_refs: List[Union[BiblioRef, EnrichedBiblioRef]]
+ result_refs: List[BiblioRef]
+
+ class Config:
+ json_encoders = {
+ ReleaseEntity: entity_to_dict,
+ }
+
+ def as_enriched(self, enriched_refs: List[EnrichedBiblioRef]) -> "RefHitsEnriched":
+ return RefHitsEnriched(
+ count_returned=self.count_returned,
+ count_total=self.count_total,
+ offset=self.offset,
+ limit=self.limit,
+ query_time_ms=self.query_time_ms,
+ query_wall_time_ms=self.query_wall_time_ms,
+ result_refs=enriched_refs,
+ )
+
+
+class RefHitsEnriched(BaseModel):
+ count_returned: int
+ count_total: int
+ offset: int
+ limit: int
+ query_time_ms: int
+ query_wall_time_ms: int
+ result_refs: List[EnrichedBiblioRef]
class Config:
json_encoders = {
@@ -221,7 +247,7 @@ def get_inbound_refs(
limit: int = 25,
offset: Optional[int] = None,
es_index: str = "fatcat_ref",
-) -> List[BiblioRef]:
+) -> RefHits:
search = Search(using=es_client, index=es_index)
@@ -398,16 +424,16 @@ def run_ref_query(args) -> None:
enriched = enrich_outbound_refs(
hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client
)
- for ref in enriched:
- if ref.release:
+ for eref in enriched:
+ if eref.release:
print(
- f"{ref.ref.ref_index or '-'}\trelease_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}"
+ f"{eref.ref.ref_index or '-'}\trelease_{eref.release.ident}\t{eref.ref.match_provenance}/{eref.ref.match_status}\t{eref.release.release_year or '-'}\t{eref.release.title}\t{eref.release.ext_ids.pmid or eref.release.ext_ids.doi or '-'}"
)
else:
- print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}")
+ print(f"{eref.ref.ref_index or '-'}\trelease_{eref.ref.target_release_ident}")
else:
for ref in hits.result_refs:
- print(f"{ref.ref.ref_index or '-'}\trelease_{ref.target_release_ident}")
+ print(f"{ref.ref_index or '-'}\trelease_{ref.target_release_ident}")
print()
print("## Inbound References")
@@ -423,13 +449,13 @@ def run_ref_query(args) -> None:
enriched = enrich_inbound_refs(
hits.result_refs, hide="refs,abstracts", fatcat_api_client=args.fatcat_api_client
)
- for ref in enriched:
- if ref.release:
+ for eref in enriched:
+ if eref.release:
print(
- f"release_{ref.release.ident}\t{ref.ref.match_provenance}/{ref.ref.match_status}\t{ref.release.release_year or '-'}\t{ref.release.title}\t{ref.release.ext_ids.pmid or ref.release.ext_ids.doi or '-'}"
+ f"release_{eref.release.ident}\t{eref.ref.match_provenance}/{eref.ref.match_status}\t{eref.release.release_year or '-'}\t{eref.release.title}\t{eref.release.ext_ids.pmid or eref.release.ext_ids.doi or '-'}"
)
else:
- print(f"release_{ref.target_release_ident}")
+ print(f"release_{eref.ref.target_release_ident}")
else:
for ref in hits.result_refs:
print(f"work_{ref.source_work_ident}\trelease_{ref.source_release_ident}")
diff --git a/python/fatcat_tools/transforms/access.py b/python/fatcat_tools/transforms/access.py
index 34212a6a..e3228d30 100644
--- a/python/fatcat_tools/transforms/access.py
+++ b/python/fatcat_tools/transforms/access.py
@@ -39,7 +39,7 @@ def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
TODO: proper implementation and filtering, instead of just returning first
option found
"""
- options = []
+ options: List[AccessOption] = []
for f in release.files or []:
thumbnail_url = None
if f.mimetype == "application/pdf" and f.sha1 and f.urls:
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index e39e9ea4..d4962205 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -7,6 +7,7 @@ from fatcat_openapi_client import (
ContainerEntity,
EntityEdit,
FileEntity,
+ FileUrl,
ReleaseEntity,
)
@@ -355,7 +356,7 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:
- other webarchive or repository URLs
- any other URL
"""
- t = dict(
+ t: Dict[str, Any] = dict(
file_count=len(release.files or []),
fileset_count=len(release.filesets or []),
webcapture_count=len(release.webcaptures or []),
@@ -403,7 +404,7 @@ def _rte_content_helper(release: ReleaseEntity) -> dict:
return t
-def _rte_url_helper(url_obj) -> dict:
+def _rte_url_helper(url_obj: FileUrl) -> Dict[str, Any]:
"""
Takes a location URL ('url' and 'rel' keys) and returns generic preservation status.
@@ -427,7 +428,9 @@ def _rte_url_helper(url_obj) -> dict:
return t
-def container_to_elasticsearch(entity, force_bool=True, stats=None):
+def container_to_elasticsearch(
+ entity: Any, force_bool: bool = True, stats: Optional[Dict[str, Any]] = None
+) -> Dict[str, Any]:
"""
Converts from an entity model/schema to elasticsearch oriented schema.
diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py
index 799d5d6c..ee4017d8 100644
--- a/python/fatcat_tools/transforms/entities.py
+++ b/python/fatcat_tools/transforms/entities.py
@@ -1,6 +1,6 @@
import collections
import json
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Mapping, Optional
import toml
from fatcat_openapi_client import ApiClient
@@ -31,12 +31,12 @@ def entity_from_json(
"""
if not api_client:
api_client = ApiClient()
- thing = collections.namedtuple("Thing", ["data"])
+ thing = collections.namedtuple("thing", ["data"])
thing.data = json_str
return api_client.deserialize(thing, entity_type)
-def entity_from_dict(obj: dict, entity_type, api_client=None):
+def entity_from_dict(obj: Mapping[str, Any], entity_type, api_client=None):
json_str = json.dumps(obj)
return entity_from_json(json_str, entity_type, api_client=api_client)