diff options
Diffstat (limited to 'python/fatcat_tools/importers')
22 files changed, 443 insertions, 274 deletions
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index ae4f9049..2fb7be55 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url @@ -36,7 +39,9 @@ class ArabesqueMatchImporter(EntityImporter): - a mode to insert bare files even if identifier not known? """ - def __init__(self, api, extid_type, require_grobid=True, **kwargs): + def __init__( + self, api: ApiClient, extid_type: str, require_grobid: bool = True, **kwargs + ) -> None: eg_desc = ( kwargs.get("editgroup_description", None) @@ -59,7 +64,7 @@ class ArabesqueMatchImporter(EntityImporter): else: print("NOT checking GROBID status column") - def want(self, row): + def want(self, row: Any) -> bool: if self.require_grobid and not row["postproc_status"] == "200": return False if ( @@ -76,7 +81,7 @@ class ArabesqueMatchImporter(EntityImporter): else: return False - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]: extid = row["identifier"].strip() @@ -131,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None try: @@ -182,7 +187,7 @@ class ArabesqueMatchImporter(EntityImporter): self.counts["update"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 0957db2c..1d50dd9a 100644 --- a/python/fatcat_tools/importers/arxiv.py 
+++ b/python/fatcat_tools/importers/arxiv.py @@ -2,9 +2,11 @@ import datetime import json import re import sys +from typing import Any, Dict, List, Optional import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from pylatexenc.latex2text import LatexNodes2Text from .common import EntityImporter @@ -13,7 +15,7 @@ from .crossref import lookup_license_slug latex2text = LatexNodes2Text() -def latex_to_text(raw): +def latex_to_text(raw: str) -> str: try: return latex2text.latex_to_text(raw).strip() except AttributeError: @@ -22,7 +24,7 @@ def latex_to_text(raw): return raw.strip() -def parse_arxiv_authors(raw): +def parse_arxiv_authors(raw: str) -> List[str]: if not raw: return [] raw = raw.replace("*", "") @@ -41,7 +43,7 @@ def parse_arxiv_authors(raw): return authors -def test_parse_arxiv_authors(): +def test_parse_arxiv_authors() -> None: assert parse_arxiv_authors( "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" @@ -88,7 +90,7 @@ class ArxivRawImporter(EntityImporter): the "most recent" version; can be a simple sort? """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -107,15 +109,17 @@ class ArxivRawImporter(EntityImporter): ) self._test_override = False - def parse_record(self, record): + # TODO: record is really a beautiful soup element, but setting to 'Any' to + # make initial type annotations simple + def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]: if not record: return None metadata = record.arXivRaw if not metadata: return None - extra = dict() - extra_arxiv = dict() + extra: Dict[str, Any] = dict() + extra_arxiv: Dict[str, Any] = dict() # don't know! 
release_type = "article" @@ -134,7 +138,7 @@ class ArxivRawImporter(EntityImporter): for i, a in enumerate(authors) ] - lang = "en" # the vast majority in english + lang: Optional[str] = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): comments = metadata.comments.get_text().replace("\n", " ").strip() extra_arxiv["comments"] = comments @@ -229,7 +233,7 @@ class ArxivRawImporter(EntityImporter): ).date() # TODO: source_type? versions.append( - fatcat_openapi_client.ReleaseEntity( + ReleaseEntity( work_id=None, title=title, # original_title @@ -261,7 +265,7 @@ class ArxivRawImporter(EntityImporter): versions[-1].release_stage = "accepted" return versions - def try_update(self, versions): + def try_update(self, versions: List[ReleaseEntity]) -> bool: """ This is pretty complex! There is no batch/bezerk mode for arxiv importer. @@ -344,7 +348,7 @@ class ArxivRawImporter(EntityImporter): return False - def insert_batch(self, batch_batch): + def insert_batch(self, batch_batch: List[ReleaseEntity]) -> None: # there is no batch/bezerk mode for arxiv importer, except for testing if self._test_override: for batch in batch_batch: @@ -360,7 +364,7 @@ class ArxivRawImporter(EntityImporter): else: raise NotImplementedError() - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. 
open with beautiful soup soup = BeautifulSoup(handle, "xml") diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index e9de42fc..b88117e0 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -7,10 +7,13 @@ import os import subprocess import sys import urllib +import urllib.parse +from typing import Any, Dict, List, Optional, Tuple import fatcat_openapi_client import magic from fatcat_openapi_client import ( + ApiClient, Editgroup, FilesetEntity, FilesetFile, @@ -24,7 +27,7 @@ from .common import clean from .crossref import lookup_license_slug -def single_file(prefix, path): +def single_file(prefix: str, path: str) -> FilesetFile: full = prefix + path size_bytes = os.stat(full).st_size @@ -59,7 +62,7 @@ def single_file(prefix, path): return fsf -def make_manifest(base_dir): +def make_manifest(base_dir: str) -> List[FilesetFile]: manifest = [] for root, dirs, files in os.walk(base_dir): for f in files: @@ -67,7 +70,9 @@ def make_manifest(base_dir): return manifest -def cdl_dash_release(meta, extra=None): +def cdl_dash_release( + meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None +) -> ReleaseEntity: if not extra: extra = dict() @@ -124,7 +129,7 @@ def cdl_dash_release(meta, extra=None): return r -def make_release_fileset(dat_path): +def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]: if dat_path.endswith("/"): dat_path = dat_path[:-1] @@ -170,7 +175,12 @@ def make_release_fileset(dat_path): return (release, fs) -def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): +def auto_cdl_dash_dat( + api: ApiClient, + dat_path: str, + release_id: Optional[str] = None, + editgroup_id: Optional[str] = None, +) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]: git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") diff --git 
a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 8d2a89b6..842c7853 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ContainerEntity from .common import EntityImporter, clean @@ -12,7 +15,7 @@ class ChoculaImporter(EntityImporter): See guide for details on the many 'extra' fields used here. """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -22,7 +25,7 @@ class ChoculaImporter(EntityImporter): eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: if not raw_record.get("ident") and not raw_record.get("_known_issnl"): self.counts["skip-unknown-new-issnl"] += 1 return False @@ -30,7 +33,7 @@ class ChoculaImporter(EntityImporter): return True return False - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). 
@@ -75,7 +78,7 @@ class ChoculaImporter(EntityImporter): elif "journal " in name.lower(): container_type = "journal" - ce = fatcat_openapi_client.ContainerEntity( + ce = ContainerEntity( issnl=row["issnl"], issnp=row["extra"].get("issnp"), issne=row["extra"].get("issne"), @@ -88,7 +91,7 @@ class ChoculaImporter(EntityImporter): ) return ce - def try_update(self, ce): + def try_update(self, ce: ContainerEntity) -> bool: existing = None if ce.ident: @@ -193,7 +196,7 @@ class ChoculaImporter(EntityImporter): # if we got this far, it's a bug raise NotImplementedError - def insert_batch(self, batch): + def insert_batch(self, batch: List[ContainerEntity]) -> None: self.api.create_container_auto_batch( fatcat_openapi_client.ContainerAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 0b68e5fe..fd472d11 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,7 +7,7 @@ import subprocess import sys import xml.etree.ElementTree as ET from collections import Counter -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple import elasticsearch import fatcat_openapi_client @@ -16,7 +16,14 @@ import fuzzycat.verify import lxml from bs4 import BeautifulSoup from confluent_kafka import Consumer, KafkaException -from fatcat_openapi_client import ReleaseEntity +from fatcat_openapi_client import ( + ApiClient, + ContainerEntity, + EntityEdit, + FileEntity, + FilesetEntity, + ReleaseEntity, +) from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy @@ -90,7 +97,7 @@ DOMAIN_REL_MAP: Dict[str, str] = { } -def make_rel_url(raw_url: str, default_link_rel: str = "web"): +def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]: # this is where we map specific domains to rel types, and also filter out # bad domains, 
invalid URLs, etc rel = default_link_rel @@ -101,7 +108,7 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"): return (rel, raw_url) -def test_make_rel_url(): +def test_make_rel_url() -> None: assert make_rel_url("http://example.com/thing.pdf")[0] == "web" assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans" assert ( @@ -145,7 +152,7 @@ class EntityImporter: implementors must write insert_batch appropriately """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_extra = kwargs.get("editgroup_extra", dict()) eg_extra["git_rev"] = eg_extra.get( @@ -212,7 +219,7 @@ class EntityImporter: # implementations should fill this in raise NotImplementedError - def finish(self): + def finish(self) -> Counter: """ Gets called as cleanup at the end of imports, but can also be called at any time to "snip off" current editgroup progress. In other words, safe @@ -238,7 +245,7 @@ class EntityImporter: return self.counts - def get_editgroup_id(self, edits=1): + def get_editgroup_id(self, edits: int = 1) -> str: if self._edit_count >= self.edit_batch_size: if self.submit_mode: self.api.submit_editgroup(self._editgroup_id) @@ -257,30 +264,31 @@ class EntityImporter: self._editgroup_id = eg.editgroup_id self._edit_count += edits + assert self._editgroup_id return self._editgroup_id - def create_container(self, entity): + def create_container(self, entity: ContainerEntity) -> EntityEdit: eg_id = self.get_editgroup_id() self.counts["inserted.container"] += 1 return self.api.create_container(eg_id, entity) - def create_release(self, entity): + def create_release(self, entity: ReleaseEntity) -> EntityEdit: eg_id = self.get_editgroup_id() self.counts["inserted.release"] += 1 return self.api.create_release(eg_id, entity) - def create_file(self, entity): + def create_file(self, entity: FileEntity) -> EntityEdit: eg_id = self.get_editgroup_id() self.counts["inserted.file"] += 1 return 
self.api.create_file(eg_id, entity) - def updated(self): + def updated(self) -> None: """ Implementations should call this from try_update() if the update was successful """ self.counts["update"] += 1 - def push_entity(self, entity): + def push_entity(self, entity: Any) -> None: self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) @@ -294,7 +302,7 @@ class EntityImporter: """ return True - def try_update(self, raw_record): + def try_update(self, raw_record: Any) -> Optional[bool]: """ Passed the output of parse_record(). Should try to find an existing entity and update it (PUT), decide we should do nothing (based on the @@ -307,15 +315,17 @@ class EntityImporter: """ raise NotImplementedError - def insert_batch(self, raw_records: List[Any]): + def insert_batch(self, raw_records: List[Any]) -> None: raise NotImplementedError def is_orcid(self, orcid: str) -> bool: # TODO: replace with clean_orcid() from fatcat_tools.normal return self._orcid_regex.match(orcid) is not None - def lookup_orcid(self, orcid: str): - """Caches calls to the Orcid lookup API endpoint in a local dict""" + def lookup_orcid(self, orcid: str) -> Optional[str]: + """Caches calls to the Orcid lookup API endpoint in a local dict. 
+ + Returns a creator fatcat ident if found, else None""" if not self.is_orcid(orcid): return None if orcid in self._orcid_id_map: @@ -335,7 +345,7 @@ class EntityImporter: # TODO: replace with clean_doi() from fatcat_tools.normal return doi.startswith("10.") and doi.count("/") >= 1 - def lookup_doi(self, doi: str): + def lookup_doi(self, doi: str) -> Optional[str]: """Caches calls to the doi lookup API endpoint in a local dict For identifier lookups only (not full object fetches)""" @@ -354,7 +364,7 @@ class EntityImporter: self._doi_id_map[doi] = release_id # might be None return release_id - def lookup_pmid(self, pmid: str): + def lookup_pmid(self, pmid: str) -> Optional[str]: """Caches calls to the pmid lookup API endpoint in a local dict For identifier lookups only (not full object fetches)""" @@ -374,7 +384,7 @@ class EntityImporter: def is_issnl(self, issnl: str) -> bool: return len(issnl) == 9 and issnl[4] == "-" - def lookup_issnl(self, issnl: str): + def lookup_issnl(self, issnl: str) -> Optional[str]: """Caches calls to the ISSN-L lookup API endpoint in a local dict""" if issnl in self._issnl_id_map: return self._issnl_id_map[issnl] @@ -389,7 +399,7 @@ class EntityImporter: self._issnl_id_map[issnl] = container_id # might be None return container_id - def read_issn_map_file(self, issn_map_file): + def read_issn_map_file(self, issn_map_file: Sequence) -> None: print("Loading ISSN map file...", file=sys.stderr) self._issn_issnl_map = dict() for line in issn_map_file: @@ -407,7 +417,7 @@ class EntityImporter: return self._issn_issnl_map.get(issn) @staticmethod - def generic_file_cleanups(existing): + def generic_file_cleanups(existing: FileEntity) -> FileEntity: """ Conservative cleanup of existing file entities. 
@@ -453,7 +463,7 @@ class EntityImporter: return existing @staticmethod - def generic_fileset_cleanups(existing): + def generic_fileset_cleanups(existing: FilesetEntity) -> FilesetEntity: return existing def match_existing_release_fuzzy( @@ -520,10 +530,10 @@ class RecordPusher: wraps an importer and pushes records in to it. """ - def __init__(self, importer, **kwargs): + def __init__(self, importer: EntityImporter, **kwargs) -> None: self.importer = importer - def run(self): + def run(self) -> Counter: """ This will look something like: @@ -536,11 +546,11 @@ class RecordPusher: class JsonLinePusher(RecordPusher): - def __init__(self, importer, json_file, **kwargs): + def __init__(self, importer: EntityImporter, json_file: Sequence, **kwargs) -> None: self.importer = importer self.json_file = json_file - def run(self): + def run(self) -> Counter: for line in self.json_file: if not line: continue @@ -552,11 +562,11 @@ class JsonLinePusher(RecordPusher): class CsvPusher(RecordPusher): - def __init__(self, importer, csv_file, **kwargs): + def __init__(self, importer: EntityImporter, csv_file: Any, **kwargs) -> None: self.importer = importer self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ",")) - def run(self): + def run(self) -> Counter: for line in self.reader: if not line: continue @@ -567,11 +577,11 @@ class CsvPusher(RecordPusher): class LinePusher(RecordPusher): - def __init__(self, importer, text_file, **kwargs): + def __init__(self, importer: EntityImporter, text_file: Sequence, **kwargs) -> None: self.importer = importer self.text_file = text_file - def run(self): + def run(self) -> Counter: for line in self.text_file: if not line: continue @@ -582,14 +592,21 @@ class LinePusher(RecordPusher): class SqlitePusher(RecordPusher): - def __init__(self, importer, db_file, table_name, where_clause="", **kwargs): + def __init__( + self, + importer: EntityImporter, + db_file: str, + table_name: str, + where_clause: str = "", + **kwargs + ) -> 
None: self.importer = importer self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self.db.row_factory = sqlite3.Row self.table_name = table_name self.where_clause = where_clause - def run(self): + def run(self) -> Counter: cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause)) for row in cur: self.importer.push_record(row) @@ -599,12 +616,18 @@ class SqlitePusher(RecordPusher): class Bs4XmlLinesPusher(RecordPusher): - def __init__(self, importer, xml_file, prefix_filter=None, **kwargs): + def __init__( + self, + importer: EntityImporter, + xml_file: Sequence, + prefix_filter: Optional[str] = None, + **kwargs + ) -> None: self.importer = importer self.xml_file = xml_file self.prefix_filter = prefix_filter - def run(self): + def run(self) -> Counter: for line in self.xml_file: if not line: continue @@ -619,12 +642,14 @@ class Bs4XmlLinesPusher(RecordPusher): class Bs4XmlFilePusher(RecordPusher): - def __init__(self, importer, xml_file, record_tag, **kwargs): + def __init__( + self, importer: EntityImporter, xml_file: Any, record_tag: str, **kwargs + ) -> None: self.importer = importer self.xml_file = xml_file self.record_tag = record_tag - def run(self): + def run(self) -> Counter: soup = BeautifulSoup(self.xml_file, "xml") for record in soup.find_all(self.record_tag): self.importer.push_record(record) @@ -654,13 +679,20 @@ class Bs4XmlLargeFilePusher(RecordPusher): by inner container/release API lookup caches. 
""" - def __init__(self, importer, xml_file, record_tags, use_lxml=False, **kwargs): + def __init__( + self, + importer: EntityImporter, + xml_file: Any, + record_tags: List[str], + use_lxml: bool = False, + **kwargs + ) -> None: self.importer = importer self.xml_file = xml_file self.record_tags = record_tags self.use_lxml = use_lxml - def run(self): + def run(self) -> Counter: if self.use_lxml: elem_iter = lxml.etree.iterparse(self.xml_file, ["start", "end"], load_dtd=True) else: @@ -691,12 +723,14 @@ class Bs4XmlLargeFilePusher(RecordPusher): class Bs4XmlFileListPusher(RecordPusher): - def __init__(self, importer, list_file, record_tag, **kwargs): + def __init__( + self, importer: EntityImporter, list_file: Sequence, record_tag: str, **kwargs + ) -> None: self.importer = importer self.list_file = list_file self.record_tag = record_tag - def run(self): + def run(self) -> Counter: for xml_path in self.list_file: xml_path = xml_path.strip() if not xml_path or xml_path.startswith("#"): @@ -717,7 +751,15 @@ class KafkaBs4XmlPusher(RecordPusher): Fetch XML for an article from Kafka, parse via Bs4. 
""" - def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + def __init__( + self, + importer: EntityImporter, + kafka_hosts: str, + kafka_env: str, + topic_suffix: str, + group: str, + **kwargs + ) -> None: self.importer = importer self.consumer = make_kafka_consumer( kafka_hosts, @@ -729,7 +771,7 @@ class KafkaBs4XmlPusher(RecordPusher): self.poll_interval = kwargs.get("poll_interval", 5.0) self.consume_batch_size = kwargs.get("consume_batch_size", 25) - def run(self): + def run(self) -> Counter: count = 0 last_push = datetime.datetime.now() while True: @@ -784,7 +826,15 @@ class KafkaBs4XmlPusher(RecordPusher): class KafkaJsonPusher(RecordPusher): - def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + def __init__( + self, + importer: EntityImporter, + kafka_hosts: str, + kafka_env: str, + topic_suffix: str, + group: str, + **kwargs + ) -> None: self.importer = importer self.consumer = make_kafka_consumer( kafka_hosts, @@ -797,7 +847,7 @@ class KafkaJsonPusher(RecordPusher): self.consume_batch_size = kwargs.get("consume_batch_size", 100) self.force_flush = kwargs.get("force_flush", False) - def run(self): + def run(self) -> Counter: count = 0 last_push = datetime.datetime.now() last_force_flush = datetime.datetime.now() @@ -862,10 +912,12 @@ class KafkaJsonPusher(RecordPusher): return counts -def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat"): +def make_kafka_consumer( + hosts: str, env: str, topic_suffix: str, group: str, kafka_namespace: str = "fatcat" +) -> Consumer: topic_name = "{}-{}.{}".format(kafka_namespace, env, topic_suffix) - def fail_fast(err, partitions): + def fail_fast(err: Any, partitions: List[Any]) -> None: if err is not None: print("Kafka consumer commit error: {}".format(err)) print("Bailing out...") @@ -900,7 +952,7 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat }, } - def on_rebalance(consumer, partitions): 
+ def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None: for p in partitions: if p.error: raise KafkaException(p.error) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index d0017002..689989d2 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,9 +1,9 @@ import datetime import sqlite3 -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client -from fatcat_openapi_client import ReleaseEntity +from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity from .common import EntityImporter, clean @@ -90,7 +90,7 @@ LICENSE_SLUG_MAP: Dict[str, str] = { } -def lookup_license_slug(raw: str) -> Optional[str]: +def lookup_license_slug(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip().replace("http://", "//").replace("https://", "//") @@ -102,7 +102,7 @@ def lookup_license_slug(raw: str) -> Optional[str]: return LICENSE_SLUG_MAP.get(raw) -def test_lookup_license_slug(): +def test_lookup_license_slug() -> None: assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" assert ( @@ -133,13 +133,13 @@ class CrossrefImporter(EntityImporter): See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc: Optional[str] = kwargs.get( "editgroup_description", "Automated import of Crossref DOI metadata, harvested from REST API", ) - eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) + eg_extra: Dict[str, Any] = kwargs.get("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") super().__init__( api, @@ -249,7 +249,7 @@ class CrossrefImporter(EntityImporter): release_type = self.map_release_type(obj["type"]) # contribs 
- def do_contribs(obj_list, ctype): + def do_contribs(obj_list: List[Dict[str, Any]], ctype: str) -> List[ReleaseContrib]: contribs = [] for i, am in enumerate(obj_list): creator_id = None @@ -257,15 +257,15 @@ class CrossrefImporter(EntityImporter): creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1]) # Sorry humans :( if am.get("given") and am.get("family"): - raw_name = "{} {}".format(am["given"], am["family"]) + raw_name: Optional[str] = "{} {}".format(am["given"], am["family"]) elif am.get("family"): raw_name = am["family"] else: # TODO: can end up empty raw_name = am.get("name") or am.get("given") - extra = dict() + extra: Dict[str, Any] = dict() if ctype == "author": - index = i + index: Optional[int] = i else: index = None raw_affiliation = None @@ -284,7 +284,7 @@ class CrossrefImporter(EntityImporter): assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) contribs.append( - fatcat_openapi_client.ReleaseContrib( + ReleaseContrib( creator_id=creator_id, index=index, raw_name=raw_name, @@ -559,7 +559,7 @@ class CrossrefImporter(EntityImporter): ) return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing DOI (don't need to try other ext idents for crossref) existing = None @@ -577,7 +577,7 @@ class CrossrefImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4c174b0b..7cc5fa20 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -14,11 +14,13 @@ import json import re import sqlite3 import sys +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple import dateparser import fatcat_openapi_client import langdetect import 
pycountry +from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity from fatcat_tools.normal import clean_doi from fatcat_tools.transforms import entity_to_dict @@ -29,7 +31,7 @@ from .common import EntityImporter, clean MAX_ABSTRACT_LENGTH = 2048 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary -CONTAINER_TYPE_MAP = { +CONTAINER_TYPE_MAP: Dict[str, str] = { "Journal": "journal", "Series": "journal", "Book Series": "book-series", @@ -38,7 +40,7 @@ CONTAINER_TYPE_MAP = { # The docs/guide should be the canonical home for these mappings; update there # first. Map various datacite type types to CSL-ish types. None means TODO or # remove. -DATACITE_TYPE_MAP = { +DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { "ris": { "THES": "thesis", "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) @@ -128,7 +130,7 @@ DATACITE_TYPE_MAP = { } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. -DATACITE_UNKNOWN_MARKERS = ( +DATACITE_UNKNOWN_MARKERS: List[str] = [ "(:unac)", # temporarily inaccessible "(:unal)", # unallowed, suppressed intentionally "(:unap)", # not applicable, makes no sense @@ -139,11 +141,11 @@ DATACITE_UNKNOWN_MARKERS = ( "(:null)", # explicitly and meaningfully empty "(:tba)", # to be assigned or announced later "(:etal)", # too numerous to list (et alia) -) +] # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking # unknown values. 
-UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( +UNKNOWN_MARKERS: Set[str] = set(DATACITE_UNKNOWN_MARKERS).union( set( ( "NA", @@ -159,7 +161,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi -DATACITE_TITLE_SPAM_WORDGROUPS = [ +DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [ { "tokens": ( "full", @@ -180,7 +182,7 @@ DATACITE_TITLE_SPAM_WORDGROUPS = [ ] # TODO(martin): merge this with other maps and lookup functions, eventually. -LICENSE_SLUG_MAP = { +LICENSE_SLUG_MAP: Dict[str, str] = { "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", @@ -222,7 +224,14 @@ class DataciteImporter(EntityImporter): Importer for datacite records. """ - def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs): + def __init__( + self, + api: ApiClient, + issn_map_file: Sequence, + debug: bool = False, + insert_log_file: bool = None, + **kwargs + ) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -255,7 +264,7 @@ class DataciteImporter(EntityImporter): print("datacite with debug={}".format(self.debug), file=sys.stderr) - def lookup_ext_ids(self, doi): + def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: """ Return dictionary of identifiers referring to the same things as the given DOI. """ @@ -291,7 +300,7 @@ class DataciteImporter(EntityImporter): jstor_id=None, ) - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ Mapping datacite JSON to ReleaseEntity. """ @@ -413,7 +422,7 @@ class DataciteImporter(EntityImporter): # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. 
# "IsPreviousVersionOf" or "IsNewVersionOf". - release_stage = "published" + release_stage: Optional[str] = "published" # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, # we might want something else than 'published'. See also: @@ -628,7 +637,7 @@ class DataciteImporter(EntityImporter): release_type = "review" # Extra information. - extra_datacite = dict() + extra_datacite: Dict[str, Any] = dict() if license_extra: extra_datacite["license"] = license_extra @@ -675,7 +684,7 @@ class DataciteImporter(EntityImporter): if relations: extra_datacite["relations"] = relations - extra = dict() + extra: Dict[str, Any] = dict() # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0", # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555", @@ -734,7 +743,7 @@ class DataciteImporter(EntityImporter): return re @staticmethod - def datacite_release_type(doi, attributes): + def datacite_release_type(doi: str, attributes: Dict[str, Any]) -> Optional[str]: """ Release type. Try to determine the release type from a variety of types supplied in datacite. The "attributes.types.resourceType" is @@ -766,7 +775,7 @@ class DataciteImporter(EntityImporter): return release_type @staticmethod - def biblio_hacks(re): + def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity: """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. @@ -817,7 +826,7 @@ class DataciteImporter(EntityImporter): return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: """ When debug is true, write the RE to stdout, not to the database. Might hide schema mismatch bugs. 
@@ -842,7 +851,7 @@ class DataciteImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: print("inserting batch ({})".format(len(batch)), file=sys.stderr) if self.insert_log_file: with open(self.insert_log_file, "a") as f: @@ -858,7 +867,13 @@ class DataciteImporter(EntityImporter): ) ) - def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None): + def parse_datacite_creators( + self, + creators: List[Dict[str, Any]], + role: str = "author", + set_index: bool = True, + doi: Optional[str] = None, + ) -> List[ReleaseContrib]: """ Parses a list of creators into a list of ReleaseContrib objects. Set set_index to False, if the index contrib field should be left blank. @@ -868,12 +883,12 @@ class DataciteImporter(EntityImporter): # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. - contribs = [] + contribs: List[ReleaseContrib] = [] # Names, that should be ignored right away. name_blocklist = set(("Occdownload Gbif.Org",)) - i = 0 + i: Optional[int] = 0 for c in creators: if not set_index: i = None @@ -983,7 +998,9 @@ class DataciteImporter(EntityImporter): return contribs -def contributor_list_contains_contributor(contributor_list, contributor): +def contributor_list_contains_contributor( + contributor_list: ReleaseContrib, contributor: ReleaseContrib +) -> bool: """ Given a list of contributors, determine, whether contrib is in that list. """ @@ -998,7 +1015,7 @@ def contributor_list_contains_contributor(contributor_list, contributor): return False -def lookup_license_slug(raw): +def lookup_license_slug(raw: Optional[str]) -> Optional[str]: """ Resolve a variety of strings into a some pseudo-canonical form, e.g. CC-BY-ND, CC-0, MIT and so on. 
@@ -1101,7 +1118,9 @@ def lookup_license_slug(raw): return LICENSE_SLUG_MAP.get(raw) -def find_original_language_title(item, min_length=4, max_questionmarks=3): +def find_original_language_title( + item: Dict[str, Any], min_length: int = 4, max_questionmarks: int = 3 +) -> Optional[str]: """ Perform a few checks before returning a potential original language title. @@ -1126,7 +1145,9 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3): return None -def parse_datacite_titles(titles): +def parse_datacite_titles( + titles: List[Dict[str, Any]] +) -> Tuple[Optional[str], Optional[str], Optional[str]]: """ Given a list of title items from datacite, return 3-tuple (title, original_language_title, subtitle). @@ -1158,7 +1179,9 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle -def parse_single_date(value): +def parse_single_date( + value: Optional[str], +) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]: """ Given a single string containing a date in arbitrary format, try to return tuple (date: datetime.date, month: int, year: int). @@ -1186,10 +1209,12 @@ def parse_single_date(value): return None, None, None -def parse_datacite_dates(dates): +def parse_datacite_dates( + dates: List[Dict[str, Any]], +) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]: """ Given a list of date fields (under .dates), return tuple, (release_date, - release_year). + release_month, release_year). 
""" release_date, release_month, release_year = None, None, None @@ -1226,9 +1251,13 @@ def parse_datacite_dates(dates): Pattern("%Y", "y"), ) - def parse_item(item): + def parse_item( + item: Dict[str, Any] + ) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]: result, value, year_only = None, str(item.get("date", "")) or "", False - release_date, release_month, release_year = None, None, None + release_date: Optional[datetime.date] = None + release_month: Optional[int] = None + release_year: Optional[int] = None for layout, granularity in common_patterns: try: @@ -1285,7 +1314,7 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year -def index_form_to_display_name(s): +def index_form_to_display_name(s: str) -> str: """ Try to convert an index form name, like 'Razis, Panos A' into display_name, e.g. 'Panos A Razis'. diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index 603a6271..36fe5f00 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -4,8 +4,10 @@ pre-scraped in to JSON from HTML pages. 
""" import sys # noqa: F401 +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ContainerEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import clean_str @@ -13,8 +15,13 @@ from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): def __init__( - self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs - ): + self, + api: ApiClient, + issn_map_file: Sequence, + dblp_container_map_file: Sequence, + dblp_container_map_output: Any, + **kwargs + ) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -29,7 +36,7 @@ class DblpContainerImporter(EntityImporter): self.read_issn_map_file(issn_map_file) print("\t".join(["dblp_prefix", "container_id"]), file=self.dblp_container_map_output) - def read_dblp_container_map_file(self, dblp_container_map_file) -> None: + def read_dblp_container_map_file(self, dblp_container_map_file: Sequence) -> None: self._dblp_container_map = dict() print("Loading existing dblp prefix container map file...", file=sys.stderr) for line in dblp_container_map_file: @@ -44,15 +51,15 @@ class DblpContainerImporter(EntityImporter): file=sys.stderr, ) - def lookup_dblp_prefix(self, prefix): + def lookup_dblp_prefix(self, prefix: str) -> Optional[str]: if not prefix: return None return self._dblp_container_map.get(prefix) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). 
@@ -77,7 +84,7 @@ class DblpContainerImporter(EntityImporter): if issnl: break - extra = { + extra: Dict[str, Any] = { "dblp": { "prefix": dblp_prefix, }, @@ -98,7 +105,7 @@ class DblpContainerImporter(EntityImporter): ) return ce - def try_update(self, ce): + def try_update(self, ce: ContainerEntity) -> bool: dblp_prefix = ce.extra["dblp"]["prefix"] existing = None @@ -135,7 +142,7 @@ class DblpContainerImporter(EntityImporter): # shouldn't get here raise NotImplementedError() - def insert_batch(self, batch): + def insert_batch(self, batch: List[ContainerEntity]) -> None: """ Because we want to print a prefix/container_id match for each row, we require a special batch insert method diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index e73e5f33..cb56432a 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -24,10 +24,11 @@ import datetime import json import sys # noqa: F401 import warnings -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional, Sequence import bs4 import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ReleaseEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import ( @@ -44,7 +45,9 @@ from fatcat_tools.transforms import entity_to_dict class DblpReleaseImporter(EntityImporter): - def __init__(self, api, dblp_container_map_file=None, **kwargs): + def __init__( + self, api: ApiClient, dblp_container_map_file: Optional[Sequence] = None, **kwargs + ) -> None: eg_desc = kwargs.get( "editgroup_description", "Automated import of dblp metadata via XML records" @@ -70,7 +73,7 @@ class DblpReleaseImporter(EntityImporter): # "data", # no instances in 2020-11 dump ] - def read_dblp_container_map_file(self, dblp_container_map_file) -> None: + def read_dblp_container_map_file(self, dblp_container_map_file: Optional[Sequence]) -> None: 
self._dblp_container_map = dict() if not dblp_container_map_file: print( @@ -91,12 +94,12 @@ class DblpReleaseImporter(EntityImporter): file=sys.stderr, ) - def lookup_dblp_prefix(self, prefix): + def lookup_dblp_prefix(self, prefix: Optional[str]) -> Optional[str]: if not prefix: return None return self._dblp_container_map.get(prefix) - def want(self, xml_elem): + def want(self, xml_elem: Any) -> bool: if xml_elem.name not in self.ELEMENT_TYPES: self.counts["skip-type"] += 1 return False @@ -108,7 +111,8 @@ class DblpReleaseImporter(EntityImporter): return False return True - def parse_record(self, xml_elem): + # TODO: xml_elem could be typed instead of 'Any' for better type checking + def parse_record(self, xml_elem: Any) -> Optional[ReleaseEntity]: """ - title => may contain <i>, <sub>, <sup>, <tt> @@ -255,7 +259,7 @@ class DblpReleaseImporter(EntityImporter): dblp_extra["part_of_key"] = part_of_key # generic extra - extra = dict() + extra: Dict[str, Any] = dict() if not container_id and container_name: extra["container_name"] = container_name @@ -312,14 +316,14 @@ class DblpReleaseImporter(EntityImporter): return re @staticmethod - def biblio_hacks(re): + def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity: """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. 
""" return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing release by dblp article id existing = None @@ -411,7 +415,7 @@ class DblpReleaseImporter(EntityImporter): return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 56045ea7..9ff4f3fb 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -6,9 +6,10 @@ DOAJ API schema and docs: https://doaj.org/api/v1/docs import datetime import warnings -from typing import List, Optional +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ReleaseEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import ( @@ -28,7 +29,7 @@ MAX_ABSTRACT_LENGTH = 2048 class DoajArticleImporter(EntityImporter): - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -49,10 +50,10 @@ class DoajArticleImporter(EntityImporter): self.this_year = datetime.datetime.now().year self.read_issn_map_file(issn_map_file) - def want(self, obj): + def want(self, raw_record: Dict[str, Any]) -> bool: return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ bibjson { abstract (string, optional), @@ -108,7 +109,7 @@ class DoajArticleImporter(EntityImporter): publisher = clean_str(bibjson["journal"].get("publisher")) try: - release_year = int(bibjson.get("year")) + release_year: Optional[int] = int(bibjson.get("year")) except (TypeError, ValueError): release_year = None 
release_month = parse_month(clean_str(bibjson.get("month"))) @@ -148,7 +149,7 @@ class DoajArticleImporter(EntityImporter): contribs = self.doaj_contribs(bibjson.get("author") or []) # DOAJ-specific extra - doaj_extra = dict() + doaj_extra: Dict[str, Any] = dict() if bibjson.get("subject"): doaj_extra["subject"] = bibjson.get("subject") if bibjson.get("keywords"): @@ -157,7 +158,7 @@ class DoajArticleImporter(EntityImporter): ] # generic extra - extra = dict() + extra: Dict[str, Any] = dict() if country: extra["country"] = country if not container_id and container_name: @@ -194,14 +195,14 @@ class DoajArticleImporter(EntityImporter): return re @staticmethod - def biblio_hacks(re): + def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity: """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. """ return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing release by DOAJ article id existing = None @@ -276,7 +277,7 @@ class DoajArticleImporter(EntityImporter): return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 26584ff3..892c1dcd 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,4 +1,7 @@ +from typing import Any, Dict + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from .common import EntityImporter @@ -14,7 +17,7 @@ class FileMetaImporter(EntityImporter): imported which were missing file size, mimetype, md5, and/or sha256. 
""" - def __init__(self, api, require_grobid=True, **kwargs): + def __init__(self, api: ApiClient, require_grobid: bool = True, **kwargs): eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates" eg_extra = kwargs.pop("editgroup_extra", dict()) @@ -22,14 +25,14 @@ class FileMetaImporter(EntityImporter): kwargs["do_updates"] = kwargs.get("do_updates", True) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Any) -> bool: for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"): if not row.get(k): self.counts["skip-missing-field"] += 1 return False return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> FileEntity: # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False @@ -44,7 +47,7 @@ class FileMetaImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index dd8f5600..2207b938 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FilesetEntity from fatcat_tools import entity_from_dict @@ -17,7 +20,7 @@ class FilesetImporter(EntityImporter): Currently only creates (insert), no updates. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import" eg_extra = kwargs.pop("editgroup_extra", dict()) @@ -29,7 +32,7 @@ class FilesetImporter(EntityImporter): # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: if not row.get("release_ids"): self.counts["skip-no-release-ids"] += 1 return False @@ -47,7 +50,7 @@ class FilesetImporter(EntityImporter): return False return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[FilesetEntity]: fse = entity_from_dict( row, @@ -57,7 +60,7 @@ class FilesetImporter(EntityImporter): fse = self.generic_fileset_cleanups(fse) return fse - def try_update(self, fse): + def try_update(self, fse: FilesetEntity) -> bool: if not self.skip_release_fileset_check: for release_id in fse.release_ids: @@ -74,7 +77,7 @@ class FilesetImporter(EntityImporter): # do the insert return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[FilesetEntity]) -> None: self.api.create_fileset_auto_batch( fatcat_openapi_client.FilesetAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index f7bb5357..830c9bbb 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -2,8 +2,10 @@ import base64 import json +from typing import Any, Dict, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity from .common import EntityImporter, clean, make_rel_url @@ -22,7 +24,7 @@ class GrobidMetadataImporter(EntityImporter): TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -34,10 +36,10 @@ class GrobidMetadataImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") self.longtail_oa = kwargs.get("longtail_oa", False) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, row): + def parse_record(self, row: str) -> Optional[FileEntity]: fields = row.split("\t") sha1_key = fields[0] @@ -72,12 +74,12 @@ class GrobidMetadataImporter(EntityImporter): fe.release_ids.append(release_edit.ident) return fe - def parse_grobid_json(self, obj): + def parse_grobid_json(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: if not obj.get("title"): return None - extra_grobid = dict() + extra_grobid: Dict[str, Any] = dict() abstract = obj.get("abstract") if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: @@ -103,7 +105,7 @@ class GrobidMetadataImporter(EntityImporter): refs = [] for raw in obj.get("citations", []): - cite_extra = dict() + cite_extra: Dict[str, Any] = dict() year = None if raw.get("date"): try: @@ -162,13 +164,15 @@ class GrobidMetadataImporter(EntityImporter): publisher=clean(obj["journal"].get("publisher")), volume=clean(obj["journal"].get("volume")), issue=clean(obj["journal"].get("issue")), - abstracts=abstracts, + abstracts=abstracts or None, ext_ids=fatcat_openapi_client.ReleaseExtIds(), - extra=extra, + extra=extra or None, ) return re - def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): + def parse_file_metadata( + self, sha1_key: str, cdx: Dict[str, Any], mimetype: str, file_size: int + ) -> FileEntity: sha1 = ( base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", ""))) @@ -197,11 +201,11 @@ class GrobidMetadataImporter(EntityImporter): return fe - def try_update(self, entity): + def try_update(self, re: FileEntity) -> bool: # did the exists check in 
'parse_record()', because we needed to create a release return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index e0a6c3f5..e13ce4bd 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,12 +1,23 @@ import datetime +from typing import Any, Dict, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ( + ApiClient, + FileEntity, + FilesetEntity, + FilesetUrl, + FileUrl, + WebcaptureEntity, +) from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): - def __init__(self, api, require_grobid=True, **kwargs): + def __init__( + self, api: fatcat_openapi_client.ApiClient, require_grobid: bool = True, **kwargs + ) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -41,7 +52,7 @@ class IngestFileResultImporter(EntityImporter): if kwargs.get("skip_source_allowlist", False): self.ingest_request_source_allowlist = [] - def want_file(self, row) -> bool: + def want_file(self, row: Dict[str, Any]) -> bool: """ File-specific part of want(). Generic across general ingest and save-paper-now. """ @@ -76,7 +87,7 @@ class IngestFileResultImporter(EntityImporter): return True - def want_ingest(self, row) -> bool: + def want_ingest(self, row: Dict[str, Any]) -> bool: """ Sandcrawler ingest-specific part of want(). Generic across file and webcapture ingest. 
@@ -115,7 +126,7 @@ class IngestFileResultImporter(EntityImporter): return True - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: """ Overall logic here probably needs work (TODO): @@ -137,7 +148,7 @@ class IngestFileResultImporter(EntityImporter): return True - def parse_ingest_release_ident(self, row): + def parse_ingest_release_ident(self, row: Dict[str, Any]) -> Optional[str]: request = row["request"] fatcat = request.get("fatcat") @@ -178,7 +189,7 @@ class IngestFileResultImporter(EntityImporter): return release_ident - def parse_terminal(self, row): + def parse_terminal(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: terminal = row.get("terminal") if not terminal: # support old cdx-only ingest results @@ -206,7 +217,7 @@ class IngestFileResultImporter(EntityImporter): ) return terminal - def parse_urls(self, row, terminal): + def parse_urls(self, row: Dict[str, Any], terminal: Dict[str, Any]) -> List[FileUrl]: request = row["request"] @@ -224,10 +235,10 @@ class IngestFileResultImporter(EntityImporter): ) urls = [url, ("webarchive", wayback)] - urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + urls = [FileUrl(rel=rel, url=url) for (rel, url) in urls] return urls - def parse_edit_extra(self, row): + def parse_edit_extra(self, row: Dict[str, Any]) -> Dict[str, Any]: request = row["request"] edit_extra = dict() @@ -251,7 +262,7 @@ class IngestFileResultImporter(EntityImporter): return edit_extra - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> FileEntity: request = row["request"] file_meta = row["file_meta"] @@ -283,7 +294,7 @@ class IngestFileResultImporter(EntityImporter): urls = self.parse_urls(row, terminal) - fe = fatcat_openapi_client.FileEntity( + fe = FileEntity( md5=file_meta["md5hex"], sha1=file_meta["sha1hex"], sha256=file_meta["sha256hex"], @@ -298,7 +309,7 @@ class IngestFileResultImporter(EntityImporter): fe.edit_extra = edit_extra return fe - def 
try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None try: @@ -330,7 +341,7 @@ class IngestFileResultImporter(EntityImporter): self.counts["skip-update-disabled"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: if self.submit_mode: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( @@ -358,7 +369,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter): them for further human review (as opposed to accepting by default). """ - def __init__(self, api, submit_mode=True, **kwargs): + def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -371,7 +382,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: source = row["request"].get("ingest_request_source") if not source: @@ -397,7 +408,7 @@ class IngestWebResultImporter(IngestFileResultImporter): into webcapture objects. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -408,7 +419,7 @@ class IngestWebResultImporter(IngestFileResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: if not self.want_ingest(row): return False @@ -426,7 +437,7 @@ class IngestWebResultImporter(IngestFileResultImporter): return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[WebcaptureEntity]: request = row["request"] file_meta = row["file_meta"] @@ -512,7 +523,7 @@ class IngestWebResultImporter(IngestFileResultImporter): wc.edit_extra = edit_extra return wc - def try_update(self, wc): + def try_update(self, wc: WebcaptureEntity) -> bool: # check for existing edits-in-progress with same URL for other in self._entity_queue: @@ -539,7 +550,7 @@ class IngestWebResultImporter(IngestFileResultImporter): # so go ahead and insert! return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[WebcaptureEntity]) -> None: if self.submit_mode: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( @@ -565,7 +576,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter): Like SavePaperNowFileImporter, but for webcapture (HTML) ingest. """ - def __init__(self, api, submit_mode=True, **kwargs): + def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -577,7 +588,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: """ Relatively custom want() here, a synthesis of other filters. 
@@ -617,7 +628,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): results into fileset objects. """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -629,7 +640,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.max_file_count = 300 - def want_fileset(self, row): + def want_fileset(self, row: Dict[str, Any]) -> bool: if not row.get("manifest") or len(row.get("manifest")) == 0: self.counts["skip-empty-manifest"] += 1 @@ -645,7 +656,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: if not self.want_ingest(row): return False @@ -662,7 +673,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True - def parse_fileset_urls(self, row): + def parse_fileset_urls(self, row: Dict[str, Any]) -> List[FilesetUrl]: if not row.get("strategy"): return [] strategy = row["strategy"] @@ -717,7 +728,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): ) return urls - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> FilesetEntity: request = row["request"] @@ -735,7 +746,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): self.counts["skip-release-not-found"] += 1 return None - entity_extra = dict() + entity_extra: Dict[str, Any] = dict() edit_extra = self.parse_edit_extra(row) edit_extra["ingest_strategy"] = row["ingest_strategy"] if row.get("platform"): @@ -789,12 +800,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter): fe.edit_extra = edit_extra return fe - def try_update(self, wc): + def try_update(self, fse: FilesetEntity) -> bool: # check for existing edits-in-progress with same URL for other in self._entity_queue: # XXX: how to duplicate check? 
- if other.original_url == wc.original_url: + if other.original_url == fse.original_url: self.counts["skip-in-queue"] += 1 return False @@ -802,12 +813,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter): # existing = None # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release - release = self.api.get_release(wc.release_ids[0], expand="filesets") + release = self.api.get_release(fse.release_ids[0], expand="filesets") if release.filesets: # XXX: how to duplicate check filesets? # check if this is an existing match, or just a similar hit for other in release.filesets: - if wc.original_url == other.original_url: + if fse.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) self.counts["exists"] += 1 return False @@ -816,7 +827,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[FilesetEntity]) -> None: if self.submit_mode: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( @@ -842,7 +853,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): Like SavePaperNowFileImporter, but for fileset/dataset ingest. 
""" - def __init__(self, api, submit_mode=True, **kwargs): + def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -854,7 +865,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: source = row["request"].get("ingest_request_source") if not source: diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index a7e06e6a..f540c264 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,16 +1,19 @@ import datetime import sqlite3 import sys +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity from fatcat_tools.normal import clean_doi from .common import DATE_FMT, EntityImporter, clean, is_cjk -def parse_jalc_persons(raw_persons): +# TODO: should be List[Tag] not List[Any] for full type annotations +def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]: """ For the most part, JALC DC names are in either japanese or english. 
The two common patterns are a list alternating between the two (in which case @@ -47,7 +50,7 @@ def parse_jalc_persons(raw_persons): if lang == "en" and surname and given_name: # english names order is flipped name = "{} {}".format(given_name, surname) - rc = fatcat_openapi_client.ReleaseContrib( + rc = ReleaseContrib( raw_name=name, surname=surname, given_name=given_name, role="author" ) # add an extra hint field; won't end up in serialized object @@ -100,7 +103,7 @@ class JalcImporter(EntityImporter): NOTE: some JALC DOIs seem to get cross-registered with Crossref """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata") eg_extra = kwargs.get("editgroup_extra", dict()) @@ -125,7 +128,7 @@ class JalcImporter(EntityImporter): self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi): + def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: if self.extid_map_db is None: return dict( core_id=None, @@ -158,10 +161,12 @@ class JalcImporter(EntityImporter): jstor_id=None, ) - def want(self, obj): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, record): + # TODO: mypy annotations partially skipped on this function ('Any' instead of 'Tag') + # for now because XML parsing annotations are large and complex + def parse_record(self, record: Any) -> Optional[ReleaseEntity]: """ record is a beautiful soup object returns a ReleaseEntity, or None @@ -170,8 +175,8 @@ class JalcImporter(EntityImporter): fields. 
""" - extra = dict() - extra_jalc = dict() + extra: Dict[str, Any] = dict() + extra_jalc: Dict[str, Any] = dict() titles = record.find_all("title") if not titles: @@ -254,7 +259,7 @@ class JalcImporter(EntityImporter): publisher = None container_name = None - container_extra = dict() + container_extra: Dict[str, Any] = dict() if record.publicationName: pubs = [ @@ -335,7 +340,7 @@ class JalcImporter(EntityImporter): if not title: return None - re = fatcat_openapi_client.ReleaseEntity( + re = ReleaseEntity( work_id=None, title=title, original_title=clean(original_title), @@ -364,7 +369,7 @@ class JalcImporter(EntityImporter): ) return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing DOI existing = None @@ -384,7 +389,7 @@ class JalcImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( @@ -394,7 +399,7 @@ class JalcImporter(EntityImporter): ) ) - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: """ Helper for testing; can run this file stand-alone instead of using a pusher """ @@ -408,4 +413,3 @@ class JalcImporter(EntityImporter): # print(json.dumps(resp)) print(resp) # sys.exit(-1) - diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 6d1fefa3..a45e49f3 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,9 +1,12 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ContainerEntity from .common import EntityImporter, clean -def or_none(s): +def or_none(s: Optional[str]) -> Optional[str]: if s is None: return None if len(s) == 0: @@ -11,7 +14,7 @@ def or_none(s): return s -def truthy(s): 
+def truthy(s: Optional[str]) -> Optional[bool]: if s is None: return None s = s.lower() @@ -32,7 +35,7 @@ class JournalMetadataImporter(EntityImporter): See guide for details on the many 'extra' fields used here. """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -42,12 +45,12 @@ class JournalMetadataImporter(EntityImporter): eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: if raw_record.get("issnl") and raw_record.get("name"): return True return False - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). @@ -106,7 +109,7 @@ class JournalMetadataImporter(EntityImporter): if not name: return None - ce = fatcat_openapi_client.ContainerEntity( + ce = ContainerEntity( issnl=row["issnl"], issne=row.get("issne"), issnp=row.get("issnp"), @@ -118,7 +121,7 @@ class JournalMetadataImporter(EntityImporter): ) return ce - def try_update(self, ce): + def try_update(self, ce: ContainerEntity) -> bool: existing = None try: @@ -148,7 +151,7 @@ class JournalMetadataImporter(EntityImporter): # if we got this far, it's a bug raise NotImplementedError - def insert_batch(self, batch): + def insert_batch(self, batch: List[ContainerEntity]) -> None: self.api.create_container_auto_batch( fatcat_openapi_client.ContainerAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 287fb308..0a6eec65 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -2,9 +2,11 @@ import datetime import json import sys import warnings +from typing import Any, Dict, List, Optional, 
Sequence import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from .common import LANG_MAP_MARC, EntityImporter, clean from .crossref import CONTAINER_TYPE_MAP @@ -32,7 +34,7 @@ class JstorImporter(EntityImporter): Collection) """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata") eg_extra = kwargs.get("editgroup_extra", dict()) @@ -49,19 +51,22 @@ class JstorImporter(EntityImporter): self.read_issn_map_file(issn_map_file) - def map_container_type(self, crossref_type): + def map_container_type(self, crossref_type: Optional[str]) -> Optional[str]: return CONTAINER_TYPE_MAP.get(crossref_type) - def want(self, obj): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, article): + # TODO: mypy annotations partially skipped on this function ('Any' instead of + # 'BeautifulSoup') for now because XML parsing annotations are large and + # complex + def parse_record(self, article: Any) -> Optional[ReleaseEntity]: journal_meta = article.front.find("journal-meta") article_meta = article.front.find("article-meta") - extra = dict() - extra_jstor = dict() + extra: Dict[str, Any] = dict() + extra_jstor: Dict[str, Any] = dict() release_type = JSTOR_TYPE_MAP.get(article["article-type"]) title = article_meta.find("article-title") @@ -269,7 +274,7 @@ class JstorImporter(EntityImporter): ) return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # first, lookup existing by JSTOR id (which much be defined) existing = None @@ -313,7 +318,7 @@ class JstorImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( 
editgroup=fatcat_openapi_client.Editgroup( @@ -323,7 +328,7 @@ class JstorImporter(EntityImporter): ) ) - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. open with beautiful soup soup = BeautifulSoup(handle, "xml") diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 7c2a6a87..9c80dd72 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from fatcat_tools.normal import clean_doi @@ -29,7 +32,7 @@ class MatchedImporter(EntityImporter): - core_id, wikidata_id, pmcid, pmid: not as lists """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -41,10 +44,10 @@ class MatchedImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mimetype = kwargs.get("default_mimetype", None) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]: dois = [d.lower() for d in obj.get("dois", [])] # lookup dois @@ -129,7 +132,7 @@ class MatchedImporter(EntityImporter): if urls[0].url.endswith(".pdf"): mimetype = "application/pdf" - fe = fatcat_openapi_client.FileEntity( + fe = FileEntity( md5=obj.get("md5"), sha1=obj["sha1"], sha256=obj.get("sha256"), @@ -140,7 +143,7 @@ class MatchedImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None try: @@ -207,7 +210,7 @@ class MatchedImporter(EntityImporter): self.counts["update"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: 
self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index b514e6e5..430cdd0f 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,11 +1,13 @@ import sys +from typing import Any, Dict, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, CreatorEntity from .common import EntityImporter, clean -def value_or_none(e): +def value_or_none(e: Any) -> Any: if type(e) == dict: e = e.get("value") if type(e) == str and len(e) == 0: @@ -22,7 +24,7 @@ def value_or_none(e): class OrcidImporter(EntityImporter): - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -32,10 +34,10 @@ class OrcidImporter(EntityImporter): eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[CreatorEntity]: """ obj is a python dict (parsed from json). 
returns a CreatorEntity @@ -67,7 +69,7 @@ class OrcidImporter(EntityImporter): if not display: # must have *some* name return None - ce = fatcat_openapi_client.CreatorEntity( + ce = CreatorEntity( orcid=orcid, given_name=clean(given), surname=clean(sur), @@ -76,10 +78,10 @@ class OrcidImporter(EntityImporter): ) return ce - def try_update(self, raw_record): + def try_update(self, ce: CreatorEntity) -> bool: existing = None try: - existing = self.api.lookup_creator(orcid=raw_record.orcid) + existing = self.api.lookup_creator(orcid=ce.orcid) except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err @@ -92,7 +94,7 @@ class OrcidImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[CreatorEntity]) -> None: self.api.create_creator_auto_batch( fatcat_openapi_client.CreatorAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 97433445..41268925 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -2,9 +2,11 @@ import datetime import json import sys import warnings +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid @@ -328,7 +330,9 @@ class PubmedImporter(EntityImporter): TODO: MEDLINE doesn't include PMC/OA license; could include in importer? 
""" - def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): + def __init__( + self, api: ApiClient, issn_map_file: Sequence, lookup_refs: bool = True, **kwargs + ): eg_desc = kwargs.get( "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" @@ -347,10 +351,13 @@ class PubmedImporter(EntityImporter): self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) - def want(self, obj): + def want(self, raw_record: BeautifulSoup) -> bool: return True - def parse_record(self, a): + # TODO: mypy annotations partially skipped on this function ('Any' instead of + # 'BeautifulSoup') for now because XML parsing annotations are large and + # complex + def parse_record(self, a: Any) -> ReleaseEntity: medline = a.MedlineCitation # PubmedData isn't required by DTD, but seems to always be present @@ -482,8 +489,8 @@ class PubmedImporter(EntityImporter): pub_date = journal.PubDate if not pub_date: pub_date = journal.JournalIssue.PubDate - release_date = None - release_year = None + release_date: Optional[str] = None + release_year: Optional[int] = None if pub_date.Year: release_year = int(pub_date.Year.string) if pub_date.find("Day") and pub_date.find("Month"): @@ -578,7 +585,7 @@ class PubmedImporter(EntityImporter): abstracts.append(abst) other_abstracts = medline.find_all("OtherAbstract") for other in other_abstracts: - lang = "en" + lang: Optional[str] = "en" if other.get("Language"): lang = LANG_MAP_MARC.get(other["Language"]) abst = fatcat_openapi_client.ReleaseAbstract( @@ -666,7 +673,7 @@ class PubmedImporter(EntityImporter): # that there may be multiple ReferenceList (eg, sometimes one per # Reference) for ref in pubmed.find_all("Reference"): - ref_extra = dict() + ref_extra: Dict[str, Any] = dict() ref_doi = ref.find("ArticleId", IdType="doi") if ref_doi: ref_doi = clean_doi(ref_doi.string) @@ -740,7 +747,7 @@ class PubmedImporter(EntityImporter): ) return re - def try_update(self, re): + def 
try_update(self, re: ReleaseEntity) -> bool: # first, lookup existing by PMID (which must be defined) existing = None @@ -831,7 +838,7 @@ class PubmedImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( @@ -841,7 +848,7 @@ class PubmedImporter(EntityImporter): ) ) - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. open with beautiful soup soup = BeautifulSoup(handle, "xml") diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 78eeec7a..520258cb 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid @@ -27,7 +30,7 @@ class ShadowLibraryImporter(EntityImporter): - datetime """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -38,7 +41,7 @@ class ShadowLibraryImporter(EntityImporter): super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: """ Only want to import records with complete file-level metadata """ @@ -51,7 +54,7 @@ class ShadowLibraryImporter(EntityImporter): return False return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]: """ We do the release lookup in this method. Try DOI, then PMID, last ISBN13. 
""" @@ -104,7 +107,7 @@ class ShadowLibraryImporter(EntityImporter): urls.append(("webarchive", wayback)) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] - fe = fatcat_openapi_client.FileEntity( + fe = FileEntity( md5=obj["file_meta"]["md5hex"], sha1=obj["file_meta"]["sha1hex"], sha256=obj["file_meta"]["sha256hex"], @@ -116,7 +119,7 @@ class ShadowLibraryImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> Optional[bool]: # lookup sha1, or create new entity existing = None try: @@ -189,7 +192,7 @@ class ShadowLibraryImporter(EntityImporter): self.counts["update"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 22fefad3..f9ee29c9 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -12,12 +12,14 @@ import hashlib import json import subprocess import sys +from typing import Any, Dict, List, Optional, Tuple import requests from bs4 import BeautifulSoup from fatcat_openapi_client import ( ApiClient, Editgroup, + EntityEdit, WebcaptureCdxLine, WebcaptureEntity, WebcaptureUrl, @@ -30,7 +32,7 @@ GWB_URL_BASE = "https://web.archive.org/web" REQ_SESSION = requests.Session() -def parse_wbm_url(url): +def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]: """Takes a wayback machine URL, and returns a tuple: (timestamp, datetime, original_url) @@ -42,7 +44,7 @@ def parse_wbm_url(url): return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) -def test_parse_wbm_url(): +def test_parse_wbm_url() -> None: u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" assert 
parse_wbm_url(u) == ( "20010712114837", @@ -51,7 +53,7 @@ def test_parse_wbm_url(): ) -def parse_wbm_timestamp(timestamp): +def parse_wbm_timestamp(timestamp: str) -> datetime.datetime: """ Takes a complete WBM timestamp string (like "20020327115625") and returns a python datetime object (UTC) @@ -71,18 +73,20 @@ def parse_wbm_timestamp(timestamp): ) -def test_parse_wbm_timestamp(): +def test_parse_wbm_timestamp() -> None: assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) -def fetch_wbm(url): +def fetch_wbm(url: str) -> bytes: resp = REQ_SESSION.get(url) resp.raise_for_status() assert resp.content return resp.content -def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): +def lookup_cdx( + embed_url: str, verify_hashes: bool = True, cdx_output: Any = None +) -> Optional[WebcaptureCdxLine]: sys.stderr.write(embed_url + "\n") assert embed_url.startswith("/web/") embed_url = embed_url.split("/") @@ -132,7 +136,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): return None -def wayback_url_to_relative(url): +def wayback_url_to_relative(url: str) -> Optional[str]: """ Wayback URLs can be relative or absolute in rewritten documents. 
This function converts any form of rewritten URL to a relative (to @@ -149,7 +153,7 @@ def wayback_url_to_relative(url): return None -def extract_embeds(soup): +def extract_embeds(soup: BeautifulSoup) -> List[str]: embeds = set() @@ -175,7 +179,7 @@ def extract_embeds(soup): return list(embeds) -def static_wayback_webcapture(wayback_url, cdx_output=None): +def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity: """ Given a complete wayback machine capture URL, like: @@ -214,7 +218,9 @@ def static_wayback_webcapture(wayback_url, cdx_output=None): return wc -def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): +def auto_wayback_static( + api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None +) -> Tuple[Optional[str], Optional[EntityEdit]]: """ Returns a tuple: (editgroup_id, edit). If failed, both are None """ @@ -250,7 +256,7 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): return (editgroup_id, edit) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--verbose", action="store_true", help="verbose output") parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") |