| field | value | date |
|---|---|---|
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 12:57:32 -0700 |
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 16:46:07 -0700 |
| commit | caf1cb316ed18820f3239a285ef14bf45ef963a2 | |
| tree | 2d3713773dac769878154f61c2eb9f7804f1a60c /python | |
| parent | 10a2374051568edf3d872988e730328d899a0fdd | |
typing: initial annotations on importers
This commit just adds the type annotations; it does not fix the code to make type checking pass.
Diffstat (limited to 'python')
22 files changed, 443 insertions, 274 deletions
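The annotations below follow one repeated pattern across the importer classes: the `EntityImporter` hook methods (`want()`, `parse_record()`, `try_update()`, `insert_batch()`) gain parameter and return types, entity classes are imported directly from `fatcat_openapi_client`, and inputs whose shape is not yet pinned down (raw rows, BeautifulSoup elements) are typed as `Any` for now. As rough orientation, here is a minimal sketch of that annotated shape, using a hypothetical `ExampleFileImporter` rather than any of the actual classes changed below:

```python
from typing import Any, Dict, List, Optional

import fatcat_openapi_client
from fatcat_openapi_client import ApiClient, FileEntity

from .common import EntityImporter


class ExampleFileImporter(EntityImporter):
    """Hypothetical importer, only to illustrate the annotation pattern."""

    def __init__(self, api: ApiClient, **kwargs) -> None:
        super().__init__(api, **kwargs)

    def want(self, row: Any) -> bool:
        # cheap pre-filter over the raw record; skipped rows get counted
        return bool(row.get("sha1hex"))

    def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:
        # map a raw dict to an entity, or return None to skip it
        return FileEntity(
            sha1=row["sha1hex"],
            size=row.get("size_bytes"),
            mimetype=row.get("mimetype"),
        )

    def try_update(self, fe: FileEntity) -> bool:
        # True: pass the entity on to insert_batch(); False: handled or skipped
        return True

    def insert_batch(self, batch: List[FileEntity]) -> None:
        # real importers call the appropriate auto-batch endpoint here,
        # e.g. self.api.create_file_auto_batch(...) as in arabesque.py below
        raise NotImplementedError
```

The other recurring changes in the patch are the same shape: mutable `dict()` locals get explicit `Dict[str, Any]` annotations, module-level constants get `Dict[...]`/`List[...]`/`Set[...]` types, and the pushers in `common.py` gain `-> Counter` return types on `run()`. As the commit message notes, this only adds annotations; making a type checker pass over these modules is left for follow-up work.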
diff --git a/python/fatcat_tools/importers/arabesque.py b/python/fatcat_tools/importers/arabesque.py index ae4f9049..2fb7be55 100644 --- a/python/fatcat_tools/importers/arabesque.py +++ b/python/fatcat_tools/importers/arabesque.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from .common import SANE_MAX_RELEASES, SANE_MAX_URLS, EntityImporter, b32_hex, make_rel_url @@ -36,7 +39,9 @@ class ArabesqueMatchImporter(EntityImporter): - a mode to insert bare files even if identifier not known? """ - def __init__(self, api, extid_type, require_grobid=True, **kwargs): + def __init__( + self, api: ApiClient, extid_type: str, require_grobid: bool = True, **kwargs + ) -> None: eg_desc = ( kwargs.get("editgroup_description", None) @@ -59,7 +64,7 @@ class ArabesqueMatchImporter(EntityImporter): else: print("NOT checking GROBID status column") - def want(self, row): + def want(self, row: Any) -> bool: if self.require_grobid and not row["postproc_status"] == "200": return False if ( @@ -76,7 +81,7 @@ class ArabesqueMatchImporter(EntityImporter): else: return False - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]: extid = row["identifier"].strip() @@ -131,7 +136,7 @@ class ArabesqueMatchImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None try: @@ -182,7 +187,7 @@ class ArabesqueMatchImporter(EntityImporter): self.counts["update"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py index 0957db2c..1d50dd9a 100644 --- a/python/fatcat_tools/importers/arxiv.py +++ b/python/fatcat_tools/importers/arxiv.py @@ -2,9 +2,11 @@ import datetime import json import re import sys +from typing import Any, Dict, List, Optional import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from pylatexenc.latex2text import LatexNodes2Text from .common import EntityImporter @@ -13,7 +15,7 @@ from .crossref import lookup_license_slug latex2text = LatexNodes2Text() -def latex_to_text(raw): +def latex_to_text(raw: str) -> str: try: return latex2text.latex_to_text(raw).strip() except AttributeError: @@ -22,7 +24,7 @@ def latex_to_text(raw): return raw.strip() -def parse_arxiv_authors(raw): +def parse_arxiv_authors(raw: str) -> List[str]: if not raw: return [] raw = raw.replace("*", "") @@ -41,7 +43,7 @@ def parse_arxiv_authors(raw): return authors -def test_parse_arxiv_authors(): +def test_parse_arxiv_authors() -> None: assert parse_arxiv_authors( "Raphael Chetrite, Shamik Gupta, Izaak Neri and \\'Edgar Rold\\'an" @@ -88,7 +90,7 @@ class ArxivRawImporter(EntityImporter): the "most recent" version; can be a simple sort? 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -107,15 +109,17 @@ class ArxivRawImporter(EntityImporter): ) self._test_override = False - def parse_record(self, record): + # TODO: record is really a beautiful soup element, but setting to 'Any' to + # make initial type annotations simple + def parse_record(self, record: Any) -> Optional[List[ReleaseEntity]]: if not record: return None metadata = record.arXivRaw if not metadata: return None - extra = dict() - extra_arxiv = dict() + extra: Dict[str, Any] = dict() + extra_arxiv: Dict[str, Any] = dict() # don't know! release_type = "article" @@ -134,7 +138,7 @@ class ArxivRawImporter(EntityImporter): for i, a in enumerate(authors) ] - lang = "en" # the vast majority in english + lang: Optional[str] = "en" # the vast majority in english if metadata.comments and metadata.comments.get_text(): comments = metadata.comments.get_text().replace("\n", " ").strip() extra_arxiv["comments"] = comments @@ -229,7 +233,7 @@ class ArxivRawImporter(EntityImporter): ).date() # TODO: source_type? versions.append( - fatcat_openapi_client.ReleaseEntity( + ReleaseEntity( work_id=None, title=title, # original_title @@ -261,7 +265,7 @@ class ArxivRawImporter(EntityImporter): versions[-1].release_stage = "accepted" return versions - def try_update(self, versions): + def try_update(self, versions: List[ReleaseEntity]) -> bool: """ This is pretty complex! There is no batch/bezerk mode for arxiv importer. @@ -344,7 +348,7 @@ class ArxivRawImporter(EntityImporter): return False - def insert_batch(self, batch_batch): + def insert_batch(self, batch_batch: List[ReleaseEntity]) -> None: # there is no batch/bezerk mode for arxiv importer, except for testing if self._test_override: for batch in batch_batch: @@ -360,7 +364,7 @@ class ArxivRawImporter(EntityImporter): else: raise NotImplementedError() - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. 
open with beautiful soup soup = BeautifulSoup(handle, "xml") diff --git a/python/fatcat_tools/importers/cdl_dash_dat.py b/python/fatcat_tools/importers/cdl_dash_dat.py index e9de42fc..b88117e0 100755 --- a/python/fatcat_tools/importers/cdl_dash_dat.py +++ b/python/fatcat_tools/importers/cdl_dash_dat.py @@ -7,10 +7,13 @@ import os import subprocess import sys import urllib +import urllib.parse +from typing import Any, Dict, List, Optional, Tuple import fatcat_openapi_client import magic from fatcat_openapi_client import ( + ApiClient, Editgroup, FilesetEntity, FilesetFile, @@ -24,7 +27,7 @@ from .common import clean from .crossref import lookup_license_slug -def single_file(prefix, path): +def single_file(prefix: str, path: str) -> FilesetFile: full = prefix + path size_bytes = os.stat(full).st_size @@ -59,7 +62,7 @@ def single_file(prefix, path): return fsf -def make_manifest(base_dir): +def make_manifest(base_dir: str) -> List[FilesetFile]: manifest = [] for root, dirs, files in os.walk(base_dir): for f in files: @@ -67,7 +70,9 @@ def make_manifest(base_dir): return manifest -def cdl_dash_release(meta, extra=None): +def cdl_dash_release( + meta: Dict[str, Any], extra: Optional[Dict[str, Any]] = None +) -> ReleaseEntity: if not extra: extra = dict() @@ -124,7 +129,7 @@ def cdl_dash_release(meta, extra=None): return r -def make_release_fileset(dat_path): +def make_release_fileset(dat_path: str) -> Tuple[ReleaseEntity, FilesetEntity]: if dat_path.endswith("/"): dat_path = dat_path[:-1] @@ -170,7 +175,12 @@ def make_release_fileset(dat_path): return (release, fs) -def auto_cdl_dash_dat(api, dat_path, release_id=None, editgroup_id=None): +def auto_cdl_dash_dat( + api: ApiClient, + dat_path: str, + release_id: Optional[str] = None, + editgroup_id: Optional[str] = None, +) -> Tuple[Optional[str], Optional[ReleaseEntity], Optional[FilesetEntity]]: git_rev = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8") diff --git a/python/fatcat_tools/importers/chocula.py b/python/fatcat_tools/importers/chocula.py index 8d2a89b6..842c7853 100644 --- a/python/fatcat_tools/importers/chocula.py +++ b/python/fatcat_tools/importers/chocula.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ContainerEntity from .common import EntityImporter, clean @@ -12,7 +15,7 @@ class ChoculaImporter(EntityImporter): See guide for details on the many 'extra' fields used here. """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -22,7 +25,7 @@ class ChoculaImporter(EntityImporter): eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.ChoculaImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: if not raw_record.get("ident") and not raw_record.get("_known_issnl"): self.counts["skip-unknown-new-issnl"] += 1 return False @@ -30,7 +33,7 @@ class ChoculaImporter(EntityImporter): return True return False - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). 
@@ -75,7 +78,7 @@ class ChoculaImporter(EntityImporter): elif "journal " in name.lower(): container_type = "journal" - ce = fatcat_openapi_client.ContainerEntity( + ce = ContainerEntity( issnl=row["issnl"], issnp=row["extra"].get("issnp"), issne=row["extra"].get("issne"), @@ -88,7 +91,7 @@ class ChoculaImporter(EntityImporter): ) return ce - def try_update(self, ce): + def try_update(self, ce: ContainerEntity) -> bool: existing = None if ce.ident: @@ -193,7 +196,7 @@ class ChoculaImporter(EntityImporter): # if we got this far, it's a bug raise NotImplementedError - def insert_batch(self, batch): + def insert_batch(self, batch: List[ContainerEntity]) -> None: self.api.create_container_auto_batch( fatcat_openapi_client.ContainerAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py index 0b68e5fe..fd472d11 100644 --- a/python/fatcat_tools/importers/common.py +++ b/python/fatcat_tools/importers/common.py @@ -7,7 +7,7 @@ import subprocess import sys import xml.etree.ElementTree as ET from collections import Counter -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Sequence, Tuple import elasticsearch import fatcat_openapi_client @@ -16,7 +16,14 @@ import fuzzycat.verify import lxml from bs4 import BeautifulSoup from confluent_kafka import Consumer, KafkaException -from fatcat_openapi_client import ReleaseEntity +from fatcat_openapi_client import ( + ApiClient, + ContainerEntity, + EntityEdit, + FileEntity, + FilesetEntity, + ReleaseEntity, +) from fatcat_openapi_client.rest import ApiException from fuzzycat.matching import match_release_fuzzy @@ -90,7 +97,7 @@ DOMAIN_REL_MAP: Dict[str, str] = { } -def make_rel_url(raw_url: str, default_link_rel: str = "web"): +def make_rel_url(raw_url: str, default_link_rel: str = "web") -> Tuple[str, str]: # this is where we map specific domains to rel types, and also filter out # bad domains, invalid URLs, etc rel = default_link_rel @@ -101,7 +108,7 @@ def make_rel_url(raw_url: str, default_link_rel: str = "web"): return (rel, raw_url) -def test_make_rel_url(): +def test_make_rel_url() -> None: assert make_rel_url("http://example.com/thing.pdf")[0] == "web" assert make_rel_url("http://example.com/thing.pdf", default_link_rel="jeans")[0] == "jeans" assert ( @@ -145,7 +152,7 @@ class EntityImporter: implementors must write insert_batch appropriately """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_extra = kwargs.get("editgroup_extra", dict()) eg_extra["git_rev"] = eg_extra.get( @@ -212,7 +219,7 @@ class EntityImporter: # implementations should fill this in raise NotImplementedError - def finish(self): + def finish(self) -> Counter: """ Gets called as cleanup at the end of imports, but can also be called at any time to "snip off" current editgroup progress. 
In other words, safe @@ -238,7 +245,7 @@ class EntityImporter: return self.counts - def get_editgroup_id(self, edits=1): + def get_editgroup_id(self, edits: int = 1) -> str: if self._edit_count >= self.edit_batch_size: if self.submit_mode: self.api.submit_editgroup(self._editgroup_id) @@ -257,30 +264,31 @@ class EntityImporter: self._editgroup_id = eg.editgroup_id self._edit_count += edits + assert self._editgroup_id return self._editgroup_id - def create_container(self, entity): + def create_container(self, entity: ContainerEntity) -> EntityEdit: eg_id = self.get_editgroup_id() self.counts["inserted.container"] += 1 return self.api.create_container(eg_id, entity) - def create_release(self, entity): + def create_release(self, entity: ReleaseEntity) -> EntityEdit: eg_id = self.get_editgroup_id() self.counts["inserted.release"] += 1 return self.api.create_release(eg_id, entity) - def create_file(self, entity): + def create_file(self, entity: FileEntity) -> EntityEdit: eg_id = self.get_editgroup_id() self.counts["inserted.file"] += 1 return self.api.create_file(eg_id, entity) - def updated(self): + def updated(self) -> None: """ Implementations should call this from try_update() if the update was successful """ self.counts["update"] += 1 - def push_entity(self, entity): + def push_entity(self, entity: Any) -> None: self._entity_queue.append(entity) if len(self._entity_queue) >= self.edit_batch_size: self.insert_batch(self._entity_queue) @@ -294,7 +302,7 @@ class EntityImporter: """ return True - def try_update(self, raw_record): + def try_update(self, raw_record: Any) -> Optional[bool]: """ Passed the output of parse_record(). Should try to find an existing entity and update it (PUT), decide we should do nothing (based on the @@ -307,15 +315,17 @@ class EntityImporter: """ raise NotImplementedError - def insert_batch(self, raw_records: List[Any]): + def insert_batch(self, raw_records: List[Any]) -> None: raise NotImplementedError def is_orcid(self, orcid: str) -> bool: # TODO: replace with clean_orcid() from fatcat_tools.normal return self._orcid_regex.match(orcid) is not None - def lookup_orcid(self, orcid: str): - """Caches calls to the Orcid lookup API endpoint in a local dict""" + def lookup_orcid(self, orcid: str) -> Optional[str]: + """Caches calls to the Orcid lookup API endpoint in a local dict. 
+ + Returns a creator fatcat ident if found, else None""" if not self.is_orcid(orcid): return None if orcid in self._orcid_id_map: @@ -335,7 +345,7 @@ class EntityImporter: # TODO: replace with clean_doi() from fatcat_tools.normal return doi.startswith("10.") and doi.count("/") >= 1 - def lookup_doi(self, doi: str): + def lookup_doi(self, doi: str) -> Optional[str]: """Caches calls to the doi lookup API endpoint in a local dict For identifier lookups only (not full object fetches)""" @@ -354,7 +364,7 @@ class EntityImporter: self._doi_id_map[doi] = release_id # might be None return release_id - def lookup_pmid(self, pmid: str): + def lookup_pmid(self, pmid: str) -> Optional[str]: """Caches calls to the pmid lookup API endpoint in a local dict For identifier lookups only (not full object fetches)""" @@ -374,7 +384,7 @@ class EntityImporter: def is_issnl(self, issnl: str) -> bool: return len(issnl) == 9 and issnl[4] == "-" - def lookup_issnl(self, issnl: str): + def lookup_issnl(self, issnl: str) -> Optional[str]: """Caches calls to the ISSN-L lookup API endpoint in a local dict""" if issnl in self._issnl_id_map: return self._issnl_id_map[issnl] @@ -389,7 +399,7 @@ class EntityImporter: self._issnl_id_map[issnl] = container_id # might be None return container_id - def read_issn_map_file(self, issn_map_file): + def read_issn_map_file(self, issn_map_file: Sequence) -> None: print("Loading ISSN map file...", file=sys.stderr) self._issn_issnl_map = dict() for line in issn_map_file: @@ -407,7 +417,7 @@ class EntityImporter: return self._issn_issnl_map.get(issn) @staticmethod - def generic_file_cleanups(existing): + def generic_file_cleanups(existing: FileEntity) -> FileEntity: """ Conservative cleanup of existing file entities. @@ -453,7 +463,7 @@ class EntityImporter: return existing @staticmethod - def generic_fileset_cleanups(existing): + def generic_fileset_cleanups(existing: FilesetEntity) -> FilesetEntity: return existing def match_existing_release_fuzzy( @@ -520,10 +530,10 @@ class RecordPusher: wraps an importer and pushes records in to it. 
""" - def __init__(self, importer, **kwargs): + def __init__(self, importer: EntityImporter, **kwargs) -> None: self.importer = importer - def run(self): + def run(self) -> Counter: """ This will look something like: @@ -536,11 +546,11 @@ class RecordPusher: class JsonLinePusher(RecordPusher): - def __init__(self, importer, json_file, **kwargs): + def __init__(self, importer: EntityImporter, json_file: Sequence, **kwargs) -> None: self.importer = importer self.json_file = json_file - def run(self): + def run(self) -> Counter: for line in self.json_file: if not line: continue @@ -552,11 +562,11 @@ class JsonLinePusher(RecordPusher): class CsvPusher(RecordPusher): - def __init__(self, importer, csv_file, **kwargs): + def __init__(self, importer: EntityImporter, csv_file: Any, **kwargs) -> None: self.importer = importer self.reader = csv.DictReader(csv_file, delimiter=kwargs.get("delimiter", ",")) - def run(self): + def run(self) -> Counter: for line in self.reader: if not line: continue @@ -567,11 +577,11 @@ class CsvPusher(RecordPusher): class LinePusher(RecordPusher): - def __init__(self, importer, text_file, **kwargs): + def __init__(self, importer: EntityImporter, text_file: Sequence, **kwargs) -> None: self.importer = importer self.text_file = text_file - def run(self): + def run(self) -> Counter: for line in self.text_file: if not line: continue @@ -582,14 +592,21 @@ class LinePusher(RecordPusher): class SqlitePusher(RecordPusher): - def __init__(self, importer, db_file, table_name, where_clause="", **kwargs): + def __init__( + self, + importer: EntityImporter, + db_file: str, + table_name: str, + where_clause: str = "", + **kwargs + ) -> None: self.importer = importer self.db = sqlite3.connect(db_file, isolation_level="EXCLUSIVE") self.db.row_factory = sqlite3.Row self.table_name = table_name self.where_clause = where_clause - def run(self): + def run(self) -> Counter: cur = self.db.execute("SELECT * FROM {} {};".format(self.table_name, self.where_clause)) for row in cur: self.importer.push_record(row) @@ -599,12 +616,18 @@ class SqlitePusher(RecordPusher): class Bs4XmlLinesPusher(RecordPusher): - def __init__(self, importer, xml_file, prefix_filter=None, **kwargs): + def __init__( + self, + importer: EntityImporter, + xml_file: Sequence, + prefix_filter: Optional[str] = None, + **kwargs + ) -> None: self.importer = importer self.xml_file = xml_file self.prefix_filter = prefix_filter - def run(self): + def run(self) -> Counter: for line in self.xml_file: if not line: continue @@ -619,12 +642,14 @@ class Bs4XmlLinesPusher(RecordPusher): class Bs4XmlFilePusher(RecordPusher): - def __init__(self, importer, xml_file, record_tag, **kwargs): + def __init__( + self, importer: EntityImporter, xml_file: Any, record_tag: str, **kwargs + ) -> None: self.importer = importer self.xml_file = xml_file self.record_tag = record_tag - def run(self): + def run(self) -> Counter: soup = BeautifulSoup(self.xml_file, "xml") for record in soup.find_all(self.record_tag): self.importer.push_record(record) @@ -654,13 +679,20 @@ class Bs4XmlLargeFilePusher(RecordPusher): by inner container/release API lookup caches. 
""" - def __init__(self, importer, xml_file, record_tags, use_lxml=False, **kwargs): + def __init__( + self, + importer: EntityImporter, + xml_file: Any, + record_tags: List[str], + use_lxml: bool = False, + **kwargs + ) -> None: self.importer = importer self.xml_file = xml_file self.record_tags = record_tags self.use_lxml = use_lxml - def run(self): + def run(self) -> Counter: if self.use_lxml: elem_iter = lxml.etree.iterparse(self.xml_file, ["start", "end"], load_dtd=True) else: @@ -691,12 +723,14 @@ class Bs4XmlLargeFilePusher(RecordPusher): class Bs4XmlFileListPusher(RecordPusher): - def __init__(self, importer, list_file, record_tag, **kwargs): + def __init__( + self, importer: EntityImporter, list_file: Sequence, record_tag: str, **kwargs + ) -> None: self.importer = importer self.list_file = list_file self.record_tag = record_tag - def run(self): + def run(self) -> Counter: for xml_path in self.list_file: xml_path = xml_path.strip() if not xml_path or xml_path.startswith("#"): @@ -717,7 +751,15 @@ class KafkaBs4XmlPusher(RecordPusher): Fetch XML for an article from Kafka, parse via Bs4. """ - def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + def __init__( + self, + importer: EntityImporter, + kafka_hosts: str, + kafka_env: str, + topic_suffix: str, + group: str, + **kwargs + ) -> None: self.importer = importer self.consumer = make_kafka_consumer( kafka_hosts, @@ -729,7 +771,7 @@ class KafkaBs4XmlPusher(RecordPusher): self.poll_interval = kwargs.get("poll_interval", 5.0) self.consume_batch_size = kwargs.get("consume_batch_size", 25) - def run(self): + def run(self) -> Counter: count = 0 last_push = datetime.datetime.now() while True: @@ -784,7 +826,15 @@ class KafkaBs4XmlPusher(RecordPusher): class KafkaJsonPusher(RecordPusher): - def __init__(self, importer, kafka_hosts, kafka_env, topic_suffix, group, **kwargs): + def __init__( + self, + importer: EntityImporter, + kafka_hosts: str, + kafka_env: str, + topic_suffix: str, + group: str, + **kwargs + ) -> None: self.importer = importer self.consumer = make_kafka_consumer( kafka_hosts, @@ -797,7 +847,7 @@ class KafkaJsonPusher(RecordPusher): self.consume_batch_size = kwargs.get("consume_batch_size", 100) self.force_flush = kwargs.get("force_flush", False) - def run(self): + def run(self) -> Counter: count = 0 last_push = datetime.datetime.now() last_force_flush = datetime.datetime.now() @@ -862,10 +912,12 @@ class KafkaJsonPusher(RecordPusher): return counts -def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat"): +def make_kafka_consumer( + hosts: str, env: str, topic_suffix: str, group: str, kafka_namespace: str = "fatcat" +) -> Consumer: topic_name = "{}-{}.{}".format(kafka_namespace, env, topic_suffix) - def fail_fast(err, partitions): + def fail_fast(err: Any, partitions: List[Any]) -> None: if err is not None: print("Kafka consumer commit error: {}".format(err)) print("Bailing out...") @@ -900,7 +952,7 @@ def make_kafka_consumer(hosts, env, topic_suffix, group, kafka_namespace="fatcat }, } - def on_rebalance(consumer, partitions): + def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None: for p in partitions: if p.error: raise KafkaException(p.error) diff --git a/python/fatcat_tools/importers/crossref.py b/python/fatcat_tools/importers/crossref.py index d0017002..689989d2 100644 --- a/python/fatcat_tools/importers/crossref.py +++ b/python/fatcat_tools/importers/crossref.py @@ -1,9 +1,9 @@ import datetime import sqlite3 -from typing import Any, Dict, 
Optional +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client -from fatcat_openapi_client import ReleaseEntity +from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity from .common import EntityImporter, clean @@ -90,7 +90,7 @@ LICENSE_SLUG_MAP: Dict[str, str] = { } -def lookup_license_slug(raw: str) -> Optional[str]: +def lookup_license_slug(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip().replace("http://", "//").replace("https://", "//") @@ -102,7 +102,7 @@ def lookup_license_slug(raw: str) -> Optional[str]: return LICENSE_SLUG_MAP.get(raw) -def test_lookup_license_slug(): +def test_lookup_license_slug() -> None: assert lookup_license_slug("https://creativecommons.org/licenses/by-nc/3.0/") == "CC-BY-NC" assert ( @@ -133,13 +133,13 @@ class CrossrefImporter(EntityImporter): See https://github.com/CrossRef/rest-api-doc for JSON schema notes """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc: Optional[str] = kwargs.get( "editgroup_description", "Automated import of Crossref DOI metadata, harvested from REST API", ) - eg_extra: Optional[dict] = kwargs.get("editgroup_extra", dict()) + eg_extra: Dict[str, Any] = kwargs.get("editgroup_extra", dict()) eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.CrossrefImporter") super().__init__( api, @@ -249,7 +249,7 @@ class CrossrefImporter(EntityImporter): release_type = self.map_release_type(obj["type"]) # contribs - def do_contribs(obj_list, ctype): + def do_contribs(obj_list: List[Dict[str, Any]], ctype: str) -> List[ReleaseContrib]: contribs = [] for i, am in enumerate(obj_list): creator_id = None @@ -257,15 +257,15 @@ class CrossrefImporter(EntityImporter): creator_id = self.lookup_orcid(am["ORCID"].split("/")[-1]) # Sorry humans :( if am.get("given") and am.get("family"): - raw_name = "{} {}".format(am["given"], am["family"]) + raw_name: Optional[str] = "{} {}".format(am["given"], am["family"]) elif am.get("family"): raw_name = am["family"] else: # TODO: can end up empty raw_name = am.get("name") or am.get("given") - extra = dict() + extra: Dict[str, Any] = dict() if ctype == "author": - index = i + index: Optional[int] = i else: index = None raw_affiliation = None @@ -284,7 +284,7 @@ class CrossrefImporter(EntityImporter): assert ctype in ("author", "editor", "translator") raw_name = clean(raw_name) contribs.append( - fatcat_openapi_client.ReleaseContrib( + ReleaseContrib( creator_id=creator_id, index=index, raw_name=raw_name, @@ -559,7 +559,7 @@ class CrossrefImporter(EntityImporter): ) return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing DOI (don't need to try other ext idents for crossref) existing = None @@ -577,7 +577,7 @@ class CrossrefImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4c174b0b..7cc5fa20 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -14,11 +14,13 @@ import json import re import sqlite3 import sys +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple import dateparser import fatcat_openapi_client import langdetect 
import pycountry +from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity from fatcat_tools.normal import clean_doi from fatcat_tools.transforms import entity_to_dict @@ -29,7 +31,7 @@ from .common import EntityImporter, clean MAX_ABSTRACT_LENGTH = 2048 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary -CONTAINER_TYPE_MAP = { +CONTAINER_TYPE_MAP: Dict[str, str] = { "Journal": "journal", "Series": "journal", "Book Series": "book-series", @@ -38,7 +40,7 @@ CONTAINER_TYPE_MAP = { # The docs/guide should be the canonical home for these mappings; update there # first. Map various datacite type types to CSL-ish types. None means TODO or # remove. -DATACITE_TYPE_MAP = { +DATACITE_TYPE_MAP: Dict[str, Dict[str, Optional[str]]] = { "ris": { "THES": "thesis", "SOUND": "song", # 99.9% maps to citeproc song, so use that (exception: report) @@ -128,7 +130,7 @@ DATACITE_TYPE_MAP = { } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. -DATACITE_UNKNOWN_MARKERS = ( +DATACITE_UNKNOWN_MARKERS: List[str] = [ "(:unac)", # temporarily inaccessible "(:unal)", # unallowed, suppressed intentionally "(:unap)", # not applicable, makes no sense @@ -139,11 +141,11 @@ DATACITE_UNKNOWN_MARKERS = ( "(:null)", # explicitly and meaningfully empty "(:tba)", # to be assigned or announced later "(:etal)", # too numerous to list (et alia) -) +] # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking # unknown values. -UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( +UNKNOWN_MARKERS: Set[str] = set(DATACITE_UNKNOWN_MARKERS).union( set( ( "NA", @@ -159,7 +161,7 @@ UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union( UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi -DATACITE_TITLE_SPAM_WORDGROUPS = [ +DATACITE_TITLE_SPAM_WORDGROUPS: List[Dict[str, Any]] = [ { "tokens": ( "full", @@ -180,7 +182,7 @@ DATACITE_TITLE_SPAM_WORDGROUPS = [ ] # TODO(martin): merge this with other maps and lookup functions, eventually. -LICENSE_SLUG_MAP = { +LICENSE_SLUG_MAP: Dict[str, str] = { "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", @@ -222,7 +224,14 @@ class DataciteImporter(EntityImporter): Importer for datacite records. """ - def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs): + def __init__( + self, + api: ApiClient, + issn_map_file: Sequence, + debug: bool = False, + insert_log_file: bool = None, + **kwargs + ) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -255,7 +264,7 @@ class DataciteImporter(EntityImporter): print("datacite with debug={}".format(self.debug), file=sys.stderr) - def lookup_ext_ids(self, doi): + def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: """ Return dictionary of identifiers referring to the same things as the given DOI. """ @@ -291,7 +300,7 @@ class DataciteImporter(EntityImporter): jstor_id=None, ) - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ Mapping datacite JSON to ReleaseEntity. """ @@ -413,7 +422,7 @@ class DataciteImporter(EntityImporter): # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. 
# "IsPreviousVersionOf" or "IsNewVersionOf". - release_stage = "published" + release_stage: Optional[str] = "published" # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, # we might want something else than 'published'. See also: @@ -628,7 +637,7 @@ class DataciteImporter(EntityImporter): release_type = "review" # Extra information. - extra_datacite = dict() + extra_datacite: Dict[str, Any] = dict() if license_extra: extra_datacite["license"] = license_extra @@ -675,7 +684,7 @@ class DataciteImporter(EntityImporter): if relations: extra_datacite["relations"] = relations - extra = dict() + extra: Dict[str, Any] = dict() # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0", # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555", @@ -734,7 +743,7 @@ class DataciteImporter(EntityImporter): return re @staticmethod - def datacite_release_type(doi, attributes): + def datacite_release_type(doi: str, attributes: Dict[str, Any]) -> Optional[str]: """ Release type. Try to determine the release type from a variety of types supplied in datacite. The "attributes.types.resourceType" is @@ -766,7 +775,7 @@ class DataciteImporter(EntityImporter): return release_type @staticmethod - def biblio_hacks(re): + def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity: """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. @@ -817,7 +826,7 @@ class DataciteImporter(EntityImporter): return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: """ When debug is true, write the RE to stdout, not to the database. Might hide schema mismatch bugs. @@ -842,7 +851,7 @@ class DataciteImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: print("inserting batch ({})".format(len(batch)), file=sys.stderr) if self.insert_log_file: with open(self.insert_log_file, "a") as f: @@ -858,7 +867,13 @@ class DataciteImporter(EntityImporter): ) ) - def parse_datacite_creators(self, creators, role="author", set_index=True, doi=None): + def parse_datacite_creators( + self, + creators: List[Dict[str, Any]], + role: str = "author", + set_index: bool = True, + doi: Optional[str] = None, + ) -> List[ReleaseContrib]: """ Parses a list of creators into a list of ReleaseContrib objects. Set set_index to False, if the index contrib field should be left blank. @@ -868,12 +883,12 @@ class DataciteImporter(EntityImporter): # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. - contribs = [] + contribs: List[ReleaseContrib] = [] # Names, that should be ignored right away. name_blocklist = set(("Occdownload Gbif.Org",)) - i = 0 + i: Optional[int] = 0 for c in creators: if not set_index: i = None @@ -983,7 +998,9 @@ class DataciteImporter(EntityImporter): return contribs -def contributor_list_contains_contributor(contributor_list, contributor): +def contributor_list_contains_contributor( + contributor_list: ReleaseContrib, contributor: ReleaseContrib +) -> bool: """ Given a list of contributors, determine, whether contrib is in that list. 
""" @@ -998,7 +1015,7 @@ def contributor_list_contains_contributor(contributor_list, contributor): return False -def lookup_license_slug(raw): +def lookup_license_slug(raw: Optional[str]) -> Optional[str]: """ Resolve a variety of strings into a some pseudo-canonical form, e.g. CC-BY-ND, CC-0, MIT and so on. @@ -1101,7 +1118,9 @@ def lookup_license_slug(raw): return LICENSE_SLUG_MAP.get(raw) -def find_original_language_title(item, min_length=4, max_questionmarks=3): +def find_original_language_title( + item: Dict[str, Any], min_length: int = 4, max_questionmarks: int = 3 +) -> Optional[str]: """ Perform a few checks before returning a potential original language title. @@ -1126,7 +1145,9 @@ def find_original_language_title(item, min_length=4, max_questionmarks=3): return None -def parse_datacite_titles(titles): +def parse_datacite_titles( + titles: List[Dict[str, Any]] +) -> Tuple[Optional[str], Optional[str], Optional[str]]: """ Given a list of title items from datacite, return 3-tuple (title, original_language_title, subtitle). @@ -1158,7 +1179,9 @@ def parse_datacite_titles(titles): return title, original_language_title, subtitle -def parse_single_date(value): +def parse_single_date( + value: Optional[str], +) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]: """ Given a single string containing a date in arbitrary format, try to return tuple (date: datetime.date, month: int, year: int). @@ -1186,10 +1209,12 @@ def parse_single_date(value): return None, None, None -def parse_datacite_dates(dates): +def parse_datacite_dates( + dates: List[Dict[str, Any]], +) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]: """ Given a list of date fields (under .dates), return tuple, (release_date, - release_year). + release_month, release_year). """ release_date, release_month, release_year = None, None, None @@ -1226,9 +1251,13 @@ def parse_datacite_dates(dates): Pattern("%Y", "y"), ) - def parse_item(item): + def parse_item( + item: Dict[str, Any] + ) -> Tuple[Optional[datetime.date], Optional[int], Optional[int]]: result, value, year_only = None, str(item.get("date", "")) or "", False - release_date, release_month, release_year = None, None, None + release_date: Optional[datetime.date] = None + release_month: Optional[int] = None + release_year: Optional[int] = None for layout, granularity in common_patterns: try: @@ -1285,7 +1314,7 @@ def parse_datacite_dates(dates): return release_date, release_month, release_year -def index_form_to_display_name(s): +def index_form_to_display_name(s: str) -> str: """ Try to convert an index form name, like 'Razis, Panos A' into display_name, e.g. 'Panos A Razis'. diff --git a/python/fatcat_tools/importers/dblp_container.py b/python/fatcat_tools/importers/dblp_container.py index 603a6271..36fe5f00 100644 --- a/python/fatcat_tools/importers/dblp_container.py +++ b/python/fatcat_tools/importers/dblp_container.py @@ -4,8 +4,10 @@ pre-scraped in to JSON from HTML pages. 
""" import sys # noqa: F401 +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ContainerEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import clean_str @@ -13,8 +15,13 @@ from fatcat_tools.normal import clean_str class DblpContainerImporter(EntityImporter): def __init__( - self, api, issn_map_file, dblp_container_map_file, dblp_container_map_output, **kwargs - ): + self, + api: ApiClient, + issn_map_file: Sequence, + dblp_container_map_file: Sequence, + dblp_container_map_output: Any, + **kwargs + ) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -29,7 +36,7 @@ class DblpContainerImporter(EntityImporter): self.read_issn_map_file(issn_map_file) print("\t".join(["dblp_prefix", "container_id"]), file=self.dblp_container_map_output) - def read_dblp_container_map_file(self, dblp_container_map_file) -> None: + def read_dblp_container_map_file(self, dblp_container_map_file: Sequence) -> None: self._dblp_container_map = dict() print("Loading existing dblp prefix container map file...", file=sys.stderr) for line in dblp_container_map_file: @@ -44,15 +51,15 @@ class DblpContainerImporter(EntityImporter): file=sys.stderr, ) - def lookup_dblp_prefix(self, prefix): + def lookup_dblp_prefix(self, prefix: str) -> Optional[str]: if not prefix: return None return self._dblp_container_map.get(prefix) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). @@ -77,7 +84,7 @@ class DblpContainerImporter(EntityImporter): if issnl: break - extra = { + extra: Dict[str, Any] = { "dblp": { "prefix": dblp_prefix, }, @@ -98,7 +105,7 @@ class DblpContainerImporter(EntityImporter): ) return ce - def try_update(self, ce): + def try_update(self, ce: ContainerEntity) -> bool: dblp_prefix = ce.extra["dblp"]["prefix"] existing = None @@ -135,7 +142,7 @@ class DblpContainerImporter(EntityImporter): # shouldn't get here raise NotImplementedError() - def insert_batch(self, batch): + def insert_batch(self, batch: List[ContainerEntity]) -> None: """ Because we want to print a prefix/container_id match for each row, we require a special batch insert method diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index e73e5f33..cb56432a 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -24,10 +24,11 @@ import datetime import json import sys # noqa: F401 import warnings -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional, Sequence import bs4 import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ReleaseEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import ( @@ -44,7 +45,9 @@ from fatcat_tools.transforms import entity_to_dict class DblpReleaseImporter(EntityImporter): - def __init__(self, api, dblp_container_map_file=None, **kwargs): + def __init__( + self, api: ApiClient, dblp_container_map_file: Optional[Sequence] = None, **kwargs + ) -> None: eg_desc = kwargs.get( "editgroup_description", "Automated import of dblp metadata via XML records" @@ -70,7 +73,7 @@ class DblpReleaseImporter(EntityImporter): # "data", # no instances in 2020-11 dump ] - def read_dblp_container_map_file(self, dblp_container_map_file) -> None: 
+ def read_dblp_container_map_file(self, dblp_container_map_file: Optional[Sequence]) -> None: self._dblp_container_map = dict() if not dblp_container_map_file: print( @@ -91,12 +94,12 @@ class DblpReleaseImporter(EntityImporter): file=sys.stderr, ) - def lookup_dblp_prefix(self, prefix): + def lookup_dblp_prefix(self, prefix: Optional[str]) -> Optional[str]: if not prefix: return None return self._dblp_container_map.get(prefix) - def want(self, xml_elem): + def want(self, xml_elem: Any) -> bool: if xml_elem.name not in self.ELEMENT_TYPES: self.counts["skip-type"] += 1 return False @@ -108,7 +111,8 @@ class DblpReleaseImporter(EntityImporter): return False return True - def parse_record(self, xml_elem): + # TODO: xml_elem could be typed instead of 'Any' for better type checking + def parse_record(self, xml_elem: Any) -> Optional[ReleaseEntity]: """ - title => may contain <i>, <sub>, <sup>, <tt> @@ -255,7 +259,7 @@ class DblpReleaseImporter(EntityImporter): dblp_extra["part_of_key"] = part_of_key # generic extra - extra = dict() + extra: Dict[str, Any] = dict() if not container_id and container_name: extra["container_name"] = container_name @@ -312,14 +316,14 @@ class DblpReleaseImporter(EntityImporter): return re @staticmethod - def biblio_hacks(re): + def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity: """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. """ return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing release by dblp article id existing = None @@ -411,7 +415,7 @@ class DblpReleaseImporter(EntityImporter): return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/doaj_article.py b/python/fatcat_tools/importers/doaj_article.py index 56045ea7..9ff4f3fb 100644 --- a/python/fatcat_tools/importers/doaj_article.py +++ b/python/fatcat_tools/importers/doaj_article.py @@ -6,9 +6,10 @@ DOAJ API schema and docs: https://doaj.org/api/v1/docs import datetime import warnings -from typing import List, Optional +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ReleaseEntity from fatcat_tools.importers.common import EntityImporter from fatcat_tools.normal import ( @@ -28,7 +29,7 @@ MAX_ABSTRACT_LENGTH = 2048 class DoajArticleImporter(EntityImporter): - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -49,10 +50,10 @@ class DoajArticleImporter(EntityImporter): self.this_year = datetime.datetime.now().year self.read_issn_map_file(issn_map_file) - def want(self, obj): + def want(self, raw_record: Dict[str, Any]) -> bool: return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: """ bibjson { abstract (string, optional), @@ -108,7 +109,7 @@ class DoajArticleImporter(EntityImporter): publisher = clean_str(bibjson["journal"].get("publisher")) try: - release_year = int(bibjson.get("year")) + release_year: Optional[int] = int(bibjson.get("year")) except (TypeError, ValueError): release_year = None release_month = parse_month(clean_str(bibjson.get("month"))) @@ -148,7 +149,7 @@ class 
DoajArticleImporter(EntityImporter): contribs = self.doaj_contribs(bibjson.get("author") or []) # DOAJ-specific extra - doaj_extra = dict() + doaj_extra: Dict[str, Any] = dict() if bibjson.get("subject"): doaj_extra["subject"] = bibjson.get("subject") if bibjson.get("keywords"): @@ -157,7 +158,7 @@ class DoajArticleImporter(EntityImporter): ] # generic extra - extra = dict() + extra: Dict[str, Any] = dict() if country: extra["country"] = country if not container_id and container_name: @@ -194,14 +195,14 @@ class DoajArticleImporter(EntityImporter): return re @staticmethod - def biblio_hacks(re): + def biblio_hacks(re: ReleaseEntity) -> ReleaseEntity: """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. """ return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing release by DOAJ article id existing = None @@ -276,7 +277,7 @@ class DoajArticleImporter(EntityImporter): return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/file_meta.py b/python/fatcat_tools/importers/file_meta.py index 26584ff3..892c1dcd 100644 --- a/python/fatcat_tools/importers/file_meta.py +++ b/python/fatcat_tools/importers/file_meta.py @@ -1,4 +1,7 @@ +from typing import Any, Dict + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from .common import EntityImporter @@ -14,7 +17,7 @@ class FileMetaImporter(EntityImporter): imported which were missing file size, mimetype, md5, and/or sha256. """ - def __init__(self, api, require_grobid=True, **kwargs): + def __init__(self, api: ApiClient, require_grobid: bool = True, **kwargs): eg_desc = kwargs.pop("editgroup_description", None) or "File metadata updates" eg_extra = kwargs.pop("editgroup_extra", dict()) @@ -22,14 +25,14 @@ class FileMetaImporter(EntityImporter): kwargs["do_updates"] = kwargs.get("do_updates", True) super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Any) -> bool: for k in ("sha1hex", "sha256hex", "md5hex", "size_bytes", "mimetype"): if not row.get(k): self.counts["skip-missing-field"] += 1 return False return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> FileEntity: # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False @@ -44,7 +47,7 @@ class FileMetaImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None diff --git a/python/fatcat_tools/importers/fileset_generic.py b/python/fatcat_tools/importers/fileset_generic.py index dd8f5600..2207b938 100644 --- a/python/fatcat_tools/importers/fileset_generic.py +++ b/python/fatcat_tools/importers/fileset_generic.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FilesetEntity from fatcat_tools import entity_from_dict @@ -17,7 +20,7 @@ class FilesetImporter(EntityImporter): Currently only creates (insert), no updates. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.pop("editgroup_description", None) or "Generic Fileset entity import" eg_extra = kwargs.pop("editgroup_extra", dict()) @@ -29,7 +32,7 @@ class FilesetImporter(EntityImporter): # bezerk mode doesn't make sense for this importer assert self.bezerk_mode is False - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: if not row.get("release_ids"): self.counts["skip-no-release-ids"] += 1 return False @@ -47,7 +50,7 @@ class FilesetImporter(EntityImporter): return False return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[FilesetEntity]: fse = entity_from_dict( row, @@ -57,7 +60,7 @@ class FilesetImporter(EntityImporter): fse = self.generic_fileset_cleanups(fse) return fse - def try_update(self, fse): + def try_update(self, fse: FilesetEntity) -> bool: if not self.skip_release_fileset_check: for release_id in fse.release_ids: @@ -74,7 +77,7 @@ class FilesetImporter(EntityImporter): # do the insert return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[FilesetEntity]) -> None: self.api.create_fileset_auto_batch( fatcat_openapi_client.FilesetAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/grobid_metadata.py b/python/fatcat_tools/importers/grobid_metadata.py index f7bb5357..830c9bbb 100644 --- a/python/fatcat_tools/importers/grobid_metadata.py +++ b/python/fatcat_tools/importers/grobid_metadata.py @@ -2,8 +2,10 @@ import base64 import json +from typing import Any, Dict, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity, ReleaseEntity from .common import EntityImporter, clean, make_rel_url @@ -22,7 +24,7 @@ class GrobidMetadataImporter(EntityImporter): TODO: relaxing 'None' constraint on parse_record() might make this refactor-able. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -34,10 +36,10 @@ class GrobidMetadataImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") self.longtail_oa = kwargs.get("longtail_oa", False) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, row): + def parse_record(self, row: str) -> Optional[FileEntity]: fields = row.split("\t") sha1_key = fields[0] @@ -72,12 +74,12 @@ class GrobidMetadataImporter(EntityImporter): fe.release_ids.append(release_edit.ident) return fe - def parse_grobid_json(self, obj): + def parse_grobid_json(self, obj: Dict[str, Any]) -> Optional[ReleaseEntity]: if not obj.get("title"): return None - extra_grobid = dict() + extra_grobid: Dict[str, Any] = dict() abstract = obj.get("abstract") if abstract and len(abstract) < MAX_ABSTRACT_BYTES and len(abstract) > 10: @@ -103,7 +105,7 @@ class GrobidMetadataImporter(EntityImporter): refs = [] for raw in obj.get("citations", []): - cite_extra = dict() + cite_extra: Dict[str, Any] = dict() year = None if raw.get("date"): try: @@ -162,13 +164,15 @@ class GrobidMetadataImporter(EntityImporter): publisher=clean(obj["journal"].get("publisher")), volume=clean(obj["journal"].get("volume")), issue=clean(obj["journal"].get("issue")), - abstracts=abstracts, + abstracts=abstracts or None, ext_ids=fatcat_openapi_client.ReleaseExtIds(), - extra=extra, + extra=extra or None, ) return re - def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): + def parse_file_metadata( + self, sha1_key: str, cdx: Dict[str, Any], mimetype: str, file_size: int + ) -> FileEntity: sha1 = ( base64.b16encode(base64.b32decode(sha1_key.replace("sha1:", ""))) @@ -197,11 +201,11 @@ class GrobidMetadataImporter(EntityImporter): return fe - def try_update(self, entity): + def try_update(self, re: FileEntity) -> bool: # did the exists check in 'parse_record()', because we needed to create a release return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index e0a6c3f5..e13ce4bd 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -1,12 +1,23 @@ import datetime +from typing import Any, Dict, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ( + ApiClient, + FileEntity, + FilesetEntity, + FilesetUrl, + FileUrl, + WebcaptureEntity, +) from .common import EntityImporter, make_rel_url class IngestFileResultImporter(EntityImporter): - def __init__(self, api, require_grobid=True, **kwargs): + def __init__( + self, api: fatcat_openapi_client.ApiClient, require_grobid: bool = True, **kwargs + ) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -41,7 +52,7 @@ class IngestFileResultImporter(EntityImporter): if kwargs.get("skip_source_allowlist", False): self.ingest_request_source_allowlist = [] - def want_file(self, row) -> bool: + def want_file(self, row: Dict[str, Any]) -> bool: """ File-specific part of want(). Generic across general ingest and save-paper-now. 
""" @@ -76,7 +87,7 @@ class IngestFileResultImporter(EntityImporter): return True - def want_ingest(self, row) -> bool: + def want_ingest(self, row: Dict[str, Any]) -> bool: """ Sandcrawler ingest-specific part of want(). Generic across file and webcapture ingest. @@ -115,7 +126,7 @@ class IngestFileResultImporter(EntityImporter): return True - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: """ Overall logic here probably needs work (TODO): @@ -137,7 +148,7 @@ class IngestFileResultImporter(EntityImporter): return True - def parse_ingest_release_ident(self, row): + def parse_ingest_release_ident(self, row: Dict[str, Any]) -> Optional[str]: request = row["request"] fatcat = request.get("fatcat") @@ -178,7 +189,7 @@ class IngestFileResultImporter(EntityImporter): return release_ident - def parse_terminal(self, row): + def parse_terminal(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: terminal = row.get("terminal") if not terminal: # support old cdx-only ingest results @@ -206,7 +217,7 @@ class IngestFileResultImporter(EntityImporter): ) return terminal - def parse_urls(self, row, terminal): + def parse_urls(self, row: Dict[str, Any], terminal: Dict[str, Any]) -> List[FileUrl]: request = row["request"] @@ -224,10 +235,10 @@ class IngestFileResultImporter(EntityImporter): ) urls = [url, ("webarchive", wayback)] - urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] + urls = [FileUrl(rel=rel, url=url) for (rel, url) in urls] return urls - def parse_edit_extra(self, row): + def parse_edit_extra(self, row: Dict[str, Any]) -> Dict[str, Any]: request = row["request"] edit_extra = dict() @@ -251,7 +262,7 @@ class IngestFileResultImporter(EntityImporter): return edit_extra - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> FileEntity: request = row["request"] file_meta = row["file_meta"] @@ -283,7 +294,7 @@ class IngestFileResultImporter(EntityImporter): urls = self.parse_urls(row, terminal) - fe = fatcat_openapi_client.FileEntity( + fe = FileEntity( md5=file_meta["md5hex"], sha1=file_meta["sha1hex"], sha256=file_meta["sha256hex"], @@ -298,7 +309,7 @@ class IngestFileResultImporter(EntityImporter): fe.edit_extra = edit_extra return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None try: @@ -330,7 +341,7 @@ class IngestFileResultImporter(EntityImporter): self.counts["skip-update-disabled"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: if self.submit_mode: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( @@ -358,7 +369,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter): them for further human review (as opposed to accepting by default). """ - def __init__(self, api, submit_mode=True, **kwargs): + def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -371,7 +382,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: source = row["request"].get("ingest_request_source") if not source: @@ -397,7 +408,7 @@ class IngestWebResultImporter(IngestFileResultImporter): into webcapture objects. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -408,7 +419,7 @@ class IngestWebResultImporter(IngestFileResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: if not self.want_ingest(row): return False @@ -426,7 +437,7 @@ class IngestWebResultImporter(IngestFileResultImporter): return True - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[WebcaptureEntity]: request = row["request"] file_meta = row["file_meta"] @@ -512,7 +523,7 @@ class IngestWebResultImporter(IngestFileResultImporter): wc.edit_extra = edit_extra return wc - def try_update(self, wc): + def try_update(self, wc: WebcaptureEntity) -> bool: # check for existing edits-in-progress with same URL for other in self._entity_queue: @@ -539,7 +550,7 @@ class IngestWebResultImporter(IngestFileResultImporter): # so go ahead and insert! return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[WebcaptureEntity]) -> None: if self.submit_mode: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( @@ -565,7 +576,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter): Like SavePaperNowFileImporter, but for webcapture (HTML) ingest. """ - def __init__(self, api, submit_mode=True, **kwargs): + def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -577,7 +588,7 @@ class SavePaperNowWebImporter(IngestWebResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: """ Relatively custom want() here, a synthesis of other filters. @@ -617,7 +628,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): results into fileset objects. 
""" - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -629,7 +640,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.max_file_count = 300 - def want_fileset(self, row): + def want_fileset(self, row: Dict[str, Any]) -> bool: if not row.get("manifest") or len(row.get("manifest")) == 0: self.counts["skip-empty-manifest"] += 1 @@ -645,7 +656,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: if not self.want_ingest(row): return False @@ -662,7 +673,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True - def parse_fileset_urls(self, row): + def parse_fileset_urls(self, row: Dict[str, Any]) -> List[FilesetUrl]: if not row.get("strategy"): return [] strategy = row["strategy"] @@ -717,7 +728,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): ) return urls - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> FilesetEntity: request = row["request"] @@ -735,7 +746,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): self.counts["skip-release-not-found"] += 1 return None - entity_extra = dict() + entity_extra: Dict[str, Any] = dict() edit_extra = self.parse_edit_extra(row) edit_extra["ingest_strategy"] = row["ingest_strategy"] if row.get("platform"): @@ -789,12 +800,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter): fe.edit_extra = edit_extra return fe - def try_update(self, wc): + def try_update(self, fse: FilesetEntity) -> bool: # check for existing edits-in-progress with same URL for other in self._entity_queue: # XXX: how to duplicate check? - if other.original_url == wc.original_url: + if other.original_url == fse.original_url: self.counts["skip-in-queue"] += 1 return False @@ -802,12 +813,12 @@ class IngestFilesetResultImporter(IngestFileResultImporter): # existing = None # NOTE: in lieu of existing checks (by lookup), only allow one fileset per release - release = self.api.get_release(wc.release_ids[0], expand="filesets") + release = self.api.get_release(fse.release_ids[0], expand="filesets") if release.filesets: # XXX: how to duplicate check filesets? # check if this is an existing match, or just a similar hit for other in release.filesets: - if wc.original_url == other.original_url: + if fse.original_url == other.original_url: # TODO: compare very similar timestamps of same time (different formats) self.counts["exists"] += 1 return False @@ -816,7 +827,7 @@ class IngestFilesetResultImporter(IngestFileResultImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[FilesetEntity]) -> None: if self.submit_mode: eg = self.api.create_editgroup( fatcat_openapi_client.Editgroup( @@ -842,7 +853,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): Like SavePaperNowFileImporter, but for fileset/dataset ingest. 
""" - def __init__(self, api, submit_mode=True, **kwargs): + def __init__(self, api: ApiClient, submit_mode: bool = True, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -854,7 +865,7 @@ class SavePaperNowFilesetImporter(IngestFilesetResultImporter): kwargs["do_updates"] = False super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, row): + def want(self, row: Dict[str, Any]) -> bool: source = row["request"].get("ingest_request_source") if not source: diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py index a7e06e6a..f540c264 100644 --- a/python/fatcat_tools/importers/jalc.py +++ b/python/fatcat_tools/importers/jalc.py @@ -1,16 +1,19 @@ import datetime import sqlite3 import sys +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseContrib, ReleaseEntity from fatcat_tools.normal import clean_doi from .common import DATE_FMT, EntityImporter, clean, is_cjk -def parse_jalc_persons(raw_persons): +# TODO: should be List[Tag] not List[Any] for full type annotations +def parse_jalc_persons(raw_persons: List[Any]) -> List[ReleaseContrib]: """ For the most part, JALC DC names are in either japanese or english. The two common patterns are a list alternating between the two (in which case @@ -47,7 +50,7 @@ def parse_jalc_persons(raw_persons): if lang == "en" and surname and given_name: # english names order is flipped name = "{} {}".format(given_name, surname) - rc = fatcat_openapi_client.ReleaseContrib( + rc = ReleaseContrib( raw_name=name, surname=surname, given_name=given_name, role="author" ) # add an extra hint field; won't end up in serialized object @@ -100,7 +103,7 @@ class JalcImporter(EntityImporter): NOTE: some JALC DOIs seem to get cross-registered with Crossref """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc = kwargs.get("editgroup_description", "Automated import of JALC DOI metadata") eg_extra = kwargs.get("editgroup_extra", dict()) @@ -125,7 +128,7 @@ class JalcImporter(EntityImporter): self.read_issn_map_file(issn_map_file) - def lookup_ext_ids(self, doi): + def lookup_ext_ids(self, doi: str) -> Dict[str, Any]: if self.extid_map_db is None: return dict( core_id=None, @@ -158,10 +161,12 @@ class JalcImporter(EntityImporter): jstor_id=None, ) - def want(self, obj): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, record): + # TODO: mypy annotations partially skipped on this function ('Any' instead of 'Tag') + # for now because # XML # parsing # annotations are large and complex + def parse_record(self, record: Any) -> Optional[ReleaseEntity]: """ record is a beautiful soup object returns a ReleaseEntity, or None @@ -170,8 +175,8 @@ class JalcImporter(EntityImporter): fields. 
""" - extra = dict() - extra_jalc = dict() + extra: Dict[str, Any] = dict() + extra_jalc: Dict[str, Any] = dict() titles = record.find_all("title") if not titles: @@ -254,7 +259,7 @@ class JalcImporter(EntityImporter): publisher = None container_name = None - container_extra = dict() + container_extra: Dict[str, Any] = dict() if record.publicationName: pubs = [ @@ -335,7 +340,7 @@ class JalcImporter(EntityImporter): if not title: return None - re = fatcat_openapi_client.ReleaseEntity( + re = ReleaseEntity( work_id=None, title=title, original_title=clean(original_title), @@ -364,7 +369,7 @@ class JalcImporter(EntityImporter): ) return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # lookup existing DOI existing = None @@ -384,7 +389,7 @@ class JalcImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( @@ -394,7 +399,7 @@ class JalcImporter(EntityImporter): ) ) - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: """ Helper for testing; can run this file stand-alone instead of using a pusher """ @@ -408,4 +413,3 @@ class JalcImporter(EntityImporter): # print(json.dumps(resp)) print(resp) # sys.exit(-1) - diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py index 6d1fefa3..a45e49f3 100644 --- a/python/fatcat_tools/importers/journal_metadata.py +++ b/python/fatcat_tools/importers/journal_metadata.py @@ -1,9 +1,12 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, ContainerEntity from .common import EntityImporter, clean -def or_none(s): +def or_none(s: Optional[str]) -> Optional[str]: if s is None: return None if len(s) == 0: @@ -11,7 +14,7 @@ def or_none(s): return s -def truthy(s): +def truthy(s: Optional[str]) -> Optional[bool]: if s is None: return None s = s.lower() @@ -32,7 +35,7 @@ class JournalMetadataImporter(EntityImporter): See guide for details on the many 'extra' fields used here. """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -42,12 +45,12 @@ class JournalMetadataImporter(EntityImporter): eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.JournalMetadataImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: if raw_record.get("issnl") and raw_record.get("name"): return True return False - def parse_record(self, row): + def parse_record(self, row: Dict[str, Any]) -> Optional[ContainerEntity]: """ row is a python dict (parsed from JSON). 
@@ -106,7 +109,7 @@ class JournalMetadataImporter(EntityImporter): if not name: return None - ce = fatcat_openapi_client.ContainerEntity( + ce = ContainerEntity( issnl=row["issnl"], issne=row.get("issne"), issnp=row.get("issnp"), @@ -118,7 +121,7 @@ class JournalMetadataImporter(EntityImporter): ) return ce - def try_update(self, ce): + def try_update(self, ce: ContainerEntity) -> bool: existing = None try: @@ -148,7 +151,7 @@ class JournalMetadataImporter(EntityImporter): # if we got this far, it's a bug raise NotImplementedError - def insert_batch(self, batch): + def insert_batch(self, batch: List[ContainerEntity]) -> None: self.api.create_container_auto_batch( fatcat_openapi_client.ContainerAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py index 287fb308..0a6eec65 100644 --- a/python/fatcat_tools/importers/jstor.py +++ b/python/fatcat_tools/importers/jstor.py @@ -2,9 +2,11 @@ import datetime import json import sys import warnings +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from .common import LANG_MAP_MARC, EntityImporter, clean from .crossref import CONTAINER_TYPE_MAP @@ -32,7 +34,7 @@ class JstorImporter(EntityImporter): Collection) """ - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api: ApiClient, issn_map_file: Sequence, **kwargs) -> None: eg_desc = kwargs.get("editgroup_description", "Automated import of JSTOR XML metadata") eg_extra = kwargs.get("editgroup_extra", dict()) @@ -49,19 +51,22 @@ class JstorImporter(EntityImporter): self.read_issn_map_file(issn_map_file) - def map_container_type(self, crossref_type): + def map_container_type(self, crossref_type: Optional[str]) -> Optional[str]: return CONTAINER_TYPE_MAP.get(crossref_type) - def want(self, obj): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, article): + # TODO: mypy annotations partially skipped on this function ('Any' instead of + # 'BeautifulSoup') for now because XML parsing annotations are large and + # complex + def parse_record(self, article: Any) -> Optional[ReleaseEntity]: journal_meta = article.front.find("journal-meta") article_meta = article.front.find("article-meta") - extra = dict() - extra_jstor = dict() + extra: Dict[str, Any] = dict() + extra_jstor: Dict[str, Any] = dict() release_type = JSTOR_TYPE_MAP.get(article["article-type"]) title = article_meta.find("article-title") @@ -269,7 +274,7 @@ class JstorImporter(EntityImporter): ) return re - def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # first, lookup existing by JSTOR id (which much be defined) existing = None @@ -313,7 +318,7 @@ class JstorImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( @@ -323,7 +328,7 @@ class JstorImporter(EntityImporter): ) ) - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. 
open with beautiful soup soup = BeautifulSoup(handle, "xml") diff --git a/python/fatcat_tools/importers/matched.py b/python/fatcat_tools/importers/matched.py index 7c2a6a87..9c80dd72 100644 --- a/python/fatcat_tools/importers/matched.py +++ b/python/fatcat_tools/importers/matched.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from fatcat_tools.normal import clean_doi @@ -29,7 +32,7 @@ class MatchedImporter(EntityImporter): - core_id, wikidata_id, pmcid, pmid: not as lists """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -41,10 +44,10 @@ class MatchedImporter(EntityImporter): self.default_link_rel = kwargs.get("default_link_rel", "web") self.default_mimetype = kwargs.get("default_mimetype", None) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]: dois = [d.lower() for d in obj.get("dois", [])] # lookup dois @@ -129,7 +132,7 @@ class MatchedImporter(EntityImporter): if urls[0].url.endswith(".pdf"): mimetype = "application/pdf" - fe = fatcat_openapi_client.FileEntity( + fe = FileEntity( md5=obj.get("md5"), sha1=obj["sha1"], sha256=obj.get("sha256"), @@ -140,7 +143,7 @@ class MatchedImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> bool: # lookup sha1, or create new entity existing = None try: @@ -207,7 +210,7 @@ class MatchedImporter(EntityImporter): self.counts["update"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/orcid.py b/python/fatcat_tools/importers/orcid.py index b514e6e5..430cdd0f 100644 --- a/python/fatcat_tools/importers/orcid.py +++ b/python/fatcat_tools/importers/orcid.py @@ -1,11 +1,13 @@ import sys +from typing import Any, Dict, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, CreatorEntity from .common import EntityImporter, clean -def value_or_none(e): +def value_or_none(e: Any) -> Any: if type(e) == dict: e = e.get("value") if type(e) == str and len(e) == 0: @@ -22,7 +24,7 @@ def value_or_none(e): class OrcidImporter(EntityImporter): - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = kwargs.get( "editgroup_description", @@ -32,10 +34,10 @@ class OrcidImporter(EntityImporter): eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.OrcidImporter") super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[CreatorEntity]: """ obj is a python dict (parsed from json). 
returns a CreatorEntity @@ -67,7 +69,7 @@ class OrcidImporter(EntityImporter): if not display: # must have *some* name return None - ce = fatcat_openapi_client.CreatorEntity( + ce = CreatorEntity( orcid=orcid, given_name=clean(given), surname=clean(sur), @@ -76,10 +78,10 @@ class OrcidImporter(EntityImporter): ) return ce - def try_update(self, raw_record): + def try_update(self, ce: CreatorEntity) -> bool: existing = None try: - existing = self.api.lookup_creator(orcid=raw_record.orcid) + existing = self.api.lookup_creator(orcid=ce.orcid) except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err @@ -92,7 +94,7 @@ class OrcidImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[CreatorEntity]) -> None: self.api.create_creator_auto_batch( fatcat_openapi_client.CreatorAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py index 97433445..41268925 100644 --- a/python/fatcat_tools/importers/pubmed.py +++ b/python/fatcat_tools/importers/pubmed.py @@ -2,9 +2,11 @@ import datetime import json import sys import warnings +from typing import Any, Dict, List, Optional, Sequence import fatcat_openapi_client from bs4 import BeautifulSoup +from fatcat_openapi_client import ApiClient, ReleaseEntity from fatcat_tools.normal import clean_doi, clean_issn, clean_pmcid, clean_pmid @@ -328,7 +330,9 @@ class PubmedImporter(EntityImporter): TODO: MEDLINE doesn't include PMC/OA license; could include in importer? """ - def __init__(self, api, issn_map_file, lookup_refs=True, **kwargs): + def __init__( + self, api: ApiClient, issn_map_file: Sequence, lookup_refs: bool = True, **kwargs + ): eg_desc = kwargs.get( "editgroup_description", "Automated import of PubMed/MEDLINE XML metadata" @@ -347,10 +351,13 @@ class PubmedImporter(EntityImporter): self.create_containers = kwargs.get("create_containers", True) self.read_issn_map_file(issn_map_file) - def want(self, obj): + def want(self, raw_record: BeautifulSoup) -> bool: return True - def parse_record(self, a): + # TODO: mypy annotations partially skipped on this function ('Any' instead of + # 'BeautifulSoup') for now because XML parsing annotations are large and + # complex + def parse_record(self, a: Any) -> ReleaseEntity: medline = a.MedlineCitation # PubmedData isn't required by DTD, but seems to always be present @@ -482,8 +489,8 @@ class PubmedImporter(EntityImporter): pub_date = journal.PubDate if not pub_date: pub_date = journal.JournalIssue.PubDate - release_date = None - release_year = None + release_date: Optional[str] = None + release_year: Optional[int] = None if pub_date.Year: release_year = int(pub_date.Year.string) if pub_date.find("Day") and pub_date.find("Month"): @@ -578,7 +585,7 @@ class PubmedImporter(EntityImporter): abstracts.append(abst) other_abstracts = medline.find_all("OtherAbstract") for other in other_abstracts: - lang = "en" + lang: Optional[str] = "en" if other.get("Language"): lang = LANG_MAP_MARC.get(other["Language"]) abst = fatcat_openapi_client.ReleaseAbstract( @@ -666,7 +673,7 @@ class PubmedImporter(EntityImporter): # that there may be multiple ReferenceList (eg, sometimes one per # Reference) for ref in pubmed.find_all("Reference"): - ref_extra = dict() + ref_extra: Dict[str, Any] = dict() ref_doi = ref.find("ArticleId", IdType="doi") if ref_doi: ref_doi = clean_doi(ref_doi.string) @@ -740,7 +747,7 @@ class PubmedImporter(EntityImporter): ) return re - 
def try_update(self, re): + def try_update(self, re: ReleaseEntity) -> bool: # first, lookup existing by PMID (which must be defined) existing = None @@ -831,7 +838,7 @@ class PubmedImporter(EntityImporter): return True - def insert_batch(self, batch): + def insert_batch(self, batch: List[ReleaseEntity]) -> None: self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( @@ -841,7 +848,7 @@ class PubmedImporter(EntityImporter): ) ) - def parse_file(self, handle): + def parse_file(self, handle: Any) -> None: # 1. open with beautiful soup soup = BeautifulSoup(handle, "xml") diff --git a/python/fatcat_tools/importers/shadow.py b/python/fatcat_tools/importers/shadow.py index 78eeec7a..520258cb 100644 --- a/python/fatcat_tools/importers/shadow.py +++ b/python/fatcat_tools/importers/shadow.py @@ -1,4 +1,7 @@ +from typing import Any, Dict, List, Optional + import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, FileEntity from fatcat_tools.normal import clean_doi, clean_isbn13, clean_pmid @@ -27,7 +30,7 @@ class ShadowLibraryImporter(EntityImporter): - datetime """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs) -> None: eg_desc = ( kwargs.pop("editgroup_description", None) @@ -38,7 +41,7 @@ class ShadowLibraryImporter(EntityImporter): super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.default_link_rel = kwargs.get("default_link_rel", "web") - def want(self, raw_record): + def want(self, raw_record: Any) -> bool: """ Only want to import records with complete file-level metadata """ @@ -51,7 +54,7 @@ class ShadowLibraryImporter(EntityImporter): return False return True - def parse_record(self, obj): + def parse_record(self, obj: Dict[str, Any]) -> Optional[FileEntity]: """ We do the release lookup in this method. Try DOI, then PMID, last ISBN13. 
""" @@ -104,7 +107,7 @@ class ShadowLibraryImporter(EntityImporter): urls.append(("webarchive", wayback)) urls = [fatcat_openapi_client.FileUrl(rel=rel, url=url) for (rel, url) in urls] - fe = fatcat_openapi_client.FileEntity( + fe = FileEntity( md5=obj["file_meta"]["md5hex"], sha1=obj["file_meta"]["sha1hex"], sha256=obj["file_meta"]["sha256hex"], @@ -116,7 +119,7 @@ class ShadowLibraryImporter(EntityImporter): ) return fe - def try_update(self, fe): + def try_update(self, fe: FileEntity) -> Optional[bool]: # lookup sha1, or create new entity existing = None try: @@ -189,7 +192,7 @@ class ShadowLibraryImporter(EntityImporter): self.counts["update"] += 1 return False - def insert_batch(self, batch): + def insert_batch(self, batch: List[FileEntity]) -> None: self.api.create_file_auto_batch( fatcat_openapi_client.FileAutoBatch( editgroup=fatcat_openapi_client.Editgroup( diff --git a/python/fatcat_tools/importers/wayback_static.py b/python/fatcat_tools/importers/wayback_static.py index 22fefad3..f9ee29c9 100755 --- a/python/fatcat_tools/importers/wayback_static.py +++ b/python/fatcat_tools/importers/wayback_static.py @@ -12,12 +12,14 @@ import hashlib import json import subprocess import sys +from typing import Any, Dict, List, Optional, Tuple import requests from bs4 import BeautifulSoup from fatcat_openapi_client import ( ApiClient, Editgroup, + EntityEdit, WebcaptureCdxLine, WebcaptureEntity, WebcaptureUrl, @@ -30,7 +32,7 @@ GWB_URL_BASE = "https://web.archive.org/web" REQ_SESSION = requests.Session() -def parse_wbm_url(url): +def parse_wbm_url(url: str) -> Tuple[str, datetime.datetime, str]: """Takes a wayback machine URL, and returns a tuple: (timestamp, datetime, original_url) @@ -42,7 +44,7 @@ def parse_wbm_url(url): return (chunks[4], parse_wbm_timestamp(chunks[4]), "/".join(chunks[5:])) -def test_parse_wbm_url(): +def test_parse_wbm_url() -> None: u = "http://web.archive.org/web/20010712114837/http://www.dlib.org/dlib/june01/reich/06reich.html" assert parse_wbm_url(u) == ( "20010712114837", @@ -51,7 +53,7 @@ def test_parse_wbm_url(): ) -def parse_wbm_timestamp(timestamp): +def parse_wbm_timestamp(timestamp: str) -> datetime.datetime: """ Takes a complete WBM timestamp string (like "20020327115625") and returns a python datetime object (UTC) @@ -71,18 +73,20 @@ def parse_wbm_timestamp(timestamp): ) -def test_parse_wbm_timestamp(): +def test_parse_wbm_timestamp() -> None: assert parse_wbm_timestamp("20010712114837") == datetime.datetime(2001, 7, 12, 11, 48, 37) -def fetch_wbm(url): +def fetch_wbm(url: str) -> bytes: resp = REQ_SESSION.get(url) resp.raise_for_status() assert resp.content return resp.content -def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): +def lookup_cdx( + embed_url: str, verify_hashes: bool = True, cdx_output: Any = None +) -> Optional[WebcaptureCdxLine]: sys.stderr.write(embed_url + "\n") assert embed_url.startswith("/web/") embed_url = embed_url.split("/") @@ -132,7 +136,7 @@ def lookup_cdx(embed_url, verify_hashes=True, cdx_output=None): return None -def wayback_url_to_relative(url): +def wayback_url_to_relative(url: str) -> Optional[str]: """ Wayback URLs can be relative or absolute in rewritten documents. 
This function converts any form of rewritten URL to a relative (to @@ -149,7 +153,7 @@ def wayback_url_to_relative(url): return None -def extract_embeds(soup): +def extract_embeds(soup: BeautifulSoup) -> List[str]: embeds = set() @@ -175,7 +179,7 @@ def extract_embeds(soup): return list(embeds) -def static_wayback_webcapture(wayback_url, cdx_output=None): +def static_wayback_webcapture(wayback_url: str, cdx_output: Any = None) -> WebcaptureEntity: """ Given a complete wayback machine capture URL, like: @@ -214,7 +218,9 @@ def static_wayback_webcapture(wayback_url, cdx_output=None): return wc -def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): +def auto_wayback_static( + api: ApiClient, release_id: str, wayback_url: str, editgroup_id: Optional[str] = None +) -> Tuple[Optional[str], Optional[EntityEdit]]: """ Returns a tuple: (editgroup_id, edit). If failed, both are None """ @@ -250,7 +256,7 @@ def auto_wayback_static(api, release_id, wayback_url, editgroup_id=None): return (editgroup_id, edit) -def main(): +def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--verbose", action="store_true", help="verbose output") parser.add_argument("wayback_url", type=str, help="URL of wayback capture to extract from") |
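Taken together, the hunks above converge on one annotated interface for the EntityImporter subclasses: __init__(api: ApiClient, **kwargs) -> None, want(raw_record) -> bool, parse_record(...) -> Optional[Entity], try_update(entity) -> bool, and insert_batch(batch: List[Entity]) -> None. The following minimal sketch pulls that interface together in one place. It is a hypothetical illustration, not code from this commit: the class name MinimalTypedImporter and its record fields are invented, and the import path fatcat_tools.importers.common plus the attributes self.editgroup_description / self.editgroup_extra are assumptions drawn from the surrounding codebase; only the fatcat_openapi_client names (ApiClient, FileEntity, FileAutoBatch, Editgroup, create_file_auto_batch) appear in the diff itself.

    from typing import Any, Dict, List, Optional

    import fatcat_openapi_client
    from fatcat_openapi_client import ApiClient, FileEntity

    # assumed import path for the shared base class used by the importers in this diff
    from fatcat_tools.importers.common import EntityImporter


    class MinimalTypedImporter(EntityImporter):
        """
        Hypothetical subclass showing the annotated importer interface; not part
        of this commit.
        """

        def __init__(self, api: ApiClient, **kwargs) -> None:
            # same pop-then-pass pattern as the ingest/matched importers above
            eg_desc = kwargs.pop("editgroup_description", None) or "Example file import"
            eg_extra = kwargs.pop("editgroup_extra", dict())
            eg_extra["agent"] = eg_extra.get("agent", "fatcat_tools.MinimalTypedImporter")
            super().__init__(api, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs)

        def want(self, raw_record: Any) -> bool:
            # cheap pre-filter before parsing; several importers simply return True here
            return bool(raw_record.get("sha1"))

        def parse_record(self, row: Dict[str, Any]) -> Optional[FileEntity]:
            # returning None signals "skip this record"
            if not row.get("sha1"):
                return None
            return FileEntity(sha1=row["sha1"], urls=[], release_ids=[])

        def try_update(self, fe: FileEntity) -> bool:
            # True means "queue for insert_batch"; False means skip (exists, in-queue, etc.)
            return True

        def insert_batch(self, batch: List[FileEntity]) -> None:
            # follows the create_file_auto_batch() pattern of the file importers above;
            # the editgroup attribute names are an assumption from the wider codebase
            self.api.create_file_auto_batch(
                fatcat_openapi_client.FileAutoBatch(
                    editgroup=fatcat_openapi_client.Editgroup(
                        description=self.editgroup_description, extra=self.editgroup_extra
                    ),
                    entity_list=batch,
                )
            )

One readability win of these signatures is that the Optional[...] return type on parse_record makes the "skip this record" path (return None) explicit to the type checker, rather than an implicit convention shared across importers.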