diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 14:16:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-03 16:46:07 -0700 |
commit | 45612465cc08114fa19588ea1b0aba0ace1e7b03 (patch) | |
tree | df1381b7b99a0be79ddbabc3c0cfae05b4986a47 /python/fatcat_tools/harvest/pubmed.py | |
parent | 924639f7faf8c2fc1a635c0f27a13d98b47fc453 (diff) | |
download | fatcat-45612465cc08114fa19588ea1b0aba0ace1e7b03.tar.gz fatcat-45612465cc08114fa19588ea1b0aba0ace1e7b03.zip |
typing: add annotations to remaining fatcat_tools code
Again, these are just annotations, no changes made to get type checks to
pass
Diffstat (limited to 'python/fatcat_tools/harvest/pubmed.py')
-rw-r--r-- | python/fatcat_tools/harvest/pubmed.py | 38 |
1 files changed, 26 insertions, 12 deletions
diff --git a/python/fatcat_tools/harvest/pubmed.py b/python/fatcat_tools/harvest/pubmed.py index 0f33f334..a1b4da0e 100644 --- a/python/fatcat_tools/harvest/pubmed.py +++ b/python/fatcat_tools/harvest/pubmed.py @@ -9,6 +9,7 @@ Assumptions: """ import collections +import datetime import ftplib import gzip import io @@ -22,6 +23,7 @@ import tempfile import time import xml.etree.ElementTree as ET import zlib +from typing import Any, Dict, Generator, Optional from urllib.parse import urlparse import dateparser @@ -61,7 +63,14 @@ class PubmedFTPWorker: """ - def __init__(self, kafka_hosts, produce_topic, state_topic, start_date=None, end_date=None): + def __init__( + self, + kafka_hosts: str, + produce_topic: str, + state_topic: str, + start_date: Optional[datetime.date] = None, + end_date: Optional[datetime.date] = None, + ): self.name = "Pubmed" self.host = "ftp.ncbi.nlm.nih.gov" self.produce_topic = produce_topic @@ -74,10 +83,10 @@ class PubmedFTPWorker: self.state = HarvestState(start_date, end_date) self.state.initialize_from_kafka(self.state_topic, self.kafka_config) self.producer = self._kafka_producer() - self.date_file_map = None + self.date_file_map: Optional[Dict[str, Any]] = None - def _kafka_producer(self): - def fail_fast(err, msg): + def _kafka_producer(self) -> Producer: + def fail_fast(err: Any, _msg: None) -> None: if err is not None: print("Kafka producer delivery error: {}".format(err), file=sys.stderr) print("Bailing out...", file=sys.stderr) @@ -97,7 +106,7 @@ class PubmedFTPWorker: ) return Producer(producer_conf) - def fetch_date(self, date): + def fetch_date(self, date: datetime.date) -> bool: """ Fetch file or files for a given date and feed Kafka one article per message. If the fetched XML does not contain a PMID an exception is @@ -163,7 +172,7 @@ class PubmedFTPWorker: return True - def run(self, continuous=False): + def run(self, continuous: bool = False) -> None: while True: self.date_file_map = generate_date_file_map(host=self.host) if len(self.date_file_map) == 0: @@ -188,7 +197,7 @@ class PubmedFTPWorker: print("{} FTP ingest caught up".format(self.name)) -def generate_date_file_map(host="ftp.ncbi.nlm.nih.gov"): +def generate_date_file_map(host: str = "ftp.ncbi.nlm.nih.gov") -> Dict[str, Any]: """ Generate a DefaultDict[string, set] mapping dates to absolute filepaths on the server (mostly we have one file, but sometimes more). @@ -259,7 +268,9 @@ def generate_date_file_map(host="ftp.ncbi.nlm.nih.gov"): return mapping -def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None): +def ftpretr( + url: str, max_retries: int = 10, retry_delay: int = 1, proxy_hostport: Optional[str] = None +) -> str: """ Note: This might move into a generic place in the future. @@ -305,8 +316,11 @@ def ftpretr(url, max_retries=10, retry_delay=1, proxy_hostport=None): def ftpretr_via_http_proxy( - url, proxy_hostport="ftp.ncbi.nlm.nih.gov", max_retries=10, retry_delay=1 -): + url: str, + proxy_hostport: str = "ftp.ncbi.nlm.nih.gov", + max_retries: int = 10, + retry_delay: int = 1, +) -> str: """ Fetch file from FTP via external HTTP proxy, e.g. ftp.host.com:/a/b/c would be retrievable via proxy.com/a/b/c; (in 09/2021 we used @@ -335,7 +349,7 @@ def ftpretr_via_http_proxy( time.sleep(retry_delay) -def xmlstream(filename, tag, encoding="utf-8"): +def xmlstream(filename: str, tag: str, encoding: str = "utf-8") -> Generator[Any, Any, Any]: """ Note: This might move into a generic place in the future. @@ -348,7 +362,7 @@ def xmlstream(filename, tag, encoding="utf-8"): Known vulnerabilities: https://docs.python.org/3/library/xml.html#xml-vulnerabilities """ - def strip_ns(tag): + def strip_ns(tag: str) -> str: if "}" not in tag: return tag return tag.split("}")[1] |