diff options
Diffstat (limited to 'python')
-rwxr-xr-x | python/fatcat_cleanup.py | 4 | ||||
-rwxr-xr-x | python/fatcat_export.py | 6 | ||||
-rwxr-xr-x | python/fatcat_harvest.py | 16 | ||||
-rwxr-xr-x | python/fatcat_import.py | 54 | ||||
-rwxr-xr-x | python/fatcat_ingest.py | 12 | ||||
-rwxr-xr-x | python/fatcat_review.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/api_auth.py | 15 | ||||
-rw-r--r-- | python/fatcat_tools/fcid.py | 14 | ||||
-rw-r--r-- | python/fatcat_tools/kafka.py | 10 | ||||
-rw-r--r-- | python/fatcat_tools/normal.py | 34 | ||||
-rw-r--r-- | python/fatcat_tools/references.py | 8 | ||||
-rw-r--r-- | python/fatcat_tools/reviewers/review_common.py | 87 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/csl.py | 14 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/entities.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 12 | ||||
-rwxr-xr-x | python/fatcat_transform.py | 12 | ||||
-rwxr-xr-x | python/fatcat_util.py | 10 | ||||
-rwxr-xr-x | python/fatcat_webface.py | 2 | ||||
-rwxr-xr-x | python/fatcat_worker.py | 12 | ||||
-rw-r--r-- | python/tests/import_common.py | 2 | ||||
-rw-r--r-- | python/tests/transform_csl.py | 7 |
21 files changed, 200 insertions, 139 deletions
diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py index f8030b16..8bcc2ea9 100755 --- a/python/fatcat_cleanup.py +++ b/python/fatcat_cleanup.py @@ -14,7 +14,7 @@ from fatcat_tools.importers import JsonLinePusher sentry_client = raven.Client() -def run_files(args): +def run_files(args: argparse.Namespace) -> None: fmi = FileCleaner( args.api, dry_run_mode=args.dry_run, @@ -24,7 +24,7 @@ def run_files(args): JsonLinePusher(fmi, args.json_file).run() -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" diff --git a/python/fatcat_export.py b/python/fatcat_export.py index ebdc5af8..7aac6a6e 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -14,7 +14,7 @@ import sys from fatcat_tools import entity_to_dict, public_api, uuid2fcid -def run_export_releases(args): +def run_export_releases(args: argparse.Namespace) -> None: for line in args.ident_file: ident = uuid2fcid(line.split()[0]) release = args.api.get_release(ident=ident, expand="all") @@ -23,7 +23,7 @@ def run_export_releases(args): ) -def run_export_changelog(args): +def run_export_changelog(args: argparse.Namespace) -> None: end = args.end if end is None: latest = args.api.get_changelog(limit=1)[0] @@ -36,7 +36,7 @@ def run_export_changelog(args): ) -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py index 91356aad..be3cb888 100755 --- a/python/fatcat_harvest.py +++ b/python/fatcat_harvest.py @@ -19,7 +19,7 @@ from fatcat_tools.harvest import ( sentry_client = raven.Client() -def run_crossref(args): +def run_crossref(args: argparse.Namespace) -> None: worker = HarvestCrossrefWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.api-crossref", @@ -31,7 +31,7 @@ def run_crossref(args): worker.run(continuous=args.continuous) -def run_datacite(args): +def run_datacite(args: argparse.Namespace) -> None: worker = HarvestDataciteWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.api-datacite", @@ -43,7 +43,7 @@ def run_datacite(args): worker.run(continuous=args.continuous) -def run_arxiv(args): +def run_arxiv(args: argparse.Namespace) -> None: worker = HarvestArxivWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.oaipmh-arxiv", @@ -54,7 +54,7 @@ def run_arxiv(args): worker.run(continuous=args.continuous) -def run_pubmed(args): +def run_pubmed(args: argparse.Namespace) -> None: worker = PubmedFTPWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.ftp-pubmed", @@ -65,7 +65,7 @@ def run_pubmed(args): worker.run(continuous=args.continuous) -def run_doaj_article(args): +def run_doaj_article(args: argparse.Namespace) -> None: worker = HarvestDoajArticleWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.oaipmh-doaj-article", @@ -76,7 +76,7 @@ def run_doaj_article(args): worker.run(continuous=args.continuous) -def run_doaj_journal(args): +def run_doaj_journal(args: argparse.Namespace) -> None: worker = HarvestDoajJournalWorker( kafka_hosts=args.kafka_hosts, produce_topic=f"fatcat-{args.env}.oaipmh-doaj-journal", @@ -87,11 +87,11 @@ def run_doaj_journal(args): worker.run(continuous=args.continuous) -def mkdate(raw): +def mkdate(raw: str) -> datetime.date: return datetime.datetime.strptime(raw, "%Y-%m-%d").date() -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--kafka-hosts", diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 42001974..116df8b7 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -50,7 +50,7 @@ from fatcat_tools.importers import ( sentry_client = raven.Client() -def run_crossref(args): +def run_crossref(args: argparse.Namespace) -> None: fci = CrossrefImporter( args.api, args.issn_map_file, @@ -71,12 +71,12 @@ def run_crossref(args): JsonLinePusher(fci, args.json_file).run() -def run_jalc(args): +def run_jalc(args: argparse.Namespace) -> None: ji = JalcImporter(args.api, args.issn_map_file, extid_map_file=args.extid_map_file) Bs4XmlLinesPusher(ji, args.xml_file, "<rdf:Description").run() -def run_arxiv(args): +def run_arxiv(args: argparse.Namespace) -> None: ari = ArxivRawImporter(args.api, edit_batch_size=args.batch_size) if args.kafka_mode: KafkaBs4XmlPusher( @@ -92,7 +92,7 @@ def run_arxiv(args): Bs4XmlFilePusher(ari, args.xml_file, "record").run() -def run_pubmed(args): +def run_pubmed(args: argparse.Namespace) -> None: pi = PubmedImporter( args.api, args.issn_map_file, @@ -116,27 +116,27 @@ def run_pubmed(args): ).run() -def run_jstor(args): +def run_jstor(args: argparse.Namespace) -> None: ji = JstorImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size) Bs4XmlFileListPusher(ji, args.list_file, "article").run() -def run_orcid(args): +def run_orcid(args: argparse.Namespace) -> None: foi = OrcidImporter(args.api, edit_batch_size=args.batch_size) JsonLinePusher(foi, args.json_file).run() -def run_journal_metadata(args): +def run_journal_metadata(args: argparse.Namespace) -> None: fii = JournalMetadataImporter(args.api, edit_batch_size=args.batch_size) JsonLinePusher(fii, args.json_file).run() -def run_chocula(args): +def run_chocula(args: argparse.Namespace) -> None: fii = ChoculaImporter(args.api, edit_batch_size=args.batch_size, do_updates=args.do_updates) JsonLinePusher(fii, args.json_file).run() -def run_matched(args): +def run_matched(args: argparse.Namespace) -> None: fmi = MatchedImporter( args.api, edit_batch_size=args.batch_size, @@ -147,7 +147,7 @@ def run_matched(args): JsonLinePusher(fmi, args.json_file).run() -def run_arabesque_match(args): +def run_arabesque_match(args: argparse.Namespace) -> None: if (args.sqlite_file and args.json_file) or not (args.sqlite_file or args.json_file): print("Supply one of --sqlite-file or --json-file") ami = ArabesqueMatchImporter( @@ -166,7 +166,7 @@ def run_arabesque_match(args): JsonLinePusher(ami, args.json_file).run() -def run_ingest_file(args): +def run_ingest_file(args: argparse.Namespace) -> None: ifri = IngestFileResultImporter( args.api, editgroup_description=args.editgroup_description_override, @@ -190,7 +190,7 @@ def run_ingest_file(args): JsonLinePusher(ifri, args.json_file).run() -def run_ingest_web(args): +def run_ingest_web(args: argparse.Namespace) -> None: iwri = IngestWebResultImporter( args.api, editgroup_description=args.editgroup_description_override, @@ -214,7 +214,7 @@ def run_ingest_web(args): JsonLinePusher(iwri, args.json_file).run() -def run_ingest_fileset(args): +def run_ingest_fileset(args: argparse.Namespace) -> None: ifri = IngestFilesetResultImporter( args.api, editgroup_description=args.editgroup_description_override, @@ -238,7 +238,7 @@ def run_ingest_fileset(args): JsonLinePusher(ifri, args.json_file).run() -def run_savepapernow_file(args): +def run_savepapernow_file(args: argparse.Namespace) -> None: ifri = SavePaperNowFileImporter( args.api, editgroup_description=args.editgroup_description_override, @@ -259,7 +259,7 @@ def run_savepapernow_file(args): JsonLinePusher(ifri, args.json_file).run() -def run_savepapernow_web(args): +def run_savepapernow_web(args: argparse.Namespace) -> None: ifri = SavePaperNowWebImporter( args.api, editgroup_description=args.editgroup_description_override, @@ -280,7 +280,7 @@ def run_savepapernow_web(args): JsonLinePusher(ifri, args.json_file).run() -def run_savepapernow_fileset(args): +def run_savepapernow_fileset(args: argparse.Namespace) -> None: ifri = SavePaperNowFilesetImporter( args.api, editgroup_description=args.editgroup_description_override, @@ -301,7 +301,7 @@ def run_savepapernow_fileset(args): JsonLinePusher(ifri, args.json_file).run() -def run_grobid_metadata(args): +def run_grobid_metadata(args: argparse.Namespace) -> None: fmi = GrobidMetadataImporter( args.api, edit_batch_size=args.batch_size, @@ -311,12 +311,12 @@ def run_grobid_metadata(args): LinePusher(fmi, args.tsv_file).run() -def run_shadow_lib(args): +def run_shadow_lib(args: argparse.Namespace) -> None: fmi = ShadowLibraryImporter(args.api, edit_batch_size=100) JsonLinePusher(fmi, args.json_file).run() -def run_wayback_static(args): +def run_wayback_static(args: argparse.Namespace) -> None: api = args.api # find the release @@ -348,7 +348,7 @@ def run_wayback_static(args): print("link: https://fatcat.wiki/webcapture/{}".format(wc.ident)) -def run_cdl_dash_dat(args): +def run_cdl_dash_dat(args: argparse.Namespace) -> None: api = args.api # create it @@ -363,7 +363,7 @@ def run_cdl_dash_dat(args): print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) -def run_datacite(args): +def run_datacite(args: argparse.Namespace) -> None: dci = DataciteImporter( args.api, args.issn_map_file, @@ -386,7 +386,7 @@ def run_datacite(args): JsonLinePusher(dci, args.json_file).run() -def run_doaj_article(args): +def run_doaj_article(args: argparse.Namespace) -> None: dai = DoajArticleImporter( args.api, args.issn_map_file, @@ -406,7 +406,7 @@ def run_doaj_article(args): JsonLinePusher(dai, args.json_file).run() -def run_dblp_release(args): +def run_dblp_release(args: argparse.Namespace) -> None: dri = DblpReleaseImporter( args.api, dblp_container_map_file=args.dblp_container_map_file, @@ -422,7 +422,7 @@ def run_dblp_release(args): ).run() -def run_dblp_container(args): +def run_dblp_container(args: argparse.Namespace) -> None: dci = DblpContainerImporter( args.api, args.issn_map_file, @@ -434,7 +434,7 @@ def run_dblp_container(args): JsonLinePusher(dci, args.json_file).run() -def run_file_meta(args): +def run_file_meta(args: argparse.Namespace) -> None: # do_updates defaults to true for this importer fmi = FileMetaImporter( args.api, @@ -444,7 +444,7 @@ def run_file_meta(args): JsonLinePusher(fmi, args.json_file).run() -def run_fileset(args): +def run_fileset(args: argparse.Namespace) -> None: fmi = FilesetImporter( args.api, edit_batch_size=100, @@ -453,7 +453,7 @@ def run_fileset(args): JsonLinePusher(fmi, args.json_file).run() -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--host-url", default="http://localhost:9411/v0", help="connect to this host/port" diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 21597fae..71965f14 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -21,7 +21,7 @@ from fatcat_tools.transforms import release_ingest_request sentry_client = raven.Client() -def _init_search(args): +def _init_search(args: argparse.Namespace) -> Search: # ensure API connection works args.api.get_changelog() @@ -31,7 +31,7 @@ def _init_search(args): return search -def _run_search_dump(args, search): +def _run_search_dump(args: argparse.Namespace, search: Search) -> None: if args.dry_run: print("=== THIS IS A DRY RUN ===") @@ -122,7 +122,7 @@ def _run_search_dump(args, search): print("=== THIS WAS A DRY RUN ===") -def run_ingest_container(args): +def run_ingest_container(args: argparse.Namespace) -> None: """ This command queries elasticsearch for releases from a given container (eg, journal), and prepares ingest requests for them. @@ -151,7 +151,7 @@ def run_ingest_container(args): return _run_search_dump(args, search) -def run_ingest_query(args): +def run_ingest_query(args: argparse.Namespace) -> None: """ Accepts a free-form Lucene query language string. Intended to work the same way as searches in the fatcat web interface. @@ -173,7 +173,7 @@ def run_ingest_query(args): return _run_search_dump(args, search) -def run_ingest_extid(args): +def run_ingest_extid(args: argparse.Namespace) -> None: """ Selects release entities where the external identifier (extid) exists """ @@ -183,7 +183,7 @@ def run_ingest_extid(args): return _run_search_dump(args, search) -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" diff --git a/python/fatcat_review.py b/python/fatcat_review.py index 0cdfc29d..7869eb60 100755 --- a/python/fatcat_review.py +++ b/python/fatcat_review.py @@ -12,7 +12,7 @@ from fatcat_tools.reviewers import DummyReviewBot sentry_client = raven.Client() -def run_dummy(args): +def run_dummy(args: argparse.Namespace) -> None: reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval, verbose=args.verbose) if args.editgroup: annotation = reviewer.run_single(args.editgroup, args.annotate) @@ -21,7 +21,7 @@ def run_dummy(args): reviewer.run() -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--verbose", action="store_true", help="enable verbose output") parser.add_argument( diff --git a/python/fatcat_tools/api_auth.py b/python/fatcat_tools/api_auth.py index d8f0c46d..5eba583e 100644 --- a/python/fatcat_tools/api_auth.py +++ b/python/fatcat_tools/api_auth.py @@ -1,27 +1,28 @@ import os import sys +from typing import Optional -import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, Configuration, DefaultApi -def public_api(host_uri): +def public_api(host_uri: str) -> DefaultApi: """ Note: unlike the authenticated variant, this helper might get called even if the API isn't going to be used, so it's important that it doesn't try to actually connect to the API host or something. """ - conf = fatcat_openapi_client.Configuration() + conf = Configuration() conf.host = host_uri - return fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) + return DefaultApi(ApiClient(conf)) -def authenticated_api(host_uri, token=None): +def authenticated_api(host_uri: str, token: Optional[str] = None) -> DefaultApi: """ Note: if this helper is called, it's implied that an actual API connection is needed, so it does try to connect and verify credentials. """ - conf = fatcat_openapi_client.Configuration() + conf = Configuration() conf.host = host_uri if not token: token = os.environ["FATCAT_API_AUTH_TOKEN"] @@ -33,7 +34,7 @@ def authenticated_api(host_uri, token=None): conf.api_key["Authorization"] = token conf.api_key_prefix["Authorization"] = "Bearer" - api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf)) + api = DefaultApi(ApiClient(conf)) # verify up front that auth is working api.auth_check() diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py index 53891e5a..07463f62 100644 --- a/python/fatcat_tools/fcid.py +++ b/python/fatcat_tools/fcid.py @@ -2,17 +2,17 @@ import base64 import uuid -def fcid2uuid(s): +def fcid2uuid(fcid: str) -> str: """ Converts a fatcat identifier (base32 encoded string) to a uuid.UUID object """ - s = s.split("_")[-1].upper().encode("utf-8") - assert len(s) == 26 - raw = base64.b32decode(s + b"======") - return str(uuid.UUID(bytes=raw)).lower() + b = fcid.split("_")[-1].upper().encode("utf-8") + assert len(b) == 26 + raw_bytes = base64.b32decode(b + b"======") + return str(uuid.UUID(bytes=raw_bytes)).lower() -def uuid2fcid(s): +def uuid2fcid(s: str) -> str: """ Converts a uuid.UUID object to a fatcat identifier (base32 encoded string) """ @@ -20,6 +20,6 @@ def uuid2fcid(s): return base64.b32encode(raw)[:26].lower().decode("utf-8") -def test_fcid(): +def test_fcid() -> None: test_uuid = "00000000-0000-0000-3333-000000000001" assert test_uuid == fcid2uuid(uuid2fcid(test_uuid)) diff --git a/python/fatcat_tools/kafka.py b/python/fatcat_tools/kafka.py index 2a4451ad..fe9f36e9 100644 --- a/python/fatcat_tools/kafka.py +++ b/python/fatcat_tools/kafka.py @@ -1,7 +1,9 @@ +from typing import Any, Optional + from confluent_kafka import KafkaException, Producer -def kafka_fail_fast(err, msg): +def kafka_fail_fast(err: Optional[Any], _msg: Any) -> None: if err is not None: print("Kafka producer delivery error: {}".format(err)) print("Bailing out...") @@ -9,7 +11,11 @@ def kafka_fail_fast(err, msg): raise KafkaException(err) -def simple_kafka_producer(kafka_hosts): +def simple_kafka_producer(kafka_hosts: str) -> Producer: + """ + kafka_hosts should be a string with hostnames separated by ',', not a list + of hostnames + """ kafka_config = { "bootstrap.servers": kafka_hosts, diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py index daf47ded..34e5c3d1 100644 --- a/python/fatcat_tools/normal.py +++ b/python/fatcat_tools/normal.py @@ -70,7 +70,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_doi(): +def test_clean_doi() -> None: assert clean_doi("10.1234/asdf ") == "10.1234/asdf" assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50" @@ -117,7 +117,7 @@ def clean_arxiv_id(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_arxiv_id(): +def test_clean_arxiv_id() -> None: assert clean_arxiv_id("0806.2878v1") == "0806.2878v1" assert clean_arxiv_id("0806.2878") == "0806.2878" assert clean_arxiv_id("1501.00001v1") == "1501.00001v1" @@ -146,7 +146,7 @@ def test_clean_arxiv_id(): assert clean_arxiv_id("08062878v1") is None -def clean_wikidata_qid(raw): +def clean_wikidata_qid(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip() @@ -157,7 +157,7 @@ def clean_wikidata_qid(raw): return None -def test_clean_wikidata_qid(): +def test_clean_wikidata_qid() -> None: assert clean_wikidata_qid("Q1234") == "Q1234" assert clean_wikidata_qid("Q1") == "Q1" assert clean_wikidata_qid(" Q1234 ") == "Q1234" @@ -181,7 +181,7 @@ def clean_pmid(raw: Optional[str]) -> Optional[str]: return None -def test_clean_pmid(): +def test_clean_pmid() -> None: assert clean_pmid("1234") == "1234" assert clean_pmid("1234 ") == "1234" assert clean_pmid("PMC123") is None @@ -214,7 +214,7 @@ def clean_sha1(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_sha1(): +def test_clean_sha1() -> None: assert ( clean_sha1("0fba3fba0e1937aa0297de3836b768b5dfb23d7b") == "0fba3fba0e1937aa0297de3836b768b5dfb23d7b" @@ -242,7 +242,7 @@ def clean_sha256(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_sha256(): +def test_clean_sha256() -> None: assert ( clean_sha256("6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f") == "6cc853f2ae75696b2e45f476c76b946b0fc2df7c52bb38287cb074aceb77bc7f" @@ -264,7 +264,7 @@ def clean_issn(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_issn(): +def test_clean_issn() -> None: assert clean_issn("1234-4567") == "1234-4567" assert clean_issn("1234-456X") == "1234-456X" assert clean_issn("134-4567") is None @@ -283,7 +283,7 @@ def clean_isbn13(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_isbn13(): +def test_clean_isbn13() -> None: assert clean_isbn13("978-1-56619-909-4") == "978-1-56619-909-4" assert clean_isbn13("978-1-4028-9462-6") == "978-1-4028-9462-6" assert clean_isbn13("978-1-56619-909-4 ") == "978-1-56619-909-4" @@ -302,7 +302,7 @@ def clean_orcid(raw: Optional[str]) -> Optional[str]: return raw -def test_clean_orcid(): +def test_clean_orcid() -> None: assert clean_orcid("0123-4567-3456-6789") == "0123-4567-3456-6789" assert clean_orcid("0123-4567-3456-678X") == "0123-4567-3456-678X" assert clean_orcid("0123-4567-3456-6789 ") == "0123-4567-3456-6789" @@ -313,7 +313,7 @@ def test_clean_orcid(): HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$") -def clean_hdl(raw): +def clean_hdl(raw: Optional[str]) -> Optional[str]: if not raw: return None raw = raw.strip().lower() @@ -332,7 +332,7 @@ def clean_hdl(raw): return raw -def test_clean_hdl(): +def test_clean_hdl() -> None: assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy" assert ( @@ -373,7 +373,7 @@ def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]: return fixed -def test_clean_str(): +def test_clean_str() -> None: assert clean_str(None) is None assert clean_str("") is None @@ -384,7 +384,7 @@ def test_clean_str(): assert clean_str("<b>a&b</b>", force_xml=True) == "<b>a&b</b>" -def b32_hex(s): +def b32_hex(s: str) -> str: s = s.strip().split()[0].lower() if s.startswith("sha1:"): s = s[5:] @@ -393,7 +393,7 @@ def b32_hex(s): return base64.b16encode(base64.b32decode(s.upper())).lower().decode("utf-8") -def is_cjk(s): +def is_cjk(s: Optional[str]) -> bool: if not s: return False for c in s: @@ -403,7 +403,7 @@ def is_cjk(s): return False -def test_is_cjk(): +def test_is_cjk() -> None: assert is_cjk(None) is False assert is_cjk("") is False assert is_cjk("blah") is False @@ -593,7 +593,7 @@ def parse_country_name(s: Optional[str]) -> Optional[str]: return None -def test_parse_country_name(): +def test_parse_country_name() -> None: assert parse_country_name("") is None assert parse_country_name("asdf blah") is None assert parse_country_name("us") == "us" diff --git a/python/fatcat_tools/references.py b/python/fatcat_tools/references.py index 624020b5..f41b5973 100644 --- a/python/fatcat_tools/references.py +++ b/python/fatcat_tools/references.py @@ -8,7 +8,7 @@ See bulk citation and citation API proposals for design documentation. import argparse import datetime import sys -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import elasticsearch from elasticsearch_dsl import Search @@ -65,7 +65,7 @@ class BiblioRef(BaseModel): target_unstructured: Optional[str] target_csl: Optional[Dict[str, Any]] - def hacks(self): + def hacks(self) -> "BiblioRef": """ Temporary (?) hacks to work around schema/data issues """ @@ -105,7 +105,7 @@ class EnrichedBiblioRef(BaseModel): @validator("release") @classmethod - def check_release(cls, v): + def check_release(cls: Any, v: ReleaseEntity) -> ReleaseEntity: if v is not None and not isinstance(v, ReleaseEntity): raise ValueError("expected a ReleaseEntity") return v @@ -399,7 +399,7 @@ def enrich_outbound_refs( return enriched -def run_ref_query(args) -> None: +def run_ref_query(args: argparse.Namespace) -> None: """ CLI helper/debug tool (prints to stdout) """ diff --git a/python/fatcat_tools/reviewers/review_common.py b/python/fatcat_tools/reviewers/review_common.py index 59ff1c4e..d599b31f 100644 --- a/python/fatcat_tools/reviewers/review_common.py +++ b/python/fatcat_tools/reviewers/review_common.py @@ -5,6 +5,7 @@ from collections import Counter from typing import Any, List, Optional import fatcat_openapi_client +from fatcat_openapi_client import ApiClient, Editgroup, EditgroupAnnotation, EntityEdit """ checks should return: @@ -29,7 +30,13 @@ class CheckResult: rev = None check_type = None - def __init__(self, status, check_type=None, description=None, **kwargs): + def __init__( + self, + status: str, + check_type: Optional[str] = None, + description: Optional[str] = None, + **kwargs + ): self.status = status self.check_type = check_type self.description = description @@ -45,36 +52,64 @@ class EditCheck: scope: List[Any] = [] name: Optional[str] = None - def check_editgroup(self, editgroup): + def check_editgroup(self, editgroup: fatcat_openapi_client.Editgroup) -> CheckResult: raise NotImplementedError - def check_container(self, edit, entity): + def check_container( + self, + edit: EntityEdit, + entity: fatcat_openapi_client.ContainerEntity, + ) -> CheckResult: raise NotImplementedError - def check_creator(self, edit, entity): + def check_creator( + self, + edit: EntityEdit, + entity: fatcat_openapi_client.CreatorEntity, + ) -> CheckResult: raise NotImplementedError - def check_file(self, edit, entity): + def check_file( + self, + edit: EntityEdit, + entity: fatcat_openapi_client.FileEntity, + ) -> CheckResult: raise NotImplementedError - def check_fileset(self, edit, entity): + def check_fileset( + self, + edit: EntityEdit, + entity: fatcat_openapi_client.FilesetEntity, + ) -> CheckResult: raise NotImplementedError - def check_webcapture(self, edit, entity): + def check_webcapture( + self, + edit: EntityEdit, + entity: fatcat_openapi_client.WebcaptureEntity, + ) -> CheckResult: raise NotImplementedError - def check_release(self, edit, entity): + def check_release( + self, + edit: EntityEdit, + entity: fatcat_openapi_client.ReleaseEntity, + ) -> CheckResult: raise NotImplementedError - def check_work(self, edit, work): + def check_work( + self, + edit: EntityEdit, + work: fatcat_openapi_client.WorkEntity, + ) -> CheckResult: raise NotImplementedError class ReviewBot: - def __init__(self, api, verbose=False, **kwargs): + def __init__(self, api: fatcat_openapi_client.ApiClient, verbose: bool = False, **kwargs): self.api = api - self.checks = [] + self.checks: List[EditCheck] = [] self.verbose = verbose self.extra = kwargs.get("extra", dict()) self.extra["git_rev"] = self.extra.get( @@ -83,16 +118,18 @@ class ReviewBot: self.extra["agent"] = self.extra.get("agent", "fatcat_tools.ReviewBot") self.poll_interval = kwargs.get("poll_interval", 10.0) - def run_single(self, editgroup_id, annotate=True): + def run_single(self, editgroup_id: str, annotate: bool = True) -> CheckResult: eg = self.api.get_editgroup(editgroup_id) annotation = self.review_editgroup(eg) if annotate: self.api.create_editgroup_annotation(eg.editgroup_id, annotation) return annotation - def run(self, since=None): - if since is None: + def run(self, start_since: Optional[datetime.datetime] = None) -> None: + if start_since is None: since = datetime.datetime.utcnow() + else: + since = start_since while True: # XXX: better isoformat conversion? eg_list = self.api.get_editgroups_reviewable( @@ -116,7 +153,7 @@ class ReviewBot: # editgroups in the same second) since = since + datetime.timedelta(seconds=1) - def review_editgroup(self, editgroup): + def review_editgroup(self, editgroup: Editgroup) -> EditgroupAnnotation: results = self.run_checks(editgroup) result_counts = self.result_counts(results) disposition = self.disposition(results) @@ -159,20 +196,20 @@ class ReviewBot: ) return annotation - def result_counts(self, results): - counts = Counter() + def result_counts(self, results: List[CheckResult]) -> Counter: + counts: Counter = Counter() for result in results: counts["total"] += 1 counts[result.status] += 1 return counts - def disposition(self, results): + def disposition(self, results: List[CheckResult]) -> str: """ Returns one of: accept, revise, reject """ raise NotImplementedError - def run_checks(self, editgroup): + def run_checks(self, editgroup: Editgroup) -> List[CheckResult]: results = [] @@ -222,7 +259,7 @@ class DummyCheck(EditCheck): scope = ["editgroup", "work"] name = "DummyCheck" - def check_editgroup(self, editgroup): + def check_editgroup(self, editgroup: Editgroup) -> CheckResult: return CheckResult( "pass", "editgroup", @@ -231,7 +268,11 @@ class DummyCheck(EditCheck): ), ) - def check_work(self, entity, edit): + def check_work( + self, + edit: EntityEdit, + work: fatcat_openapi_client.WorkEntity, + ) -> CheckResult: return CheckResult("pass", "work", "this work edit is beautiful") @@ -240,9 +281,9 @@ class DummyReviewBot(ReviewBot): This bot reviews everything and always passes. """ - def __init__(self, api, **kwargs): + def __init__(self, api: ApiClient, **kwargs): super().__init__(api, **kwargs) self.checks = [DummyCheck()] - def disposition(self, results): + def disposition(self, results: List[CheckResult]) -> str: return "accept" diff --git a/python/fatcat_tools/transforms/csl.py b/python/fatcat_tools/transforms/csl.py index 2b39068a..03410ffb 100644 --- a/python/fatcat_tools/transforms/csl.py +++ b/python/fatcat_tools/transforms/csl.py @@ -1,4 +1,5 @@ import json +from typing import Any, Dict, List from citeproc import ( Citation, @@ -9,20 +10,21 @@ from citeproc import ( ) from citeproc.source.json import CiteProcJSON from citeproc_styles import get_style_filepath +from fatcat_openapi_client import ReleaseContrib, ReleaseEntity -def contribs_by_role(contribs, role): +def contribs_by_role(contribs: List[ReleaseContrib], role: str) -> List[ReleaseContrib]: ret = [c.copy() for c in contribs if c["role"] == role] [c.pop("role") for c in ret] # TODO: some note to self here [c.pop("literal") for c in ret if "literal" in c] if not ret: - return None + return [] else: return ret -def release_to_csl(entity): +def release_to_csl(entity: ReleaseEntity) -> Dict[str, Any]: """ Returns a python dict which can be json.dumps() to get a CSL-JSON (aka, citeproc-JSON, aka Citation Style Language JSON) @@ -188,9 +190,9 @@ def release_to_csl(entity): return csl -def refs_to_csl(entity): +def refs_to_csl(entity: ReleaseEntity) -> List[Dict[str, Any]]: ret = [] - for ref in entity.refs: + for ref in entity.refs or []: if ref.release_id and False: # TODO: fetch full entity from API and convert with release_to_csl raise NotImplementedError @@ -207,7 +209,7 @@ def refs_to_csl(entity): return ret -def citeproc_csl(csl_json, style, html=False): +def citeproc_csl(csl_json: Dict[str, Any], style: str, html: bool = False) -> str: """ Renders a release entity to a styled citation. diff --git a/python/fatcat_tools/transforms/entities.py b/python/fatcat_tools/transforms/entities.py index ee4017d8..e5da633f 100644 --- a/python/fatcat_tools/transforms/entities.py +++ b/python/fatcat_tools/transforms/entities.py @@ -36,7 +36,9 @@ def entity_from_json( return api_client.deserialize(thing, entity_type) -def entity_from_dict(obj: Mapping[str, Any], entity_type, api_client=None): +def entity_from_dict( + obj: Mapping[str, Any], entity_type: Any, api_client: Optional[ApiClient] = None +) -> Any: json_str = json.dumps(obj) return entity_from_json(json_str, entity_type, api_client=api_client) diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index 30b5b190..cbf9e9bf 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -1,4 +1,8 @@ -INGEST_TYPE_CONTAINER_MAP = { +from typing import Any, Dict, Optional + +from fatcat_openapi_client import ReleaseEntity + +INGEST_TYPE_CONTAINER_MAP: Dict[str, str] = { # Optica "twtpsm6ytje3nhuqfu3pa7ca7u": "html", # Optics Express @@ -14,7 +18,11 @@ INGEST_TYPE_CONTAINER_MAP = { } -def release_ingest_request(release, ingest_request_source="fatcat", ingest_type=None): +def release_ingest_request( + release: ReleaseEntity, + ingest_request_source: str = "fatcat", + ingest_type: Optional[str] = None, +) -> Optional[Dict[str, Any]]: """ Takes a full release entity object and returns an ingest request (as dict), or None if it seems like this release shouldn't be ingested. diff --git a/python/fatcat_transform.py b/python/fatcat_transform.py index fe2e12a6..67bf56c5 100755 --- a/python/fatcat_transform.py +++ b/python/fatcat_transform.py @@ -25,7 +25,7 @@ from fatcat_tools.transforms import ( from fatcat_web.search import get_elastic_container_stats -def run_elasticsearch_releases(args): +def run_elasticsearch_releases(args: argparse.Namespace) -> None: for line in args.json_input: line = line.strip() if not line: @@ -36,7 +36,7 @@ def run_elasticsearch_releases(args): args.json_output.write(json.dumps(release_to_elasticsearch(entity)) + "\n") -def run_elasticsearch_containers(args): +def run_elasticsearch_containers(args: argparse.Namespace) -> None: es_client = elasticsearch.Elasticsearch(args.fatcat_elasticsearch_url) es_release_index = "fatcat_release" for line in args.json_input: @@ -63,7 +63,7 @@ def run_elasticsearch_containers(args): args.json_output.write(json.dumps(es_doc) + "\n") -def run_elasticsearch_files(args): +def run_elasticsearch_files(args: argparse.Namespace) -> None: for line in args.json_input: line = line.strip() if not line: @@ -74,7 +74,7 @@ def run_elasticsearch_files(args): args.json_output.write(json.dumps(file_to_elasticsearch(entity)) + "\n") -def run_elasticsearch_changelogs(args): +def run_elasticsearch_changelogs(args: argparse.Namespace) -> None: for line in args.json_input: line = line.strip() if not line: @@ -83,7 +83,7 @@ def run_elasticsearch_changelogs(args): args.json_output.write(json.dumps(changelog_to_elasticsearch(entity)) + "\n") -def run_citeproc_releases(args): +def run_citeproc_releases(args: argparse.Namespace) -> None: for line in args.json_input: line = line.strip() if not line: @@ -97,7 +97,7 @@ def run_citeproc_releases(args): args.json_output.write(out + "\n") -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" diff --git a/python/fatcat_util.py b/python/fatcat_util.py index 57102e9e..7e3d4260 100755 --- a/python/fatcat_util.py +++ b/python/fatcat_util.py @@ -13,24 +13,24 @@ import sys from fatcat_tools import authenticated_api, fcid2uuid, uuid2fcid -def run_uuid2fcid(args): +def run_uuid2fcid(args: argparse.Namespace) -> None: print(uuid2fcid(args.uuid)) -def run_fcid2uuid(args): +def run_fcid2uuid(args: argparse.Namespace) -> None: print(fcid2uuid(args.fcid)) -def run_editgroup_accept(args): +def run_editgroup_accept(args: argparse.Namespace) -> None: args.api.accept_editgroup(args.editgroup_id) -def run_editgroup_submit(args): +def run_editgroup_submit(args: argparse.Namespace) -> None: eg = args.api.get_editgroup(args.editgroup_id) args.api.update_editgroup(args.editgroup_id, eg, submit=True) -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port" diff --git a/python/fatcat_webface.py b/python/fatcat_webface.py index acaa5936..364ed345 100755 --- a/python/fatcat_webface.py +++ b/python/fatcat_webface.py @@ -5,7 +5,7 @@ import argparse from fatcat_web import app -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--debug", diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py index b776e0ce..0e2d03b5 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -18,7 +18,7 @@ from fatcat_tools.workers import ( sentry_client = raven.Client() -def run_changelog(args): +def run_changelog(args: argparse.Namespace) -> None: topic = "fatcat-{}.changelog".format(args.env) worker = ChangelogWorker( args.api, args.kafka_hosts, topic, poll_interval=args.poll_interval @@ -26,7 +26,7 @@ def run_changelog(args): worker.run() -def run_entity_updates(args): +def run_entity_updates(args: argparse.Namespace) -> None: changelog_topic = "fatcat-{}.changelog".format(args.env) release_topic = "fatcat-{}.release-updates-v03".format(args.env) file_topic = "fatcat-{}.file-updates".format(args.env) @@ -46,7 +46,7 @@ def run_entity_updates(args): worker.run() -def run_elasticsearch_release(args): +def run_elasticsearch_release(args: argparse.Namespace) -> None: consume_topic = "fatcat-{}.release-updates-v03".format(args.env) worker = ElasticsearchReleaseWorker( args.kafka_hosts, @@ -57,7 +57,7 @@ def run_elasticsearch_release(args): worker.run() -def run_elasticsearch_container(args): +def run_elasticsearch_container(args: argparse.Namespace) -> None: consume_topic = "fatcat-{}.container-updates".format(args.env) worker = ElasticsearchContainerWorker( args.kafka_hosts, @@ -70,7 +70,7 @@ def run_elasticsearch_container(args): worker.run() -def run_elasticsearch_changelog(args): +def run_elasticsearch_changelog(args: argparse.Namespace) -> None: consume_topic = "fatcat-{}.changelog".format(args.env) worker = ElasticsearchChangelogWorker( args.kafka_hosts, @@ -81,7 +81,7 @@ def run_elasticsearch_changelog(args): worker.run() -def main(): +def main() -> None: parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "--api-host-url", default="http://localhost:9411/v0", help="fatcat API host/port to use" diff --git a/python/tests/import_common.py b/python/tests/import_common.py index cd89f914..6e66c295 100644 --- a/python/tests/import_common.py +++ b/python/tests/import_common.py @@ -6,7 +6,7 @@ import elasticsearch import fatcat_openapi_client import fuzzycat.matching import pytest -from fatcat_openapi_client import ReleaseEntity, ReleaseExtIds +from fatcat_openapi_client import ReleaseContrib, ReleaseEntity, ReleaseExtIds from fixtures import * from fatcat_tools.importers import EntityImporter diff --git a/python/tests/transform_csl.py b/python/tests/transform_csl.py index 2bf584ba..2b6fab31 100644 --- a/python/tests/transform_csl.py +++ b/python/tests/transform_csl.py @@ -1,4 +1,5 @@ import json +from typing import Any import pytest from fatcat_openapi_client import ReleaseEntity @@ -8,7 +9,7 @@ from import_crossref import crossref_importer from fatcat_tools.transforms import citeproc_csl, entity_from_json, release_to_csl -def test_csl_crossref(crossref_importer): +def test_csl_crossref(crossref_importer: Any) -> None: with open("tests/files/crossref-works.single.json", "r") as f: # not a single line raw = json.loads(f.read()) @@ -30,7 +31,7 @@ def test_csl_crossref(crossref_importer): citeproc_csl(csl, "csl-json") -def test_csl_pubmed(crossref_importer): +def test_csl_pubmed(crossref_importer: Any) -> None: with open("tests/files/example_releases_pubmed19n0972.json", "r") as f: # multiple single lines for line in f: @@ -42,7 +43,7 @@ def test_csl_pubmed(crossref_importer): citeproc_csl(csl, "harvard1", html=True) -def test_csl_pubmed_bibtex(crossref_importer): +def test_csl_pubmed_bibtex(crossref_importer: Any) -> None: with open("tests/files/example_releases_pubmed19n0972.json", "r") as f: r = entity_from_json(f.readline(), ReleaseEntity) csl = release_to_csl(r) |