From 4a46f166f8514b5620d2bcb13a5c5f3e6cee66c8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 26 Oct 2021 16:59:32 -0700
Subject: more progress on type annotations and linting

---
 python/ia_pdf_match.py                  |  9 +++++----
 python/sandcrawler/db.py                | 22 +++++++++++-----------
 python/sandcrawler/fileset_platforms.py |  7 ++++---
 python/sandcrawler/html_metadata.py     | 25 +++++++++++++------------
 python/sandcrawler/ia.py                |  4 ++--
 python/sandcrawler/ingest_file.py       |  2 ++
 python/sandcrawler/minio.py             | 24 ++++++++++++++++++++----
 python/sandcrawler/misc.py              | 16 ++++++++--------
 python/sandcrawler/pdftrio.py           | 29 ++++++++++++++++++++---------
 python/tests/test_ingest.py             |  2 +-
 python/tests/test_misc.py               |  2 +-
 11 files changed, 87 insertions(+), 55 deletions(-)
 (limited to 'python')

diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index c3d9c16..ac17003 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -23,9 +23,10 @@ When invoking import matched, be sure to:
 
 import json
 import sys
+from typing import Any, Dict, Optional
 
 
-def parse(obj):
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
     if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
         print('skip: test item', file=sys.stderr)
         return None
@@ -42,7 +43,7 @@ def parse(obj):
         extid = extid.replace('http://arxiv.org/abs/', '')
         #print(extid)
         assert '/' in extid or '.' in extid
-        if not 'v' in extid or not extid[-1].isdigit():
+        if 'v' not in extid or not extid[-1].isdigit():
            print('skip: non-versioned arxiv_id', file=sys.stderr)
            return None
     elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
@@ -97,13 +98,13 @@ def parse(obj):
     return match
 
 
-def run():
+def run() -> None:
     for line in sys.stdin:
         if not line:
             continue
         obj = json.loads(line)
         match = parse(obj)
-        if match:
+        if match is not None:
             print(json.dumps(match, sort_keys=True))
 
 
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ca2657..fed1024 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -99,7 +99,7 @@ class SandcrawlerPostgrestClient:
 
 
 class SandcrawlerPostgresClient:
-    def __init__(self, db_url, **kwargs):
+    def __init__(self, db_url: str, **kwargs):
         self.conn = psycopg2.connect(db_url)
 
     def cursor(self) -> psycopg2.extensions.cursor:
@@ -108,7 +108,7 @@ class SandcrawlerPostgresClient:
     def commit(self) -> None:
         self.conn.commit()
 
-    def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str):
+    def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str) -> Tuple[int, int]:
         resp_codes = [int(r[0]) for r in resp]
         inserts = len([r for r in resp_codes if r == 0])
         if on_conflict == "update":
@@ -120,7 +120,7 @@ class SandcrawlerPostgresClient:
     def insert_cdx(self,
                    cur: psycopg2.extensions.cursor,
                    batch: List[Dict[str, Any]],
-                   on_conflict: str = "nothing"):
+                   on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
@@ -149,7 +149,7 @@ class SandcrawlerPostgresClient:
     def insert_file_meta(self,
                          cur: psycopg2.extensions.cursor,
                          batch: List[Dict[str, Any]],
-                         on_conflict: str = "nothing"):
+                         on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
@@ -181,7 +181,7 @@ class SandcrawlerPostgresClient:
     def insert_grobid(self,
                       cur: psycopg2.extensions.cursor,
                       batch: List[Dict[str, Any]],
-                      on_conflict: str = "nothing"):
+                      on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO grobid
             (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
@@ -232,7 +232,7 @@ class SandcrawlerPostgresClient:
     def insert_pdf_meta(self,
                         cur: psycopg2.extensions.cursor,
                         rows: List[Tuple[Any]],
-                        on_conflict: str = "nothing"):
+                        on_conflict: str = "nothing") -> Tuple[int, int]:
         """
         batch elements are expected to have .to_sql_tuple() method
         """
@@ -272,7 +272,7 @@ class SandcrawlerPostgresClient:
     def insert_html_meta(self,
                          cur: psycopg2.extensions.cursor,
                          rows: List[Tuple[Any]],
-                         on_conflict: str = "nothing"):
+                         on_conflict: str = "nothing") -> Tuple[int, int]:
         """
         batch elements are expected to have .to_sql_tuple() method
         """
@@ -309,7 +309,7 @@ class SandcrawlerPostgresClient:
     def insert_pdftrio(self,
                        cur: psycopg2.extensions.cursor,
                        batch: List[Dict[str, Any]],
-                       on_conflict: str = "nothing"):
+                       on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO pdftrio
             (sha1hex, updated, status_code, status, pdftrio_version,
@@ -358,7 +358,7 @@ class SandcrawlerPostgresClient:
     def insert_ingest_request(self,
                               cur: psycopg2.extensions.cursor,
                               batch: List[Dict[str, Any]],
-                              on_conflict: str = "nothing"):
+                              on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO ingest_request
             (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
@@ -398,7 +398,7 @@ class SandcrawlerPostgresClient:
     def insert_ingest_file_result(self,
                                   cur: psycopg2.extensions.cursor,
                                   batch: List[Dict[str, Any]],
-                                  on_conflict: str = "nothing"):
+                                  on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO ingest_file_result
             (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
@@ -441,7 +441,7 @@ class SandcrawlerPostgresClient:
     def insert_ingest_fileset_platform(self,
                                        cur: psycopg2.extensions.cursor,
                                        batch: List[Dict[str, Any]],
-                                       on_conflict: str = "nothing"):
+                                       on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO ingest_fileset_platform
             (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index c97e639..6d66d81 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -4,7 +4,8 @@ from typing import Optional, Tuple
 import internetarchive
 import requests
 
-from sandcrawler.fileset_types import *
+from sandcrawler.fileset_types import (FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+                                       PlatformRestrictedError, PlatformScopeError)
 from sandcrawler.html_metadata import BiblioMetadata
 from sandcrawler.ia import ResourceResult
 
@@ -262,7 +263,7 @@ class DataverseHelper(FilesetPlatformHelper):
     )
 
 
-def test_parse_dataverse_persistentid():
+def test_parse_dataverse_persistentid() -> None:
 
     valid = {
         "doi:10.25625/LL6WXZ": {
@@ -465,7 +466,7 @@ class FigshareHelper(FilesetPlatformHelper):
     )
 
 
-def test_parse_figshare_url_path():
+def test_parse_figshare_url_path() -> None:
 
     valid = {
         "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1":
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 15a9f2b..ab0fd61 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,7 +1,7 @@
 import datetime
 import sys
 import urllib.parse
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import braveblock
 import dateparser
@@ -20,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal
 # order of these are mostly by preference/quality (best option first), though
 # also/sometimes re-ordered for lookup efficiency (lookup stops after first
 # match)
-HEAD_META_PATTERNS: Any = {
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
     "title": [
         "meta[name='citation_title']",
         "meta[name='eprints.title']",
@@ -151,7 +151,7 @@ HEAD_META_PATTERNS: Any = {
     ],
 }
 
-HEAD_META_LIST_PATTERNS: Any = {
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
     "contrib_names": [
         "meta[name='citation_author']",
         "meta[name='bepress_citation_author']",
@@ -170,7 +170,7 @@ HEAD_META_LIST_PATTERNS: Any = {
     ],
 }
 
-XML_FULLTEXT_PATTERNS: List[dict] = [
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "meta[name='citation_xml_url']",
         "attr": "content",
@@ -222,7 +222,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
-HTML_FULLTEXT_PATTERNS: List[dict] = [
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "meta[name='citation_fulltext_html_url']",
         "attr": "content",
@@ -249,7 +249,7 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
-COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "in_doc_url": "pensoft.net/article/",  # also /element/
         "in_fulltext_url": "/download/fig/",
@@ -262,7 +262,7 @@ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
         "in_doc_url": "/file.xhtml?persistentId",
         "in_fulltext_url": "/access/datafile/",
         "selector": "div.form-group code",
-        "use_body": True,
+        "use_body": "true",
         "technique": "Dataverse 'download URL'",
         "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
     },
 ]
 
 # This is a database of matching patterns. Most of these discovered by hand,
 # looking at OA journal content that failed to craw/ingest.
-PDF_FULLTEXT_PATTERNS: List[dict] = [
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "head meta[name='citation_pdf_url']",
         "attr": "content",
@@ -591,14 +591,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
-FULLTEXT_URL_PATTERNS_SKIP = [
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
     # wiley has a weird almost-blank page we don't want to loop on
     "://onlinelibrary.wiley.com/doi/pdf/"
     "://doi.org/"
     "://dx.doi.org/"
 ]
 
-RELEASE_TYPE_MAP = {
+RELEASE_TYPE_MAP: Dict[str, str] = {
     "research article": "article-journal",
     "text.serial.journal": "article-journal",
 }
@@ -807,7 +807,8 @@ def load_adblock_rules() -> braveblock.Adblocker:
     )
 
 
-def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str],
+                     type_name: str) -> List[Dict[str, str]]:
     resources = []
 
     for node in doc.css(selector):
@@ -831,7 +832,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name
 
 
 def html_extract_resources(doc_url: str, doc: HTMLParser,
-                           adblock: braveblock.Adblocker) -> list:
+                           adblock: braveblock.Adblocker) -> List[Dict[str, str]]:
     """
     This function tries to find all the important resources in a page.
     The presumption is that the HTML document is article fulltext, and we want the
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 04a1e3b..aa4752e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -314,7 +314,7 @@ class CdxApiClient:
         if not rows:
             return None
 
-        def _cdx_sort_key(r):
+        def _cdx_sort_key(r: CdxRow) -> tuple:
             """
             This is a function, not a lambda, because it captures
             best_mimetype. Will create a tuple that can be used to sort in
@@ -901,7 +901,7 @@ class SavePageNowClient:
     def save_url_now_v2(self,
                         request_url: str,
                         force_simple_get: Optional[int] = None,
-                        capture_outlinks: int = 0):
+                        capture_outlinks: int = 0) -> SavePageNowResult:
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
         at all, or raises an exception if there was an error with SPN itself.
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index bc8643b..281c6d3 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -364,6 +364,8 @@ class IngestFileWorker(SandcrawlerWorker):
 
         # Need to actually processes
         result = process_pdf(resource.body)
+        assert result.sha1hex == file_meta['sha1hex']
+        assert result.file_meta is not None
         assert result.file_meta['sha1hex'] == file_meta['sha1hex']
         if self.thumbnail_sink and result.page0_thumbnail is not None:
             self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 046db9e..1967ba3 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,11 +1,16 @@
 import hashlib
 import io
+from typing import Optional, Tuple, Union
 
 import minio
 
 
 class SandcrawlerMinioClient(object):
-    def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+    def __init__(self,
+                 host_url: str,
+                 access_key: str,
+                 secret_key: str,
+                 default_bucket: Optional[str] = None):
         """
         host is minio connection string (host:port)
         access and secret key are as expected
@@ -25,7 +30,7 @@ class SandcrawlerMinioClient(object):
         )
         self.default_bucket = default_bucket
 
-    def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+    def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
         if not extension:
             extension = ""
         if not prefix:
@@ -41,7 +46,13 @@ class SandcrawlerMinioClient(object):
         )
         return obj_path
 
-    def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+    def put_blob(self,
+                 folder: str,
+                 blob: Union[str, bytes],
+                 sha1hex: Optional[str] = None,
+                 extension: str = "",
+                 prefix: str = "",
+                 bucket: Optional[str] = None) -> Tuple[str, str]:
         """
         blob should be bytes
         sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -78,7 +89,12 @@ class SandcrawlerMinioClient(object):
         )
         return (bucket, obj_path)
 
-    def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+    def get_blob(self,
+                 folder: str,
+                 sha1hex: str,
+                 extension: str = "",
+                 prefix: str = "",
+                 bucket: str = None) -> bytes:
         """
         sha1hex is sha1 of the blob itself
 
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 5ca7a4b..83a4626 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,7 +2,7 @@ import base64
 import datetime
 import hashlib
 import os
-from typing import Optional
+from typing import List, Optional
 
 import magic
 import requests
@@ -166,7 +166,7 @@ def normalize_mime(raw: str) -> Optional[str]:
     return None
 
 
-def test_normalize_mime():
+def test_normalize_mime() -> None:
     assert normalize_mime("asdf") is None
     assert normalize_mime("application/pdf") == "application/pdf"
     assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -179,7 +179,7 @@ def test_normalize_mime():
     assert normalize_mime("binary/octet-stream") == "application/octet-stream"
 
 
-def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
     """
     This method always filters a few things out:
 
@@ -241,7 +241,7 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
 def test_parse_cdx_datetime() -> None:
     assert parse_cdx_datetime("") is None
     assert parse_cdx_datetime("asdf") is None
-    assert parse_cdx_datetime("19930203123045") != None
+    assert parse_cdx_datetime("19930203123045") is not None
     assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
                                                                      month=10,
                                                                      day=28,
@@ -266,10 +266,10 @@ def test_datetime_to_cdx() -> None:
         datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
 
 
-def requests_retry_session(retries=10,
-                           backoff_factor=3,
-                           status_forcelist=(500, 502, 504),
-                           session=None) -> requests.Session:
+def requests_retry_session(retries: int = 10,
+                           backoff_factor: int = 3,
+                           status_forcelist: List[int] = [500, 502, 504],
+                           session: requests.Session = None) -> requests.Session:
     """
     From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
     """
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index ba875cd..7b18367 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,17 +1,19 @@
 import time
+from typing import Any, Dict, Optional
 
 import requests
 
+from .ia import WaybackClient
 from .misc import gen_file_metadata, requests_retry_session
 from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 
 class PdfTrioClient(object):
-    def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+    def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
         self.host_url = host_url
         self.http_session = requests_retry_session(retries=3, backoff_factor=3)
 
-    def classify_pdf(self, blob, mode="auto"):
+    def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
         """
         Returns a dict with at least:
 
@@ -24,7 +26,7 @@ class PdfTrioClient(object):
         appropriately; an optional `error_msg` may also be set. For some other
         errors, like connection failure, an exception is raised.
         """
-        assert blob
+        assert blob and type(blob) == bytes
 
         try:
             pdftrio_response = requests.post(
@@ -68,12 +70,16 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
     """
     This class is basically copied directly from GrobidWorker
     """
-    def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
-        super().__init__(wayback_client=wayback_client)
+    def __init__(self,
+                 pdftrio_client: PdfTrioClient,
+                 wayback_client: Optional[WaybackClient] = None,
+                 sink: Optional[SandcrawlerWorker] = None,
+                 **kwargs):
+        super().__init__(wayback_client=wayback_client, **kwargs)
         self.pdftrio_client = pdftrio_client
         self.sink = sink
 
-    def process(self, record, key=None):
+    def process(self, record: Any, key: str = None) -> Any:
         start_process = time.time()
         fetch_sec = None
 
@@ -103,16 +109,21 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
     This is sort of like PdfTrioWorker, except it receives blobs directly,
     instead of fetching blobs from some remote store.
""" - def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs): - super().__init__() + def __init__(self, + pdftrio_client: PdfTrioClient, + sink: Optional[SandcrawlerWorker] = None, + mode: str = "auto", + **kwargs): + super().__init__(**kwargs) self.pdftrio_client = pdftrio_client self.sink = sink self.mode = mode - def process(self, blob, key=None): + def process(self, blob: Any, key: str = None) -> Any: start_process = time.time() if not blob: return None + assert isinstance(blob, bytes) result = dict() result['file_meta'] = gen_file_metadata(blob) result['key'] = result['file_meta']['sha1hex'] diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index f2318c2..617f2b4 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -105,7 +105,7 @@ def test_ingest_success(ingest_worker_pdf): assert 'fatcat_release' in resp['grobid'] assert 'grobid_version' not in resp['grobid']['metadata'] assert 'fatcat_release' not in resp['grobid']['metadata'] - assert not 'tei_xml' in resp['grobid'] + assert 'tei_xml' not in resp['grobid'] assert resp['pdf_meta']['status'] == "success" assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 assert resp['pdf_meta'].get('text') is None diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 7d3e755..5830dc9 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -87,7 +87,7 @@ def test_invalid_cdx(): print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" - assert parse_cdx_line(raw) == None + assert parse_cdx_line(raw) is None def test_clean_url(): -- cgit v1.2.3