From a39e4b864968fa73e475cc40af67203faef5236d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 26 Oct 2021 17:56:44 -0700
Subject: more progress on type annotations

---
 python/sandcrawler/db.py             |  6 +++---
 python/sandcrawler/grobid.py         |  4 +++-
 python/sandcrawler/html.py           |  7 +++++--
 python/sandcrawler/ingest_file.py    | 33 +++++++++++++++++++++------------
 python/sandcrawler/ingest_fileset.py |  5 +++--
 python/sandcrawler/pdfextract.py     | 30 ++++++++++++++++++------------
 python/sandcrawler/pdftrio.py        |  2 +-
 python/sandcrawler/workers.py        |  2 +-
 8 files changed, 55 insertions(+), 34 deletions(-)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index fed1024..05fedc6 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -108,7 +108,7 @@ class SandcrawlerPostgresClient:
     def commit(self) -> None:
         self.conn.commit()
 
-    def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str) -> Tuple[int, int]:
+    def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]:
         resp_codes = [int(r[0]) for r in resp]
         inserts = len([r for r in resp_codes if r == 0])
         if on_conflict == "update":
@@ -231,7 +231,7 @@ class SandcrawlerPostgresClient:
 
     def insert_pdf_meta(self,
                         cur: psycopg2.extensions.cursor,
-                        rows: List[Tuple[Any]],
+                        rows: List[Tuple],
                         on_conflict: str = "nothing") -> Tuple[int, int]:
         """
         batch elements are expected to have .to_sql_tuple() method
@@ -271,7 +271,7 @@ class SandcrawlerPostgresClient:
 
     def insert_html_meta(self,
                          cur: psycopg2.extensions.cursor,
-                         rows: List[Tuple[Any]],
+                         rows: List[Tuple],
                          on_conflict: str = "nothing") -> Tuple[int, int]:
         """
         batch elements are expected to have .to_sql_tuple() method
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index ae96fc8..f4d778f 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -14,7 +14,9 @@ class GrobidClient(object):
         self.host_url = host_url
         self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))
 
-    def process_fulltext(self, blob: bytes, consolidate_mode: Optional[int] = None) -> Dict[str, Any]:
+    def process_fulltext(self,
+                         blob: bytes,
+                         consolidate_mode: Optional[int] = None) -> Dict[str, Any]:
         """
         Returns dict with keys:
             - status_code
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 5b9742a..abd3d50 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -2,6 +2,7 @@ import json
 import re
 import sys
 import urllib.parse
+from typing import Dict
 
 from bs4 import BeautifulSoup
 
@@ -12,7 +13,7 @@ OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
 SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
 
 
-def extract_fulltext_url(html_url, html_body):
+def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
     """
     Takes an HTML document (and URL), assumed to be a landing page, and tries
     to find a fulltext PDF url.
@@ -335,12 +336,13 @@ def extract_fulltext_url(html_url, html_body):
     return dict()
 
 
-def test_regex():
+def test_regex() -> None:
     lines = """
     blah
     var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
     asdf"""
     m = OVID_JOURNAL_URL_REGEX.search(lines)
+    assert m
     assert m.group(
         1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
 
@@ -352,4 +354,5 @@ def test_regex():
     """
     url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
     m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+    assert m
     assert m.group(1) == url
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 281c6d3..9faf98b 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -53,23 +53,32 @@ class IngestFileWorker(SandcrawlerWorker):
         process_file_hit(ResourceResult) -> response
         process_grobid(ResourceResult)
     """
-    def __init__(self, sink=None, **kwargs):
+    def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
         super().__init__()
 
         self.sink = sink
-        self.wayback_client = kwargs.get('wayback_client')
-        if not self.wayback_client:
+
+        if kwargs.get('wayback_client'):
+            self.wayback_client: WaybackClient = kwargs['wayback_client']
+        else:
             self.wayback_client = WaybackClient()
-        self.spn_client = kwargs.get('spn_client')
-        if not self.spn_client:
+
+        if kwargs.get('spn_client'):
+            self.spn_client: SavePageNowClient = kwargs['spn_client']
+        else:
             self.spn_client = SavePageNowClient(
                 spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
-        self.grobid_client = kwargs.get('grobid_client')
-        if not self.grobid_client:
+
+        if kwargs.get('grobid_client'):
+            self.grobid_client: GrobidClient = kwargs['grobid_client']
+        else:
             self.grobid_client = GrobidClient()
-        self.pgrest_client = kwargs.get('pgrest_client')
-        if not self.pgrest_client:
+
+        if kwargs.get('pgrest_client'):
+            self.pgrest_client: SandcrawlerPostgrestClient = kwargs['pgrest_client']
+        else:
             self.pgrest_client = SandcrawlerPostgrestClient()
+
         self.grobid_sink = kwargs.get('grobid_sink')
         self.thumbnail_sink = kwargs.get('thumbnail_sink')
         self.pdftext_sink = kwargs.get('pdftext_sink')
@@ -213,9 +222,9 @@ class IngestFileWorker(SandcrawlerWorker):
             return None
 
     def find_resource(self,
-                      url,
-                      best_mimetype=None,
-                      force_recrawl=False) -> Optional[ResourceResult]:
+                      url: str,
+                      best_mimetype: Optional[str] = None,
+                      force_recrawl: bool = False) -> Optional[ResourceResult]:
         """
         Looks in wayback for a resource starting at the URL, following any
         redirects. If a hit isn't found, try crawling with SPN.
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index ea34948..defbeba 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -14,6 +14,7 @@ from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, Wayback
                             WaybackError, cdx_to_dict, fix_transfer_encoding)
 from sandcrawler.ingest_file import IngestFileWorker
 from sandcrawler.misc import clean_url, gen_file_metadata
+from sandcrawler.workers import SandcrawlerWorker
 
 MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
 
@@ -31,7 +32,7 @@ class IngestFilesetWorker(IngestFileWorker):
        checking to see if content has been archived already)
     4. summarize status
     """
-    def __init__(self, sink=None, **kwargs):
+    def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
         super().__init__(sink=None, **kwargs)
 
         self.sink = sink
@@ -246,7 +247,7 @@ class IngestFilesetWorker(IngestFileWorker):
                                                  base_url,
                                                  force_recrawl=force_recrawl)
         result['request'] = request
-        if result.get('status') != None:
+        if result.get('status') is not None:
             result['request'] = request
             return result
 
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 222a408..d23d231 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -3,18 +3,19 @@ import json
 import sys
 from dataclasses import dataclass
 from io import BytesIO
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 import poppler
 from PIL import Image
 
+from .ia import WaybackClient
 from .misc import gen_file_metadata
 from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 # This is a hack to work around timeouts when processing certain PDFs with
 # poppler. For some reason, the usual Kafka timeout catcher isn't working on
 # these, maybe due to threading.
-BAD_PDF_SHA1HEX = [
+BAD_PDF_SHA1HEX: List[str] = [
     "011478a1e63a2a31eae1a93832a74cc95f220760",
     "018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
     "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
@@ -185,8 +186,8 @@ class PdfExtractResult:
             'source': self.source,
         }
 
-    @classmethod
-    def from_pdftext_dict(cls, record):
+    @staticmethod
+    def from_pdftext_dict(record: Dict[str, Any]) -> 'PdfExtractResult':
         """
         Outputs a JSON string as would be published to Kafka text/info topic.
         """
@@ -208,8 +209,8 @@ class PdfExtractResult:
                 pdf_extra=record.get('pdf_extra'),
             )
 
-    @classmethod
-    def from_pdf_meta_dict(cls, record):
+    @staticmethod
+    def from_pdf_meta_dict(record: Dict[str, Any]) -> 'PdfExtractResult':
         """
         Parses what would be returned from postgrest
         """
@@ -270,7 +271,9 @@ class PdfExtractResult:
         )
 
 
-def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExtractResult:
+def process_pdf(blob: bytes,
+                thumb_size: Tuple[int, int] = (180, 300),
+                thumb_type: str = "JPEG") -> PdfExtractResult:
     """
     A known issue is that output text is in "physical layout" mode, which means
     columns will be side-by-side. We would prefer a single stream of tokens!
@@ -418,13 +421,16 @@ def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExt
 
 
 class PdfExtractWorker(SandcrawlerFetchWorker):
-    def __init__(self, wayback_client=None, sink=None, **kwargs):
+    def __init__(self,
+                 wayback_client: Optional[WaybackClient] = None,
+                 sink: Optional[SandcrawlerWorker] = None,
+                 **kwargs):
         super().__init__(wayback_client=wayback_client)
         self.wayback_client = wayback_client
         self.sink = sink
         self.thumbnail_sink = kwargs.get('thumbnail_sink')
 
-    def timeout_response(self, task) -> Dict:
+    def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]:
         default_key = task['sha1hex']
         return dict(
             status="error-timeout",
@@ -433,7 +439,7 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
             sha1hex=default_key,
         )
 
-    def process(self, record, key: Optional[str] = None):
+    def process(self, record: Any, key: Optional[str] = None) -> dict:
         fetch_result = self.fetch_blob(record)
         if fetch_result['status'] != 'success':
             return fetch_result
@@ -451,12 +457,12 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
     This is sort of like PdfExtractWorker, except it receives blobs directly,
     instead of fetching blobs from some remote store.
     """
-    def __init__(self, sink=None, **kwargs):
+    def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
         super().__init__()
         self.sink = sink
         self.thumbnail_sink = kwargs.get('thumbnail_sink')
 
-    def process(self, blob, key: Optional[str] = None):
+    def process(self, blob: Any, key: Optional[str] = None) -> Any:
         if not blob:
             return None
         assert isinstance(blob, bytes)
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7b18367..7d39f0f 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -51,7 +51,7 @@ class PdfTrioClient(object):
                 'error_msg': 'pdftrio request connection timout',
             }
 
-        info = dict(status_code=pdftrio_response.status_code, )
+        info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
         if pdftrio_response.status_code == 200:
             resp_json = pdftrio_response.json()
             assert 'ensemble_score' in resp_json
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 1b132ed..ba0358f 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -117,7 +117,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
     Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg,
     PDFs) from wayback, archive.org, or other sources.
     """
-    def __init__(self, wayback_client: WaybackClient, **kwargs):
+    def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
         super().__init__(**kwargs)
         self.wayback_client = wayback_client
 
-- 
cgit v1.2.3