diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/db.py | 6 | ||||
-rw-r--r-- | python/sandcrawler/grobid.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/html.py | 7 | ||||
-rw-r--r-- | python/sandcrawler/ingest_file.py | 33 | ||||
-rw-r--r-- | python/sandcrawler/ingest_fileset.py | 5 | ||||
-rw-r--r-- | python/sandcrawler/pdfextract.py | 30 | ||||
-rw-r--r-- | python/sandcrawler/pdftrio.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/workers.py | 2 |
8 files changed, 55 insertions, 34 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index fed1024..05fedc6 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -108,7 +108,7 @@ class SandcrawlerPostgresClient: def commit(self) -> None: self.conn.commit() - def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str) -> Tuple[int, int]: + def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]: resp_codes = [int(r[0]) for r in resp] inserts = len([r for r in resp_codes if r == 0]) if on_conflict == "update": @@ -231,7 +231,7 @@ class SandcrawlerPostgresClient: def insert_pdf_meta(self, cur: psycopg2.extensions.cursor, - rows: List[Tuple[Any]], + rows: List[Tuple], on_conflict: str = "nothing") -> Tuple[int, int]: """ batch elements are expected to have .to_sql_tuple() method @@ -271,7 +271,7 @@ class SandcrawlerPostgresClient: def insert_html_meta(self, cur: psycopg2.extensions.cursor, - rows: List[Tuple[Any]], + rows: List[Tuple], on_conflict: str = "nothing") -> Tuple[int, int]: """ batch elements are expected to have .to_sql_tuple() method diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index ae96fc8..f4d778f 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -14,7 +14,9 @@ class GrobidClient(object): self.host_url = host_url self.consolidate_mode = int(kwargs.get('consolidate_mode', 0)) - def process_fulltext(self, blob: bytes, consolidate_mode: Optional[int] = None) -> Dict[str, Any]: + def process_fulltext(self, + blob: bytes, + consolidate_mode: Optional[int] = None) -> Dict[str, Any]: """ Returns dict with keys: - status_code diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 5b9742a..abd3d50 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -2,6 +2,7 @@ import json import re import sys import urllib.parse +from typing import Dict from bs4 import BeautifulSoup @@ -12,7 +13,7 @@ OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";') SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';") -def extract_fulltext_url(html_url, html_body): +def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]: """ Takes an HTML document (and URL), assumed to be a landing page, and tries to find a fulltext PDF url. @@ -335,12 +336,13 @@ def extract_fulltext_url(html_url, html_body): return dict() -def test_regex(): +def test_regex() -> None: lines = """ blah var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"; asdf""" m = OVID_JOURNAL_URL_REGEX.search(lines) + assert m assert m.group( 1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689" @@ -352,4 +354,5 @@ def test_regex(): """ url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client" m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines) + assert m assert m.group(1) == url diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index 281c6d3..9faf98b 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -53,23 +53,32 @@ class IngestFileWorker(SandcrawlerWorker): process_file_hit(ResourceResult) -> response process_grobid(ResourceResult) """ - def __init__(self, sink=None, **kwargs): + def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs): super().__init__() self.sink = sink - self.wayback_client = kwargs.get('wayback_client') - if not self.wayback_client: + + if kwargs.get('wayback_client'): + self.wayback_client: WaybackClient = kwargs['wayback_client'] + else: self.wayback_client = WaybackClient() - self.spn_client = kwargs.get('spn_client') - if not self.spn_client: + + if kwargs.get('spn_client'): + self.spn_client: SavePageNowClient = kwargs['spn_client'] + else: self.spn_client = SavePageNowClient( spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0)) - self.grobid_client = kwargs.get('grobid_client') - if not self.grobid_client: + + if kwargs.get('grobid_client'): + self.grobid_client: GrobidClient = kwargs['grobid_client'] + else: self.grobid_client = GrobidClient() - self.pgrest_client = kwargs.get('pgrest_client') - if not self.pgrest_client: + + if kwargs.get('pgrest_client'): + self.pgrest_client: SandcrawlerPostgrestClient = kwargs['pgrest_client'] + else: self.pgrest_client = SandcrawlerPostgrestClient() + self.grobid_sink = kwargs.get('grobid_sink') self.thumbnail_sink = kwargs.get('thumbnail_sink') self.pdftext_sink = kwargs.get('pdftext_sink') @@ -213,9 +222,9 @@ class IngestFileWorker(SandcrawlerWorker): return None def find_resource(self, - url, - best_mimetype=None, - force_recrawl=False) -> Optional[ResourceResult]: + url: str, + best_mimetype: Optional[str] = None, + force_recrawl: bool = False) -> Optional[ResourceResult]: """ Looks in wayback for a resource starting at the URL, following any redirects. If a hit isn't found, try crawling with SPN. diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index ea34948..defbeba 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -14,6 +14,7 @@ from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, Wayback WaybackError, cdx_to_dict, fix_transfer_encoding) from sandcrawler.ingest_file import IngestFileWorker from sandcrawler.misc import clean_url, gen_file_metadata +from sandcrawler.worker import SandcrawlerWorker MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024 @@ -31,7 +32,7 @@ class IngestFilesetWorker(IngestFileWorker): checking to see if content has been archived already) 4. summarize status """ - def __init__(self, sink=None, **kwargs): + def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs): super().__init__(sink=None, **kwargs) self.sink = sink @@ -246,7 +247,7 @@ class IngestFilesetWorker(IngestFileWorker): base_url, force_recrawl=force_recrawl) result['request'] = request - if result.get('status') != None: + if result.get('status') is not None: result['request'] = request return result diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py index 222a408..d23d231 100644 --- a/python/sandcrawler/pdfextract.py +++ b/python/sandcrawler/pdfextract.py @@ -3,18 +3,19 @@ import json import sys from dataclasses import dataclass from io import BytesIO -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional, Tuple import poppler from PIL import Image +from .ia import WaybackClient from .misc import gen_file_metadata from .workers import SandcrawlerFetchWorker, SandcrawlerWorker # This is a hack to work around timeouts when processing certain PDFs with # poppler. For some reason, the usual Kafka timeout catcher isn't working on # these, maybe due to threading. -BAD_PDF_SHA1HEX = [ +BAD_PDF_SHA1HEX: List[str] = [ "011478a1e63a2a31eae1a93832a74cc95f220760", "018dfe9824de6d2ac068ce0f7dc9961bffa1b558", "057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53", @@ -185,8 +186,8 @@ class PdfExtractResult: 'source': self.source, } - @classmethod - def from_pdftext_dict(cls, record): + @staticmethod + def from_pdftext_dict(record: Dict[str, Any]) -> 'PdfExtractResult': """ Outputs a JSON string as would be published to Kafka text/info topic. """ @@ -208,8 +209,8 @@ class PdfExtractResult: pdf_extra=record.get('pdf_extra'), ) - @classmethod - def from_pdf_meta_dict(cls, record): + @staticmethod + def from_pdf_meta_dict(record: Dict[str, Any]) -> 'PdfExtractResult': """ Parses what would be returned from postgrest """ @@ -270,7 +271,9 @@ class PdfExtractResult: ) -def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExtractResult: +def process_pdf(blob: bytes, + thumb_size: Tuple[int, int] = (180, 300), + thumb_type: str = "JPEG") -> PdfExtractResult: """ A known issue is that output text is in "physical layout" mode, which means columns will be side-by-side. We would prefer a single stream of tokens! @@ -418,13 +421,16 @@ def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExt class PdfExtractWorker(SandcrawlerFetchWorker): - def __init__(self, wayback_client=None, sink=None, **kwargs): + def __init__(self, + wayback_client: Optional[WaybackClient] = None, + sink: Optional[SandcrawlerWorker] = None, + **kwargs): super().__init__(wayback_client=wayback_client) self.wayback_client = wayback_client self.sink = sink self.thumbnail_sink = kwargs.get('thumbnail_sink') - def timeout_response(self, task) -> Dict: + def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]: default_key = task['sha1hex'] return dict( status="error-timeout", @@ -433,7 +439,7 @@ class PdfExtractWorker(SandcrawlerFetchWorker): sha1hex=default_key, ) - def process(self, record, key: Optional[str] = None): + def process(self, record: Any, key: Optional[str] = None) -> dict: fetch_result = self.fetch_blob(record) if fetch_result['status'] != 'success': return fetch_result @@ -451,12 +457,12 @@ class PdfExtractBlobWorker(SandcrawlerWorker): This is sort of like PdfExtractWorker, except it receives blobs directly, instead of fetching blobs from some remote store. """ - def __init__(self, sink=None, **kwargs): + def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs): super().__init__() self.sink = sink self.thumbnail_sink = kwargs.get('thumbnail_sink') - def process(self, blob, key: Optional[str] = None): + def process(self, blob: Any, key: Optional[str] = None) -> Any: if not blob: return None assert isinstance(blob, bytes) diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 7b18367..7d39f0f 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -51,7 +51,7 @@ class PdfTrioClient(object): 'error_msg': 'pdftrio request connection timout', } - info = dict(status_code=pdftrio_response.status_code, ) + info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code) if pdftrio_response.status_code == 200: resp_json = pdftrio_response.json() assert 'ensemble_score' in resp_json diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py index 1b132ed..ba0358f 100644 --- a/python/sandcrawler/workers.py +++ b/python/sandcrawler/workers.py @@ -117,7 +117,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker): Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg, PDFs) from wayback, archive.org, or other sources. """ - def __init__(self, wayback_client: WaybackClient, **kwargs): + def __init__(self, wayback_client: Optional[WaybackClient], **kwargs): super().__init__(**kwargs) self.wayback_client = wayback_client |