about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/db.py 6
-rw-r--r-- python/sandcrawler/grobid.py 4
-rw-r--r-- python/sandcrawler/html.py 7
-rw-r--r-- python/sandcrawler/ingest_file.py 33
-rw-r--r-- python/sandcrawler/ingest_fileset.py 5
-rw-r--r-- python/sandcrawler/pdfextract.py 30
-rw-r--r-- python/sandcrawler/pdftrio.py 2
-rw-r--r-- python/sandcrawler/workers.py 2
8 files changed, 55 insertions, 34 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index fed1024..05fedc6 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -108,7 +108,7 @@ class SandcrawlerPostgresClient:
def commit(self) -> None:
self.conn.commit()
- def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str) -> Tuple[int, int]:
+ def _inserts_and_updates(self, resp: List[Tuple], on_conflict: str) -> Tuple[int, int]:
resp_codes = [int(r[0]) for r in resp]
inserts = len([r for r in resp_codes if r == 0])
if on_conflict == "update":
@@ -231,7 +231,7 @@ class SandcrawlerPostgresClient:
def insert_pdf_meta(self,
cur: psycopg2.extensions.cursor,
- rows: List[Tuple[Any]],
+ rows: List[Tuple],
on_conflict: str = "nothing") -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
@@ -271,7 +271,7 @@ class SandcrawlerPostgresClient:
def insert_html_meta(self,
cur: psycopg2.extensions.cursor,
- rows: List[Tuple[Any]],
+ rows: List[Tuple],
on_conflict: str = "nothing") -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index ae96fc8..f4d778f 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -14,7 +14,9 @@ class GrobidClient(object):
self.host_url = host_url
self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))
- def process_fulltext(self, blob: bytes, consolidate_mode: Optional[int] = None) -> Dict[str, Any]:
+ def process_fulltext(self,
+ blob: bytes,
+ consolidate_mode: Optional[int] = None) -> Dict[str, Any]:
"""
Returns dict with keys:
- status_code
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 5b9742a..abd3d50 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -2,6 +2,7 @@ import json
import re
import sys
import urllib.parse
+from typing import Dict
from bs4 import BeautifulSoup
@@ -12,7 +13,7 @@ OVID_JOURNAL_URL_REGEX = re.compile(r'journalURL = "(http.*)";')
SCIENCEDIRECT_BOUNCE_URL_REGEX = re.compile(r"window.location = '(http.*)';")
-def extract_fulltext_url(html_url, html_body):
+def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
"""
Takes an HTML document (and URL), assumed to be a landing page, and tries
to find a fulltext PDF url.
@@ -335,12 +336,13 @@ def extract_fulltext_url(html_url, html_body):
return dict()
-def test_regex():
+def test_regex() -> None:
lines = """
blah
var journalURL = "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689";
asdf"""
m = OVID_JOURNAL_URL_REGEX.search(lines)
+ assert m
assert m.group(
1) == "https://journals.lww.com/co-urology/fulltext/10.1097/MOU.0000000000000689"
@@ -352,4 +354,5 @@ def test_regex():
"""
url = "https://pdf.sciencedirectassets.com/320270/AIP/1-s2.0-S2590109519300424/main.pdf?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEH0aCXVzLWVhc3QtMSJGMEQCICBF0dnrtKfpcs3T1kOjMS9w9gedqiLBrcbp4aKQSP8fAiAT9G426t6FWXHO2zPSXRFLq2eiqgbew2vkNKbcn87teyq9Awj1%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAIaDDA1OTAwMzU0Njg2NSIMnZcTRhbvMwF%2F5PA5KpEDdN%2FDI4V%2BNMDWQDFeAdUc99Lyxak%2B6vhAsfCBCf8hhvrRpalz75e74%2FXMAQwMN9m6i98o0Ljv9od7cuQEy8t%2B0DLzjzX5n3%2FxmpttowhMUm1jc8tBniLKBjwhTyiSHwhdeaVZf6x2zCJ0EIOWMNJHp3iFEqpaFvkRZbC1KWK4XPNNKo72HCvXuG7xmGrdHByz91AP7UgIYCy4hT10fnM43gbOE4wW8fqpgnvwCId%2F2u8k4rQoCLBqLYZzqshCRm1DBbsXCQhTwDXiMC2Ek3f63yKgw7rRCAxvs0vqirG%2B4mJ6LADaztAFMtKDPfnd4e%2B7%2FvnKU2NeotrqrkRgOkIAoFumbQXf20ky6mKWyHBk%2FxirVp60vUcLQpUm2Pcp6ythYxUi9IJxRGX8EF6aV4UHuCpUDUE7o8N84KUXIedUpytUZx7Xoxfk9w%2BR3%2FgX4LEHfkrWgiFAS3bVxNGOeV7GTwcXdcAggbdCaiAe46dfv7DDedx0KhVKOPH7obfvShqd6TYc0BjrV4sx61594ZJ3%2FO0ws7Lj8AU67AF17%2B1NZ3Ugu%2BwG9Ys9s7OxG8E4kBJ58vEY1yuBOQK9y2we4%2FTGPuqSxCuezqA%2BseslXYP%2FRc%2FZL9xx%2FUYaSjZhk1p1mhojxgBrckJYU7d8c4ELMPmtVy6R1yd2VDUoawEU8SB7nbNnMKzqQ3RgGgqGJiELys6dt%2FIr%2BVhpqM%2FZT4zadvzs8P%2FLoGzUHJKNZt0f99wLvZilphV92E%2BOUnwC4wbg3i3af3zozULwgEr7T%2FX2VsyREgexlzk76qMALPn0lgnciUyyQXxyUWAilXYQ0mQdXefh9lFfycczvt0UEuarX9p1sMwl8Ve5aw%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Date=20200110T210936Z&X-Amz-SignedHeaders=host&X-Amz-Expires=300&X-Amz-Credential=ASIAQ3PHCVTY23CMDBNC%2F20200110%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Signature=b43525576e1a0fdbab581481a3fe6db2862cbb2c69f2860b70cc8d444ccd73d5&hash=ccd128dfe597e704224bdfb4b3358de29b2be5d95887c71076bdab1236ba9e42&host=68042c943591013ac2b2430a89b270f6af2c76d8dfd086a07176afe7c76c2c61&pii=S2590109519300424&tid=spdf-74468ebd-6be6-43ac-b294-ced86e8eea58&sid=f9676d658285a749c46b6d081d965bb12aa8gxrqa&type=client"
m = SCIENCEDIRECT_BOUNCE_URL_REGEX.search(lines)
+ assert m
assert m.group(1) == url
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 281c6d3..9faf98b 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -53,23 +53,32 @@ class IngestFileWorker(SandcrawlerWorker):
process_file_hit(ResourceResult) -> response
process_grobid(ResourceResult)
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__()
self.sink = sink
- self.wayback_client = kwargs.get('wayback_client')
- if not self.wayback_client:
+
+ if kwargs.get('wayback_client'):
+ self.wayback_client: WaybackClient = kwargs['wayback_client']
+ else:
self.wayback_client = WaybackClient()
- self.spn_client = kwargs.get('spn_client')
- if not self.spn_client:
+
+ if kwargs.get('spn_client'):
+ self.spn_client: SavePageNowClient = kwargs['spn_client']
+ else:
self.spn_client = SavePageNowClient(
spn_cdx_retry_sec=kwargs.get('spn_cdx_retry_sec', 9.0))
- self.grobid_client = kwargs.get('grobid_client')
- if not self.grobid_client:
+
+ if kwargs.get('grobid_client'):
+ self.grobid_client: GrobidClient = kwargs['grobid_client']
+ else:
self.grobid_client = GrobidClient()
- self.pgrest_client = kwargs.get('pgrest_client')
- if not self.pgrest_client:
+
+ if kwargs.get('pgrest_client'):
+ self.pgrest_client: SandcrawlerPostgrestClient = kwargs['pgrest_client']
+ else:
self.pgrest_client = SandcrawlerPostgrestClient()
+
self.grobid_sink = kwargs.get('grobid_sink')
self.thumbnail_sink = kwargs.get('thumbnail_sink')
self.pdftext_sink = kwargs.get('pdftext_sink')
@@ -213,9 +222,9 @@ class IngestFileWorker(SandcrawlerWorker):
return None
def find_resource(self,
- url,
- best_mimetype=None,
- force_recrawl=False) -> Optional[ResourceResult]:
+ url: str,
+ best_mimetype: Optional[str] = None,
+ force_recrawl: bool = False) -> Optional[ResourceResult]:
"""
Looks in wayback for a resource starting at the URL, following any
redirects. If a hit isn't found, try crawling with SPN.
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index ea34948..defbeba 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -14,6 +14,7 @@ from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, Wayback
WaybackError, cdx_to_dict, fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
from sandcrawler.misc import clean_url, gen_file_metadata
+from sandcrawler.worker import SandcrawlerWorker
MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
@@ -31,7 +32,7 @@ class IngestFilesetWorker(IngestFileWorker):
checking to see if content has been archived already)
4. summarize status
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__(sink=None, **kwargs)
self.sink = sink
@@ -246,7 +247,7 @@ class IngestFilesetWorker(IngestFileWorker):
base_url,
force_recrawl=force_recrawl)
result['request'] = request
- if result.get('status') != None:
+ if result.get('status') is not None:
result['request'] = request
return result
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 222a408..d23d231 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -3,18 +3,19 @@ import json
import sys
from dataclasses import dataclass
from io import BytesIO
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional, Tuple
import poppler
from PIL import Image
+from .ia import WaybackClient
from .misc import gen_file_metadata
from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
# these, maybe due to threading.
-BAD_PDF_SHA1HEX = [
+BAD_PDF_SHA1HEX: List[str] = [
"011478a1e63a2a31eae1a93832a74cc95f220760",
"018dfe9824de6d2ac068ce0f7dc9961bffa1b558",
"057c7a9dfb611bfd52f7de6c39b2d5757c5e4e53",
@@ -185,8 +186,8 @@ class PdfExtractResult:
'source': self.source,
}
- @classmethod
- def from_pdftext_dict(cls, record):
+ @staticmethod
+ def from_pdftext_dict(record: Dict[str, Any]) -> 'PdfExtractResult':
"""
Outputs a JSON string as would be published to Kafka text/info topic.
"""
@@ -208,8 +209,8 @@ class PdfExtractResult:
pdf_extra=record.get('pdf_extra'),
)
- @classmethod
- def from_pdf_meta_dict(cls, record):
+ @staticmethod
+ def from_pdf_meta_dict(record: Dict[str, Any]) -> 'PdfExtractResult':
"""
Parses what would be returned from postgrest
"""
@@ -270,7 +271,9 @@ class PdfExtractResult:
)
-def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExtractResult:
+def process_pdf(blob: bytes,
+ thumb_size: Tuple[int, int] = (180, 300),
+ thumb_type: str = "JPEG") -> PdfExtractResult:
"""
A known issue is that output text is in "physical layout" mode, which means
columns will be side-by-side. We would prefer a single stream of tokens!
@@ -418,13 +421,16 @@ def process_pdf(blob: bytes, thumb_size=(180, 300), thumb_type="JPEG") -> PdfExt
class PdfExtractWorker(SandcrawlerFetchWorker):
- def __init__(self, wayback_client=None, sink=None, **kwargs):
+ def __init__(self,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs):
super().__init__(wayback_client=wayback_client)
self.wayback_client = wayback_client
self.sink = sink
self.thumbnail_sink = kwargs.get('thumbnail_sink')
- def timeout_response(self, task) -> Dict:
+ def timeout_response(self, task: Dict[str, Any]) -> Dict[str, Any]:
default_key = task['sha1hex']
return dict(
status="error-timeout",
@@ -433,7 +439,7 @@ class PdfExtractWorker(SandcrawlerFetchWorker):
sha1hex=default_key,
)
- def process(self, record, key: Optional[str] = None):
+ def process(self, record: Any, key: Optional[str] = None) -> dict:
fetch_result = self.fetch_blob(record)
if fetch_result['status'] != 'success':
return fetch_result
@@ -451,12 +457,12 @@ class PdfExtractBlobWorker(SandcrawlerWorker):
This is sort of like PdfExtractWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, sink=None, **kwargs):
+ def __init__(self, sink: Optional[SandcrawlerWorker] = None, **kwargs):
super().__init__()
self.sink = sink
self.thumbnail_sink = kwargs.get('thumbnail_sink')
- def process(self, blob, key: Optional[str] = None):
+ def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
assert isinstance(blob, bytes)
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 7b18367..7d39f0f 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -51,7 +51,7 @@ class PdfTrioClient(object):
'error_msg': 'pdftrio request connection timout',
}
- info = dict(status_code=pdftrio_response.status_code, )
+ info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
if pdftrio_response.status_code == 200:
resp_json = pdftrio_response.json()
assert 'ensemble_score' in resp_json
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 1b132ed..ba0358f 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -117,7 +117,7 @@ class SandcrawlerFetchWorker(SandcrawlerWorker):
Wrapper of SandcrawlerWorker that adds a helper method to fetch blobs (eg,
PDFs) from wayback, archive.org, or other sources.
"""
- def __init__(self, wayback_client: WaybackClient, **kwargs):
+ def __init__(self, wayback_client: Optional[WaybackClient], **kwargs):
super().__init__(**kwargs)
self.wayback_client = wayback_client