Diffstat (limited to 'python')
-rwxr-xr-x  python/ia_pdf_match.py                   |  9
-rw-r--r--  python/sandcrawler/db.py                 | 22
-rw-r--r--  python/sandcrawler/fileset_platforms.py  |  7
-rw-r--r--  python/sandcrawler/html_metadata.py      | 25
-rw-r--r--  python/sandcrawler/ia.py                 |  4
-rw-r--r--  python/sandcrawler/ingest_file.py        |  2
-rw-r--r--  python/sandcrawler/minio.py              | 24
-rw-r--r--  python/sandcrawler/misc.py               | 16
-rw-r--r--  python/sandcrawler/pdftrio.py            | 29
-rw-r--r--  python/tests/test_ingest.py              |  2
-rw-r--r--  python/tests/test_misc.py                |  2
11 files changed, 87 insertions, 55 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index c3d9c16..ac17003 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -23,9 +23,10 @@ When invoking import matched, be sure to:
 
 import json
 import sys
+from typing import Any, Dict, Optional
 
 
-def parse(obj):
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
     if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
         print('skip: test item', file=sys.stderr)
         return None
@@ -42,7 +43,7 @@ def parse(obj):
         extid = extid.replace('http://arxiv.org/abs/', '')
         #print(extid)
         assert '/' in extid or '.' in extid
-        if not 'v' in extid or not extid[-1].isdigit():
+        if 'v' not in extid or not extid[-1].isdigit():
             print('skip: non-versioned arxiv_id', file=sys.stderr)
             return None
     elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
@@ -97,13 +98,13 @@ def parse(obj):
     return match
 
 
-def run():
+def run() -> None:
     for line in sys.stdin:
         if not line:
             continue
         obj = json.loads(line)
         match = parse(obj)
-        if match:
+        if match is not None:
             print(json.dumps(match, sort_keys=True))
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ca2657..fed1024 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -99,7 +99,7 @@ class SandcrawlerPostgrestClient:
 
 
 class SandcrawlerPostgresClient:
-    def __init__(self, db_url, **kwargs):
+    def __init__(self, db_url: str, **kwargs):
         self.conn = psycopg2.connect(db_url)
 
     def cursor(self) -> psycopg2.extensions.cursor:
@@ -108,7 +108,7 @@ class SandcrawlerPostgresClient:
     def commit(self) -> None:
         self.conn.commit()
 
-    def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str):
+    def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str) -> Tuple[int, int]:
         resp_codes = [int(r[0]) for r in resp]
         inserts = len([r for r in resp_codes if r == 0])
         if on_conflict == "update":
@@ -120,7 +120,7 @@ class SandcrawlerPostgresClient:
     def insert_cdx(self,
                    cur: psycopg2.extensions.cursor,
                    batch: List[Dict[str, Any]],
-                   on_conflict: str = "nothing"):
+                   on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
@@ -149,7 +149,7 @@ class SandcrawlerPostgresClient:
     def insert_file_meta(self,
                          cur: psycopg2.extensions.cursor,
                          batch: List[Dict[str, Any]],
-                         on_conflict: str = "nothing"):
+                         on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
@@ -181,7 +181,7 @@ class SandcrawlerPostgresClient:
     def insert_grobid(self,
                       cur: psycopg2.extensions.cursor,
                       batch: List[Dict[str, Any]],
-                      on_conflict: str = "nothing"):
+                      on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
@@ -232,7 +232,7 @@ class SandcrawlerPostgresClient:
     def insert_pdf_meta(self,
                         cur: psycopg2.extensions.cursor,
                         rows: List[Tuple[Any]],
-                        on_conflict: str = "nothing"):
+                        on_conflict: str = "nothing") -> Tuple[int, int]:
         """
         batch elements are expected to have .to_sql_tuple() method
         """
@@ -272,7 +272,7 @@ class SandcrawlerPostgresClient:
     def insert_html_meta(self,
                          cur: psycopg2.extensions.cursor,
                          rows: List[Tuple[Any]],
-                         on_conflict: str = "nothing"):
+                         on_conflict: str = "nothing") -> Tuple[int, int]:
         """
         batch elements are expected to have .to_sql_tuple() method
         """
@@ -309,7 +309,7 @@ class SandcrawlerPostgresClient:
     def insert_pdftrio(self,
                        cur: psycopg2.extensions.cursor,
                        batch: List[Dict[str, Any]],
-                       on_conflict: str = "nothing"):
+                       on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
@@ -358,7 +358,7 @@ class SandcrawlerPostgresClient:
     def insert_ingest_request(self,
                               cur: psycopg2.extensions.cursor,
                               batch: List[Dict[str, Any]],
-                              on_conflict: str = "nothing"):
+                              on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
@@ -398,7 +398,7 @@ class SandcrawlerPostgresClient:
     def insert_ingest_file_result(self,
                                   cur: psycopg2.extensions.cursor,
                                   batch: List[Dict[str, Any]],
-                                  on_conflict: str = "nothing"):
+                                  on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
@@ -441,7 +441,7 @@ class SandcrawlerPostgresClient:
     def insert_ingest_fileset_platform(self,
                                        cur: psycopg2.extensions.cursor,
                                        batch: List[Dict[str, Any]],
-                                       on_conflict: str = "nothing"):
+                                       on_conflict: str = "nothing") -> Tuple[int, int]:
         sql = """
             INSERT INTO
             ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index c97e639..6d66d81 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -4,7 +4,8 @@ from typing import Optional, Tuple
 
 import internetarchive
 import requests
 
-from sandcrawler.fileset_types import *
+from sandcrawler.fileset_types import (FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+                                       PlatformRestrictedError, PlatformScopeError)
 from sandcrawler.html_metadata import BiblioMetadata
 from sandcrawler.ia import ResourceResult
@@ -262,7 +263,7 @@ class DataverseHelper(FilesetPlatformHelper):
     )
 
 
-def test_parse_dataverse_persistentid():
+def test_parse_dataverse_persistentid() -> None:
 
     valid = {
         "doi:10.25625/LL6WXZ": {
@@ -465,7 +466,7 @@ class FigshareHelper(FilesetPlatformHelper):
     )
 
 
-def test_parse_figshare_url_path():
+def test_parse_figshare_url_path() -> None:
 
     valid = {
         "/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1":
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 15a9f2b..ab0fd61 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,7 +1,7 @@
 import datetime
 import sys
 import urllib.parse
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import braveblock
 import dateparser
@@ -20,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal
 # order of these are mostly by preference/quality (best option first), though
 # also/sometimes re-ordered for lookup efficiency (lookup stops after first
 # match)
-HEAD_META_PATTERNS: Any = {
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
     "title": [
         "meta[name='citation_title']",
         "meta[name='eprints.title']",
@@ -151,7 +151,7 @@ HEAD_META_PATTERNS: Any = {
     ],
 }
 
-HEAD_META_LIST_PATTERNS: Any = {
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
     "contrib_names": [
         "meta[name='citation_author']",
         "meta[name='bepress_citation_author']",
@@ -170,7 +170,7 @@ HEAD_META_LIST_PATTERNS: Any = {
     ],
 }
 
-XML_FULLTEXT_PATTERNS: List[dict] = [
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "meta[name='citation_xml_url']",
         "attr": "content",
@@ -222,7 +222,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
-HTML_FULLTEXT_PATTERNS: List[dict] = [
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "meta[name='citation_fulltext_html_url']",
         "attr": "content",
@@ -249,7 +249,7 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
-COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "in_doc_url": "pensoft.net/article/",  # also /element/
         "in_fulltext_url": "/download/fig/",
@@ -262,7 +262,7 @@ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
         "in_doc_url": "/file.xhtml?persistentId",
         "in_fulltext_url": "/access/datafile/",
         "selector": "div.form-group code",
-        "use_body": True,
+        "use_body": "true",
         "technique": "Dataverse 'download URL'",
         "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
     },
@@ -270,7 +270,7 @@ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
 
 # This is a database of matching patterns. Most of these discovered by hand,
 # looking at OA journal content that failed to craw/ingest.
-PDF_FULLTEXT_PATTERNS: List[dict] = [
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "head meta[name='citation_pdf_url']",
         "attr": "content",
@@ -591,14 +591,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
-FULLTEXT_URL_PATTERNS_SKIP = [
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
     # wiley has a weird almost-blank page we don't want to loop on
     "://onlinelibrary.wiley.com/doi/pdf/"
     "://doi.org/"
     "://dx.doi.org/"
 ]
 
-RELEASE_TYPE_MAP = {
+RELEASE_TYPE_MAP: Dict[str, str] = {
     "research article": "article-journal",
     "text.serial.journal": "article-journal",
 }
@@ -807,7 +807,8 @@ def load_adblock_rules() -> braveblock.Adblocker:
     )
 
 
-def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str],
+                     type_name: str) -> List[Dict[str, str]]:
     resources = []
 
     for node in doc.css(selector):
@@ -831,7 +832,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name
 
 def html_extract_resources(doc_url: str, doc: HTMLParser,
-                           adblock: braveblock.Adblocker) -> list:
+                           adblock: braveblock.Adblocker) -> List[Dict[str, str]]:
     """
     This function tries to find all the important resources in a page. The
     presumption is that the HTML document is article fulltext, and we want the
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 04a1e3b..aa4752e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -314,7 +314,7 @@ class CdxApiClient:
         if not rows:
             return None
 
-        def _cdx_sort_key(r):
+        def _cdx_sort_key(r: CdxRow) -> tuple:
             """
             This is a function, not a lambda, because it captures
             best_mimetype. Will create a tuple that can be used to sort in
@@ -901,7 +901,7 @@ class SavePageNowClient:
     def save_url_now_v2(self,
                         request_url: str,
                         force_simple_get: Optional[int] = None,
-                        capture_outlinks: int = 0):
+                        capture_outlinks: int = 0) -> SavePageNowResult:
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
         at all, or raises an exception if there was an error with SPN itself.
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index bc8643b..281c6d3 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -364,6 +364,8 @@ class IngestFileWorker(SandcrawlerWorker):
             # Need to actually processes
             result = process_pdf(resource.body)
+            assert result.sha1hex == file_meta['sha1hex']
+            assert result.file_meta is not None
             assert result.file_meta['sha1hex'] == file_meta['sha1hex']
             if self.thumbnail_sink and result.page0_thumbnail is not None:
                 self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 046db9e..1967ba3 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,11 +1,16 @@
 import hashlib
 import io
+from typing import Optional, Tuple, Union
 
 import minio
 
 
 class SandcrawlerMinioClient(object):
-    def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+    def __init__(self,
+                 host_url: str,
+                 access_key: str,
+                 secret_key: str,
+                 default_bucket: Optional[str] = None):
         """
         host is minio connection string (host:port)
         access and secret key are as expected
@@ -25,7 +30,7 @@ class SandcrawlerMinioClient(object):
         )
         self.default_bucket = default_bucket
 
-    def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+    def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
         if not extension:
             extension = ""
         if not prefix:
@@ -41,7 +46,13 @@ class SandcrawlerMinioClient(object):
         )
         return obj_path
 
-    def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+    def put_blob(self,
+                 folder: str,
+                 blob: Union[str, bytes],
+                 sha1hex: Optional[str] = None,
+                 extension: str = "",
+                 prefix: str = "",
+                 bucket: Optional[str] = None) -> Tuple[str, str]:
         """
         blob should be bytes
         sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -78,7 +89,12 @@ class SandcrawlerMinioClient(object):
         )
         return (bucket, obj_path)
 
-    def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+    def get_blob(self,
+                 folder: str,
+                 sha1hex: str,
+                 extension: str = "",
+                 prefix: str = "",
+                 bucket: str = None) -> bytes:
         """
         sha1hex is sha1 of the blob itself
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 5ca7a4b..83a4626 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,7 +2,7 @@ import base64
 import datetime
 import hashlib
 import os
-from typing import Optional
+from typing import List, Optional
 
 import magic
 import requests
@@ -166,7 +166,7 @@ def normalize_mime(raw: str) -> Optional[str]:
     return None
 
 
-def test_normalize_mime():
+def test_normalize_mime() -> None:
     assert normalize_mime("asdf") is None
     assert normalize_mime("application/pdf") == "application/pdf"
     assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -179,7 +179,7 @@ def test_normalize_mime():
     assert normalize_mime("binary/octet-stream") == "application/octet-stream"
 
 
-def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
     """
     This method always filters a few things out:
 
@@ -241,7 +241,7 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
 def test_parse_cdx_datetime() -> None:
     assert parse_cdx_datetime("") is None
     assert parse_cdx_datetime("asdf") is None
-    assert parse_cdx_datetime("19930203123045") != None
+    assert parse_cdx_datetime("19930203123045") is not None
     assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
                                                                      month=10,
                                                                      day=28,
@@ -266,10 +266,10 @@ def test_datetime_to_cdx() -> None:
         datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
 
 
-def requests_retry_session(retries=10,
-                           backoff_factor=3,
-                           status_forcelist=(500, 502, 504),
-                           session=None) -> requests.Session:
+def requests_retry_session(retries: int = 10,
+                           backoff_factor: int = 3,
+                           status_forcelist: List[int] = [500, 502, 504],
+                           session: requests.Session = None) -> requests.Session:
     """
     From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
     """
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index ba875cd..7b18367 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,17 +1,19 @@
 import time
+from typing import Any, Dict, Optional
 
 import requests
 
+from .ia import WaybackClient
 from .misc import gen_file_metadata, requests_retry_session
 from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 
 class PdfTrioClient(object):
-    def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+    def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
         self.host_url = host_url
         self.http_session = requests_retry_session(retries=3, backoff_factor=3)
 
-    def classify_pdf(self, blob, mode="auto"):
+    def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
         """
         Returns a dict with at least:
@@ -24,7 +26,7 @@ class PdfTrioClient(object):
         appropriately; an optional `error_msg` may also be set. For some other
         errors, like connection failure, an exception is raised.
         """
-        assert blob
+        assert blob and type(blob) == bytes
 
         try:
             pdftrio_response = requests.post(
@@ -68,12 +70,16 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
     """
     This class is basically copied directly from GrobidWorker
     """
-    def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
-        super().__init__(wayback_client=wayback_client)
+    def __init__(self,
+                 pdftrio_client: PdfTrioClient,
+                 wayback_client: Optional[WaybackClient] = None,
+                 sink: Optional[SandcrawlerWorker] = None,
+                 **kwargs):
+        super().__init__(wayback_client=wayback_client, **kwargs)
         self.pdftrio_client = pdftrio_client
         self.sink = sink
 
-    def process(self, record, key=None):
+    def process(self, record: Any, key: str = None) -> Any:
         start_process = time.time()
         fetch_sec = None
@@ -103,16 +109,21 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
     This is sort of like PdfTrioWorker, except it receives blobs directly,
     instead of fetching blobs from some remote store.
""" - def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs): - super().__init__() + def __init__(self, + pdftrio_client: PdfTrioClient, + sink: Optional[SandcrawlerWorker] = None, + mode: str = "auto", + **kwargs): + super().__init__(**kwargs) self.pdftrio_client = pdftrio_client self.sink = sink self.mode = mode - def process(self, blob, key=None): + def process(self, blob: Any, key: str = None) -> Any: start_process = time.time() if not blob: return None + assert isinstance(blob, bytes) result = dict() result['file_meta'] = gen_file_metadata(blob) result['key'] = result['file_meta']['sha1hex'] diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index f2318c2..617f2b4 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -105,7 +105,7 @@ def test_ingest_success(ingest_worker_pdf): assert 'fatcat_release' in resp['grobid'] assert 'grobid_version' not in resp['grobid']['metadata'] assert 'fatcat_release' not in resp['grobid']['metadata'] - assert not 'tei_xml' in resp['grobid'] + assert 'tei_xml' not in resp['grobid'] assert resp['pdf_meta']['status'] == "success" assert resp['pdf_meta']['pdf_extra']['page_count'] == 1 assert resp['pdf_meta'].get('text') is None diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py index 7d3e755..5830dc9 100644 --- a/python/tests/test_misc.py +++ b/python/tests/test_misc.py @@ -87,7 +87,7 @@ def test_invalid_cdx(): print("bad datetime") raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" - assert parse_cdx_line(raw) == None + assert parse_cdx_line(raw) is None def test_clean_url(): |