Diffstat (limited to 'python')

-rwxr-xr-x  python/ia_pdf_match.py                    9
-rw-r--r--  python/sandcrawler/db.py                 22
-rw-r--r--  python/sandcrawler/fileset_platforms.py   7
-rw-r--r--  python/sandcrawler/html_metadata.py      25
-rw-r--r--  python/sandcrawler/ia.py                  4
-rw-r--r--  python/sandcrawler/ingest_file.py         2
-rw-r--r--  python/sandcrawler/minio.py              24
-rw-r--r--  python/sandcrawler/misc.py               16
-rw-r--r--  python/sandcrawler/pdftrio.py            29
-rw-r--r--  python/tests/test_ingest.py               2
-rw-r--r--  python/tests/test_misc.py                 2

11 files changed, 87 insertions, 55 deletions
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index c3d9c16..ac17003 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -23,9 +23,10 @@ When invoking import matched, be sure to:
import json
import sys
+from typing import Any, Dict, Optional
-def parse(obj):
+def parse(obj: dict) -> Optional[Dict[str, Any]]:
if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
print('skip: test item', file=sys.stderr)
return None
@@ -42,7 +43,7 @@ def parse(obj):
extid = extid.replace('http://arxiv.org/abs/', '')
#print(extid)
assert '/' in extid or '.' in extid
- if not 'v' in extid or not extid[-1].isdigit():
+ if 'v' not in extid or not extid[-1].isdigit():
print('skip: non-versioned arxiv_id', file=sys.stderr)
return None
elif obj['metadata']['identifier'].startswith('paper-doi-10_'):
@@ -97,13 +98,13 @@ def parse(obj):
return match
-def run():
+def run() -> None:
for line in sys.stdin:
if not line:
continue
obj = json.loads(line)
match = parse(obj)
- if match:
+ if match is not None:
print(json.dumps(match, sort_keys=True))
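
For context, parse() and run() above form a small stdin-to-stdout filter over JSON lines. A minimal sketch of that pattern under the new annotations; the body of parse() here is illustrative, not the real matching logic:

    import json
    import sys
    from typing import Any, Dict, Optional

    def parse(obj: dict) -> Optional[Dict[str, Any]]:
        # Return a match dict for importable items, or None to skip this line.
        if obj['metadata']['identifier'].endswith('-test'):
            return None
        return {'identifier': obj['metadata']['identifier']}

    def run() -> None:
        for line in sys.stdin:
            if not line:
                continue
            match = parse(json.loads(line))
            if match is not None:
                print(json.dumps(match, sort_keys=True))
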
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ca2657..fed1024 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -99,7 +99,7 @@ class SandcrawlerPostgrestClient:
class SandcrawlerPostgresClient:
- def __init__(self, db_url, **kwargs):
+ def __init__(self, db_url: str, **kwargs):
self.conn = psycopg2.connect(db_url)
def cursor(self) -> psycopg2.extensions.cursor:
@@ -108,7 +108,7 @@ class SandcrawlerPostgresClient:
def commit(self) -> None:
self.conn.commit()
- def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str):
+ def _inserts_and_updates(self, resp: List[Tuple[Any]], on_conflict: str) -> Tuple[int, int]:
resp_codes = [int(r[0]) for r in resp]
inserts = len([r for r in resp_codes if r == 0])
if on_conflict == "update":
@@ -120,7 +120,7 @@ class SandcrawlerPostgresClient:
def insert_cdx(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
cdx (url, datetime, sha1hex, mimetype, warc_path, warc_csize, warc_offset)
@@ -149,7 +149,7 @@ class SandcrawlerPostgresClient:
def insert_file_meta(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
file_meta(sha1hex, sha256hex, md5hex, size_bytes, mimetype)
@@ -181,7 +181,7 @@ class SandcrawlerPostgresClient:
def insert_grobid(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
grobid (sha1hex, grobid_version, status_code, status, fatcat_release, updated, metadata)
@@ -232,7 +232,7 @@ class SandcrawlerPostgresClient:
def insert_pdf_meta(self,
cur: psycopg2.extensions.cursor,
rows: List[Tuple[Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -272,7 +272,7 @@ class SandcrawlerPostgresClient:
def insert_html_meta(self,
cur: psycopg2.extensions.cursor,
rows: List[Tuple[Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
"""
batch elements are expected to have .to_sql_tuple() method
"""
@@ -309,7 +309,7 @@ class SandcrawlerPostgresClient:
def insert_pdftrio(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
pdftrio (sha1hex, updated, status_code, status, pdftrio_version,
@@ -358,7 +358,7 @@ class SandcrawlerPostgresClient:
def insert_ingest_request(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_request (link_source, link_source_id, ingest_type, base_url, ingest_request_source, release_stage, request)
@@ -398,7 +398,7 @@ class SandcrawlerPostgresClient:
def insert_ingest_file_result(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_file_result (ingest_type, base_url, hit, status, terminal_url, terminal_dt, terminal_status_code, terminal_sha1hex)
@@ -441,7 +441,7 @@ class SandcrawlerPostgresClient:
def insert_ingest_fileset_platform(self,
cur: psycopg2.extensions.cursor,
batch: List[Dict[str, Any]],
- on_conflict: str = "nothing"):
+ on_conflict: str = "nothing") -> Tuple[int, int]:
sql = """
INSERT INTO
ingest_fileset_platform (ingest_type, base_url, hit, status, platform_name, platform_domain, platform_id, ingest_strategy, total_size, file_count, archiveorg_item_name, archiveorg_item_bundle_path, web_bundle_url, web_bundle_dt, manifest)
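
Every insert_* helper above now advertises a Tuple[int, int] return: rows inserted vs. rows updated, as computed by _inserts_and_updates(). A hedged sketch of a caller consuming that pair; the batch contents and helper name are placeholders, only the method signatures come from the diff:

    from typing import Any, Dict, List

    from sandcrawler.db import SandcrawlerPostgresClient

    def load_file_meta(db: SandcrawlerPostgresClient, batch: List[Dict[str, Any]]) -> None:
        # insert_file_meta() returns (inserted, updated) counts for the batch.
        cur = db.cursor()
        inserted, updated = db.insert_file_meta(cur, batch, on_conflict="update")
        db.commit()
        print(f"file_meta: {inserted} inserted, {updated} updated")
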
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index c97e639..6d66d81 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -4,7 +4,8 @@ from typing import Optional, Tuple
import internetarchive
import requests
-from sandcrawler.fileset_types import *
+from sandcrawler.fileset_types import (FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+ PlatformRestrictedError, PlatformScopeError)
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
@@ -262,7 +263,7 @@ class DataverseHelper(FilesetPlatformHelper):
)
-def test_parse_dataverse_persistentid():
+def test_parse_dataverse_persistentid() -> None:
valid = {
"doi:10.25625/LL6WXZ": {
@@ -465,7 +466,7 @@ class FigshareHelper(FilesetPlatformHelper):
)
-def test_parse_figshare_url_path():
+def test_parse_figshare_url_path() -> None:
valid = {
"/articles/Optimized_protocol_to_isolate_high_quality_genomic_DNA_from_different_tissues_of_a_palm_species/8987858/1":
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 15a9f2b..ab0fd61 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,7 +1,7 @@
import datetime
import sys
import urllib.parse
-from typing import Any, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple
import braveblock
import dateparser
@@ -20,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal
# order of these are mostly by preference/quality (best option first), though
# also/sometimes re-ordered for lookup efficiency (lookup stops after first
# match)
-HEAD_META_PATTERNS: Any = {
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
"title": [
"meta[name='citation_title']",
"meta[name='eprints.title']",
@@ -151,7 +151,7 @@ HEAD_META_PATTERNS: Any = {
],
}
-HEAD_META_LIST_PATTERNS: Any = {
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
"contrib_names": [
"meta[name='citation_author']",
"meta[name='bepress_citation_author']",
@@ -170,7 +170,7 @@ HEAD_META_LIST_PATTERNS: Any = {
],
}
-XML_FULLTEXT_PATTERNS: List[dict] = [
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "meta[name='citation_xml_url']",
"attr": "content",
@@ -222,7 +222,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
},
]
-HTML_FULLTEXT_PATTERNS: List[dict] = [
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "meta[name='citation_fulltext_html_url']",
"attr": "content",
@@ -249,7 +249,7 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
},
]
-COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"in_doc_url": "pensoft.net/article/", # also /element/
"in_fulltext_url": "/download/fig/",
@@ -262,7 +262,7 @@ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
"in_doc_url": "/file.xhtml?persistentId",
"in_fulltext_url": "/access/datafile/",
"selector": "div.form-group code",
- "use_body": True,
+ "use_body": "true",
"technique": "Dataverse 'download URL'",
"example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
},
@@ -270,7 +270,7 @@ COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
# This is a database of matching patterns. Most of these discovered by hand,
# looking at OA journal content that failed to craw/ingest.
-PDF_FULLTEXT_PATTERNS: List[dict] = [
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
{
"selector": "head meta[name='citation_pdf_url']",
"attr": "content",
@@ -591,14 +591,14 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
},
]
-FULLTEXT_URL_PATTERNS_SKIP = [
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
# wiley has a weird almost-blank page we don't want to loop on
"://onlinelibrary.wiley.com/doi/pdf/"
"://doi.org/"
"://dx.doi.org/"
]
-RELEASE_TYPE_MAP = {
+RELEASE_TYPE_MAP: Dict[str, str] = {
"research article": "article-journal",
"text.serial.journal": "article-journal",
}
@@ -807,7 +807,8 @@ def load_adblock_rules() -> braveblock.Adblocker:
)
-def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str],
+ type_name: str) -> List[Dict[str, str]]:
resources = []
for node in doc.css(selector):
@@ -831,7 +832,7 @@ def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name
def html_extract_resources(doc_url: str, doc: HTMLParser,
- adblock: braveblock.Adblocker) -> list:
+ adblock: braveblock.Adblocker) -> List[Dict[str, str]]:
"""
This function tries to find all the important resources in a page. The
presumption is that the HTML document is article fulltext, and we want the
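
The pattern tables above are plain dicts and lists of CSS selectors. A minimal sketch of how a table shaped like HEAD_META_PATTERNS could be evaluated, assuming the HTMLParser in use is selectolax (as the doc.css() calls suggest); the real html_biblio extraction in this module is more involved:

    from typing import Dict, List, Optional

    from selectolax.parser import HTMLParser

    def first_meta_match(doc: HTMLParser, patterns: Dict[str, List[str]]) -> Dict[str, Optional[str]]:
        # For each field, stop at the first selector whose node has a non-empty 'content' attribute.
        out: Dict[str, Optional[str]] = {}
        for field, selectors in patterns.items():
            out[field] = None
            for selector in selectors:
                node = doc.css_first(selector)
                if node is not None and node.attributes.get('content'):
                    out[field] = node.attributes['content']
                    break
        return out
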
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 04a1e3b..aa4752e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -314,7 +314,7 @@ class CdxApiClient:
if not rows:
return None
- def _cdx_sort_key(r):
+ def _cdx_sort_key(r: CdxRow) -> tuple:
"""
This is a function, not a lambda, because it captures
best_mimetype. Will create a tuple that can be used to sort in
@@ -901,7 +901,7 @@ class SavePageNowClient:
def save_url_now_v2(self,
request_url: str,
force_simple_get: Optional[int] = None,
- capture_outlinks: int = 0):
+ capture_outlinks: int = 0) -> SavePageNowResult:
"""
Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
at all, or raises an exception if there was an error with SPN itself.
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index bc8643b..281c6d3 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -364,6 +364,8 @@ class IngestFileWorker(SandcrawlerWorker):
# Need to actually processes
result = process_pdf(resource.body)
+ assert result.sha1hex == file_meta['sha1hex']
+ assert result.file_meta is not None
assert result.file_meta['sha1hex'] == file_meta['sha1hex']
if self.thumbnail_sink and result.page0_thumbnail is not None:
self.thumbnail_sink.push_record(result.page0_thumbnail, key=result.sha1hex)
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index 046db9e..1967ba3 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,11 +1,16 @@
import hashlib
import io
+from typing import Optional, Tuple, Union
import minio
class SandcrawlerMinioClient(object):
- def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+ def __init__(self,
+ host_url: str,
+ access_key: str,
+ secret_key: str,
+ default_bucket: Optional[str] = None):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -25,7 +30,7 @@ class SandcrawlerMinioClient(object):
)
self.default_bucket = default_bucket
- def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+ def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
if not extension:
extension = ""
if not prefix:
@@ -41,7 +46,13 @@ class SandcrawlerMinioClient(object):
)
return obj_path
- def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+ def put_blob(self,
+ folder: str,
+ blob: Union[str, bytes],
+ sha1hex: Optional[str] = None,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None) -> Tuple[str, str]:
"""
blob should be bytes
sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -78,7 +89,12 @@ class SandcrawlerMinioClient(object):
)
return (bucket, obj_path)
- def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+ def get_blob(self,
+ folder: str,
+ sha1hex: str,
+ extension: str = "",
+ prefix: str = "",
+ bucket: str = None) -> bytes:
"""
sha1hex is sha1 of the blob itself
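
With the annotations above, put_blob() returns the (bucket, object path) pair and get_blob() returns the raw bytes. A hedged usage sketch; endpoint, credentials, and bucket name are placeholders, only the method signatures come from the diff, and a reachable MinIO/S3 service is assumed:

    import hashlib

    from sandcrawler.minio import SandcrawlerMinioClient

    client = SandcrawlerMinioClient(
        host_url="localhost:9000",      # placeholder host:port
        access_key="minioadmin",        # placeholder credentials
        secret_key="minioadmin",
        default_bucket="sandcrawler",   # placeholder bucket name
    )
    pdf_bytes = b"%PDF-1.4 minimal example"
    sha1hex = hashlib.sha1(pdf_bytes).hexdigest()
    bucket, obj_path = client.put_blob("pdf", pdf_bytes, sha1hex=sha1hex, extension=".pdf")
    assert client.get_blob("pdf", sha1hex, extension=".pdf") == pdf_bytes
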
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 5ca7a4b..83a4626 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -2,7 +2,7 @@ import base64
import datetime
import hashlib
import os
-from typing import Optional
+from typing import List, Optional
import magic
import requests
@@ -166,7 +166,7 @@ def normalize_mime(raw: str) -> Optional[str]:
return None
-def test_normalize_mime():
+def test_normalize_mime() -> None:
assert normalize_mime("asdf") is None
assert normalize_mime("application/pdf") == "application/pdf"
assert normalize_mime("application/pdf+journal") == "application/pdf"
@@ -179,7 +179,7 @@ def test_normalize_mime():
assert normalize_mime("binary/octet-stream") == "application/octet-stream"
-def parse_cdx_line(raw_cdx: str, normalize=True) -> Optional[dict]:
+def parse_cdx_line(raw_cdx: str, normalize: bool = True) -> Optional[dict]:
"""
This method always filters a few things out:
@@ -241,7 +241,7 @@ def parse_cdx_datetime(dt_str: str) -> Optional[datetime.datetime]:
def test_parse_cdx_datetime() -> None:
assert parse_cdx_datetime("") is None
assert parse_cdx_datetime("asdf") is None
- assert parse_cdx_datetime("19930203123045") != None
+ assert parse_cdx_datetime("19930203123045") is not None
assert parse_cdx_datetime("20201028235103") == datetime.datetime(year=2020,
month=10,
day=28,
@@ -266,10 +266,10 @@ def test_datetime_to_cdx() -> None:
datetime.datetime(year=2020, month=10, day=28, hour=23, minute=51, second=3))
-def requests_retry_session(retries=10,
- backoff_factor=3,
- status_forcelist=(500, 502, 504),
- session=None) -> requests.Session:
+def requests_retry_session(retries: int = 10,
+ backoff_factor: int = 3,
+ status_forcelist: List[int] = [500, 502, 504],
+ session: requests.Session = None) -> requests.Session:
"""
From: https://www.peterbe.com/plog/best-practice-with-retries-with-requests
"""
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index ba875cd..7b18367 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,17 +1,19 @@
import time
+from typing import Any, Dict, Optional
import requests
+from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
- def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- def classify_pdf(self, blob, mode="auto"):
+ def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
"""
Returns a dict with at least:
@@ -24,7 +26,7 @@ class PdfTrioClient(object):
appropriately; an optional `error_msg` may also be set. For some other
errors, like connection failure, an exception is raised.
"""
- assert blob
+ assert blob and type(blob) == bytes
try:
pdftrio_response = requests.post(
@@ -68,12 +70,16 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
"""
This class is basically copied directly from GrobidWorker
"""
- def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
- super().__init__(wayback_client=wayback_client)
+ def __init__(self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs):
+ super().__init__(wayback_client=wayback_client, **kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
- def process(self, record, key=None):
+ def process(self, record: Any, key: str = None) -> Any:
start_process = time.time()
fetch_sec = None
@@ -103,16 +109,21 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
This is sort of like PdfTrioWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs):
- super().__init__()
+ def __init__(self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs):
+ super().__init__(**kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
self.mode = mode
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: str = None) -> Any:
start_process = time.time()
if not blob:
return None
+ assert isinstance(blob, bytes)
result = dict()
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']
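
A hedged sketch of how the annotated pdftrio classes fit together; the host URL is the QA default from the constructor, the PDF path is a placeholder, and a reachable pdftrio service is required for classification to succeed:

    from sandcrawler.pdftrio import PdfTrioBlobWorker, PdfTrioClient

    client = PdfTrioClient(host_url="http://pdftrio.qa.fatcat.wiki")
    worker = PdfTrioBlobWorker(client, sink=None, mode="auto")

    with open("paper.pdf", "rb") as f:        # placeholder local PDF path
        result = worker.process(f.read())
    if result is not None:
        # process() keys the result dict by the blob's sha1hex (via gen_file_metadata)
        print(result['key'])
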
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index f2318c2..617f2b4 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -105,7 +105,7 @@ def test_ingest_success(ingest_worker_pdf):
assert 'fatcat_release' in resp['grobid']
assert 'grobid_version' not in resp['grobid']['metadata']
assert 'fatcat_release' not in resp['grobid']['metadata']
- assert not 'tei_xml' in resp['grobid']
+ assert 'tei_xml' not in resp['grobid']
assert resp['pdf_meta']['status'] == "success"
assert resp['pdf_meta']['pdf_extra']['page_count'] == 1
assert resp['pdf_meta'].get('text') is None
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 7d3e755..5830dc9 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -87,7 +87,7 @@ def test_invalid_cdx():
print("bad datetime")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
- assert parse_cdx_line(raw) == None
+ assert parse_cdx_line(raw) is None
def test_clean_url():