diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 102 |
1 files changed, 65 insertions, 37 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 9d990bf..caa6e79 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -12,7 +12,7 @@ import time import urllib.parse from collections import namedtuple from http.client import IncompleteRead -from typing import Tuple +from typing import Any, Dict, List, Optional, Tuple, Union import requests import urllib3.exceptions @@ -80,19 +80,19 @@ CdxPartial = namedtuple('CdxPartial', [ ]) -def cdx_partial_from_row(full): +def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial: return CdxPartial( - surt=full.surt, - datetime=full.datetime, - url=full.url, - mimetype=full.mimetype, - status_code=full.status_code, - sha1b32=full.sha1b32, - sha1hex=full.sha1hex, + surt=row.surt, + datetime=row.datetime, + url=row.url, + mimetype=row.mimetype, + status_code=row.status_code, + sha1b32=row.sha1b32, + sha1hex=row.sha1hex, ) -def cdx_to_dict(cdx): +def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]: d = { "surt": cdx.surt, "datetime": cdx.datetime, @@ -109,7 +109,7 @@ def cdx_to_dict(cdx): return d -def fuzzy_match_url(left, right): +def fuzzy_match_url(left: str, right: str) -> bool: """ Matches URLs agnostic of http/https (and maybe other normalizations in the future) @@ -126,7 +126,7 @@ def fuzzy_match_url(left, right): return False -def test_fuzzy_match_url(): +def test_fuzzy_match_url() -> None: assert fuzzy_match_url("http://thing.com", "http://thing.com") is True assert fuzzy_match_url("http://thing.com", "https://thing.com") is True assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True @@ -146,7 +146,7 @@ class CdxApiError(Exception): class CdxApiClient: - def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs): + def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) cdx_auth_token = kwargs.get('cdx_auth_token', os.environ.get('CDX_AUTH_TOKEN')) @@ -158,7 +158,7 @@ class CdxApiClient: 'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token), }) - def _query_api(self, params): + def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]: """ Hits CDX API with a query, parses result into a list of CdxRow """ @@ -206,7 +206,11 @@ class CdxApiClient: rows.append(row) return rows - def fetch(self, url, datetime, filter_status_code=None, retry_sleep=None): + def fetch(self, + url: str, + datetime: str, + filter_status_code: Optional[int] = None, + retry_sleep: Optional[int] = None) -> CdxRow: """ Fetches a single CDX row by url/datetime. Raises a KeyError if not found, because we expect to be looking up a specific full record. @@ -214,7 +218,7 @@ class CdxApiClient: if len(datetime) != 14: raise ValueError( "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime)) - params = { + params: Dict[str, str] = { 'url': url, 'from': datetime, 'to': datetime, @@ -257,7 +261,11 @@ class CdxApiClient: assert row.status_code == filter_status_code return row - def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None): + def lookup_best(self, + url: str, + max_age_days: Optional[int] = None, + best_mimetype: Optional[str] = None, + closest: Union[datetime.datetime, str, None] = None) -> Optional[CdxRow]: """ Fetches multiple CDX rows for the given URL, tries to find the most recent. @@ -280,7 +288,7 @@ class CdxApiClient: most-recent """ - params = { + params: Dict[str, str] = { 'url': url, 'matchType': 'exact', 'limit': -25, @@ -340,7 +348,7 @@ class NoCaptureError(Exception): class WaybackClient: - def __init__(self, cdx_client=None, **kwargs): + def __init__(self, cdx_client: Optional[CdxApiClient] = None, **kwargs): if cdx_client: self.cdx_client = cdx_client else: @@ -361,7 +369,11 @@ class WaybackClient: 'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient', } - def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True): + def fetch_petabox(self, + csize: int, + offset: int, + warc_path: str, + resolve_revisit: bool = True) -> WarcResource: """ Fetches wayback resource directly from petabox using WARC path/offset/csize. @@ -391,6 +403,7 @@ class WaybackClient: if not self.rstore: self.rstore = ResourceStore( loaderfactory=CDXLoaderFactory3(webdata_secret=self.petabox_webdata_secret, )) + assert self.rstore try: #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr) gwb_record = self.rstore.load_resource(warc_uri, offset, csize) @@ -487,11 +500,11 @@ class WaybackClient: ) def fetch_petabox_body(self, - csize, - offset, - warc_path, - resolve_revisit=True, - expected_status_code=None): + csize: int, + offset: int, + warc_path: str, + resolve_revisit: bool = True, + expected_status_code: Optional[int] = None) -> WarcResource: """ Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -518,7 +531,10 @@ class WaybackClient: return resource.body - def fetch_replay_body(self, url, datetime, cdx_sha1hex=None): + def fetch_replay_body(self, + url: str, + datetime: str, + cdx_sha1hex: Optional[str] = None) -> bytes: """ Fetches an HTTP 200 record from wayback via the replay interface (web.archive.org) instead of petabox. @@ -579,7 +595,7 @@ class WaybackClient: cdx_sha1hex, file_meta['sha1hex']), ) return resp.content - def fetch_replay_redirect(self, url, datetime): + def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]: """ Fetches an HTTP 3xx redirect Location from wayback via the replay interface (web.archive.org) instead of petabox. @@ -633,7 +649,10 @@ class WaybackClient: else: return None - def lookup_resource(self, start_url, best_mimetype=None, closest=None): + def lookup_resource(self, + start_url: str, + best_mimetype: Optional[str] = None, + closest: Union[str, datetime.datetime, None] = None) -> ResourceResult: """ Looks in wayback for a resource starting at the URL, following any redirects. Returns a ResourceResult object, which may indicate a @@ -701,6 +720,7 @@ class WaybackClient: if cdx_row.status_code in (200, 226): revisit_cdx = None + final_cdx: Union[CdxRow, CdxPartial] = cdx_row if '/' in cdx_row.warc_path: resource = self.fetch_petabox( csize=cdx_row.warc_csize, @@ -714,7 +734,7 @@ class WaybackClient: url=cdx_row.url, datetime=cdx_row.datetime, ) - cdx_row = cdx_partial_from_row(cdx_row) + final_cdx = cdx_partial_from_row(cdx_row) return ResourceResult( start_url=start_url, hit=True, @@ -723,7 +743,7 @@ class WaybackClient: terminal_dt=cdx_row.datetime, terminal_status_code=cdx_row.status_code, body=body, - cdx=cdx_row, + cdx=final_cdx, revisit_cdx=revisit_cdx, ) elif 300 <= (cdx_row.status_code or 0) < 400: @@ -801,6 +821,7 @@ class WaybackClient: cdx=cdx_row, revisit_cdx=None, ) + return ResourceResult( start_url=start_url, hit=False, @@ -834,7 +855,7 @@ SavePageNowResult = namedtuple('SavePageNowResult', [ class SavePageNowClient: - def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs): + def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs): self.ia_access_key = kwargs.get('ia_access_key', os.environ.get('IA_ACCESS_KEY')) self.ia_secret_key = kwargs.get('ia_secret_key', os.environ.get('IA_SECRET_KEY')) self.v2endpoint = v2endpoint @@ -872,7 +893,10 @@ class SavePageNowClient: "://s3-eu-west-1.amazonaws.com/", ] - def save_url_now_v2(self, request_url, force_simple_get=None, capture_outlinks=0): + def save_url_now_v2(self, + request_url: str, + force_simple_get: Optional[int] = None, + capture_outlinks: int = 0): """ Returns a "SavePageNowResult" (namedtuple) if SPN request was processed at all, or raises an exception if there was an error with SPN itself. @@ -1006,7 +1030,10 @@ class SavePageNowClient: None, ) - def crawl_resource(self, start_url, wayback_client, force_simple_get=None): + def crawl_resource(self, + start_url: str, + wayback_client: WaybackClient, + force_simple_get: Optional[int] = None) -> ResourceResult: """ Runs a SPN2 crawl, then fetches body. @@ -1083,7 +1110,7 @@ class SavePageNowClient: revisit_cdx=None, ) - cdx_row = None + cdx_row: Optional[CdxRow] = None # hack to work around elsevier weirdness if "://pdf.sciencedirectassets.com/" in spn_result.request_url: elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best( @@ -1135,6 +1162,7 @@ class SavePageNowClient: #print(cdx_row, file=sys.stderr) revisit_cdx = None + final_cdx: Union[CdxRow, CdxPartial] = cdx_row if '/' in cdx_row.warc_path: # Usually can't do this kind of direct fetch because CDX result is recent/live resource = wayback_client.fetch_petabox( @@ -1166,7 +1194,7 @@ class SavePageNowClient: revisit_cdx=None, ) # warc_path etc will change, so strip them out - cdx_row = cdx_partial_from_row(cdx_row) + final_cdx = cdx_partial_from_row(cdx_row) assert cdx_row.status_code if cdx_row.status_code in (200, 226): @@ -1178,7 +1206,7 @@ class SavePageNowClient: terminal_dt=cdx_row.datetime, terminal_status_code=cdx_row.status_code, body=body, - cdx=cdx_row, + cdx=final_cdx, revisit_cdx=revisit_cdx, ) else: @@ -1190,7 +1218,7 @@ class SavePageNowClient: terminal_dt=cdx_row.datetime, terminal_status_code=cdx_row.status_code, body=body, - cdx=cdx_row, + cdx=final_cdx, revisit_cdx=revisit_cdx, ) |