Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/ia.py  102
1 file changed, 65 insertions(+), 37 deletions(-)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 9d990bf..caa6e79 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -12,7 +12,7 @@ import time
 import urllib.parse
 from collections import namedtuple
 from http.client import IncompleteRead
-from typing import Tuple
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import requests
 import urllib3.exceptions
@@ -80,19 +80,19 @@ CdxPartial = namedtuple('CdxPartial', [
 ])
 
 
-def cdx_partial_from_row(full):
+def cdx_partial_from_row(row: Union[CdxRow, CdxPartial]) -> CdxPartial:
     return CdxPartial(
-        surt=full.surt,
-        datetime=full.datetime,
-        url=full.url,
-        mimetype=full.mimetype,
-        status_code=full.status_code,
-        sha1b32=full.sha1b32,
-        sha1hex=full.sha1hex,
+        surt=row.surt,
+        datetime=row.datetime,
+        url=row.url,
+        mimetype=row.mimetype,
+        status_code=row.status_code,
+        sha1b32=row.sha1b32,
+        sha1hex=row.sha1hex,
     )
 
 
-def cdx_to_dict(cdx):
+def cdx_to_dict(cdx: Union[CdxRow, CdxPartial]) -> Dict[str, Any]:
     d = {
         "surt": cdx.surt,
         "datetime": cdx.datetime,
@@ -109,7 +109,7 @@ def cdx_to_dict(cdx):
     return d
 
 
-def fuzzy_match_url(left, right):
+def fuzzy_match_url(left: str, right: str) -> bool:
     """
     Matches URLs agnostic of http/https (and maybe other normalizations in the
     future)
@@ -126,7 +126,7 @@ def fuzzy_match_url(left, right):
     return False
 
 
-def test_fuzzy_match_url():
+def test_fuzzy_match_url() -> None:
     assert fuzzy_match_url("http://thing.com", "http://thing.com") is True
     assert fuzzy_match_url("http://thing.com", "https://thing.com") is True
     assert fuzzy_match_url("http://thing.com", "ftp://thing.com") is True
@@ -146,7 +146,7 @@ class CdxApiError(Exception):
 
 
 class CdxApiClient:
-    def __init__(self, host_url="https://web.archive.org/cdx/search/cdx", **kwargs):
+    def __init__(self, host_url: str = "https://web.archive.org/cdx/search/cdx", **kwargs):
         self.host_url = host_url
         self.http_session = requests_retry_session(retries=3, backoff_factor=3)
         cdx_auth_token = kwargs.get('cdx_auth_token', os.environ.get('CDX_AUTH_TOKEN'))
@@ -158,7 +158,7 @@ class CdxApiClient:
             'Cookie': 'cdx_auth_token={}'.format(cdx_auth_token),
         })
 
-    def _query_api(self, params):
+    def _query_api(self, params: Dict[str, str]) -> Optional[List[CdxRow]]:
         """
         Hits CDX API with a query, parses result into a list of CdxRow
         """
@@ -206,7 +206,11 @@ class CdxApiClient:
             rows.append(row)
         return rows
 
-    def fetch(self, url, datetime, filter_status_code=None, retry_sleep=None):
+    def fetch(self,
+              url: str,
+              datetime: str,
+              filter_status_code: Optional[int] = None,
+              retry_sleep: Optional[int] = None) -> CdxRow:
         """
         Fetches a single CDX row by url/datetime. Raises a KeyError if not
         found, because we expect to be looking up a specific full record.
@@ -214,7 +218,7 @@ class CdxApiClient:
         if len(datetime) != 14:
             raise ValueError(
                 "CDX fetch requires full 14 digit timestamp. Got: {}".format(datetime))
-        params = {
+        params: Dict[str, str] = {
             'url': url,
             'from': datetime,
             'to': datetime,
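The `fetch()` hunk above tightens the single-capture lookup: a full 14-digit timestamp is required, and the return type is now an explicit `CdxRow`. A minimal usage sketch, not part of the diff; the URL and timestamp are made-up, and a `CDX_AUTH_TOKEN` environment variable may be needed for filtered queries:

```python
# Hypothetical usage of the newly annotated CdxApiClient.fetch().
# Assumes the sandcrawler package is importable; URL/timestamp are examples.
from sandcrawler.ia import CdxApiClient, CdxRow

client = CdxApiClient()

# datetime must be a full 14-digit wayback timestamp, else ValueError
row: CdxRow = client.fetch(
    url="https://example.com/paper.pdf",
    datetime="20200102030405",
    filter_status_code=200,  # optional: assert the capture's status code
)
print(row.sha1hex, row.mimetype, row.warc_path)
```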
Got: {}".format(datetime)) -        params = { +        params: Dict[str, str] = {              'url': url,              'from': datetime,              'to': datetime, @@ -257,7 +261,11 @@ class CdxApiClient:              assert row.status_code == filter_status_code          return row -    def lookup_best(self, url, max_age_days=None, best_mimetype=None, closest=None): +    def lookup_best(self, +                    url: str, +                    max_age_days: Optional[int] = None, +                    best_mimetype: Optional[str] = None, +                    closest: Union[datetime.datetime, str, None] = None) -> Optional[CdxRow]:          """          Fetches multiple CDX rows for the given URL, tries to find the most recent. @@ -280,7 +288,7 @@ class CdxApiClient:                  most-recent          """ -        params = { +        params: Dict[str, str] = {              'url': url,              'matchType': 'exact',              'limit': -25, @@ -340,7 +348,7 @@ class NoCaptureError(Exception):  class WaybackClient: -    def __init__(self, cdx_client=None, **kwargs): +    def __init__(self, cdx_client: Optional[CdxApiClient] = None, **kwargs):          if cdx_client:              self.cdx_client = cdx_client          else: @@ -361,7 +369,11 @@ class WaybackClient:              'User-Agent': 'Mozilla/5.0 sandcrawler.WaybackClient',          } -    def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True): +    def fetch_petabox(self, +                      csize: int, +                      offset: int, +                      warc_path: str, +                      resolve_revisit: bool = True) -> WarcResource:          """          Fetches wayback resource directly from petabox using WARC path/offset/csize. @@ -391,6 +403,7 @@ class WaybackClient:          if not self.rstore:              self.rstore = ResourceStore(                  loaderfactory=CDXLoaderFactory3(webdata_secret=self.petabox_webdata_secret, )) +        assert self.rstore          try:              #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)              gwb_record = self.rstore.load_resource(warc_uri, offset, csize) @@ -487,11 +500,11 @@ class WaybackClient:          )      def fetch_petabox_body(self, -                           csize, -                           offset, -                           warc_path, -                           resolve_revisit=True, -                           expected_status_code=None): +                           csize: int, +                           offset: int, +                           warc_path: str, +                           resolve_revisit: bool = True, +                           expected_status_code: Optional[int] = None) -> WarcResource:          """          Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize. @@ -518,7 +531,10 @@ class WaybackClient:          return resource.body -    def fetch_replay_body(self, url, datetime, cdx_sha1hex=None): +    def fetch_replay_body(self, +                          url: str, +                          datetime: str, +                          cdx_sha1hex: Optional[str] = None) -> bytes:          """          Fetches an HTTP 200 record from wayback via the replay interface          (web.archive.org) instead of petabox. 
@@ -579,7 +595,7 @@ class WaybackClient:
                         cdx_sha1hex, file_meta['sha1hex']), )
         return resp.content
 
-    def fetch_replay_redirect(self, url, datetime):
+    def fetch_replay_redirect(self, url: str, datetime: str) -> Optional[str]:
         """
         Fetches an HTTP 3xx redirect Location from wayback via the replay interface
         (web.archive.org) instead of petabox.
@@ -633,7 +649,10 @@ class WaybackClient:
         else:
             return None
 
-    def lookup_resource(self, start_url, best_mimetype=None, closest=None):
+    def lookup_resource(self,
+                        start_url: str,
+                        best_mimetype: Optional[str] = None,
+                        closest: Union[str, datetime.datetime, None] = None) -> ResourceResult:
         """
         Looks in wayback for a resource starting at the URL, following any
         redirects. Returns a ResourceResult object, which may indicate a
@@ -701,6 +720,7 @@ class WaybackClient:
 
             if cdx_row.status_code in (200, 226):
                 revisit_cdx = None
+                final_cdx: Union[CdxRow, CdxPartial] = cdx_row
                 if '/' in cdx_row.warc_path:
                     resource = self.fetch_petabox(
                         csize=cdx_row.warc_csize,
@@ -714,7 +734,7 @@ class WaybackClient:
                         url=cdx_row.url,
                         datetime=cdx_row.datetime,
                     )
-                    cdx_row = cdx_partial_from_row(cdx_row)
+                    final_cdx = cdx_partial_from_row(cdx_row)
                 return ResourceResult(
                     start_url=start_url,
                     hit=True,
@@ -723,7 +743,7 @@ class WaybackClient:
                     terminal_dt=cdx_row.datetime,
                     terminal_status_code=cdx_row.status_code,
                     body=body,
-                    cdx=cdx_row,
+                    cdx=final_cdx,
                     revisit_cdx=revisit_cdx,
                 )
             elif 300 <= (cdx_row.status_code or 0) < 400:
@@ -801,6 +821,7 @@ class WaybackClient:
                     cdx=cdx_row,
                     revisit_cdx=None,
                 )
+
         return ResourceResult(
             start_url=start_url,
             hit=False,
@@ -834,7 +855,7 @@ SavePageNowResult = namedtuple('SavePageNowResult', [
 
 
 class SavePageNowClient:
-    def __init__(self, v2endpoint="https://web.archive.org/save", **kwargs):
+    def __init__(self, v2endpoint: str = "https://web.archive.org/save", **kwargs):
         self.ia_access_key = kwargs.get('ia_access_key', os.environ.get('IA_ACCESS_KEY'))
         self.ia_secret_key = kwargs.get('ia_secret_key', os.environ.get('IA_SECRET_KEY'))
         self.v2endpoint = v2endpoint
@@ -872,7 +893,10 @@ class SavePageNowClient:
             "://s3-eu-west-1.amazonaws.com/",
         ]
 
-    def save_url_now_v2(self, request_url, force_simple_get=None, capture_outlinks=0):
+    def save_url_now_v2(self,
+                        request_url: str,
+                        force_simple_get: Optional[int] = None,
+                        capture_outlinks: int = 0):
         """
         Returns a "SavePageNowResult" (namedtuple) if SPN request was processed
         at all, or raises an exception if there was an error with SPN itself.
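The `final_cdx` variable introduced in the `lookup_resource()` hunks above (and again in `crawl_resource()` below) looks like a type-checker-driven change: re-assigning `cdx_row` to the output of `cdx_partial_from_row()` would change its type from `CdxRow` to `CdxPartial` mid-function, which mypy rejects under a single inferred type. A separate name annotated with the `Union` keeps both branches well-typed. A reduced illustration, with a hypothetical helper name:

```python
# Sketch only: terminal_cdx() is a made-up helper, not part of the commit.
from typing import Union
from sandcrawler.ia import CdxPartial, CdxRow, cdx_partial_from_row

def terminal_cdx(cdx_row: CdxRow) -> Union[CdxRow, CdxPartial]:
    # Rebinding cdx_row itself to a CdxPartial would make its type unstable;
    # a separately annotated name accepts either shape.
    final_cdx: Union[CdxRow, CdxPartial] = cdx_row
    if '/' not in cdx_row.warc_path:
        # replay fetch: warc_path etc. will change, so strip them out
        final_cdx = cdx_partial_from_row(cdx_row)
    return final_cdx
```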
@@ -1006,7 +1030,10 @@ class SavePageNowClient:
                 None,
             )
 
-    def crawl_resource(self, start_url, wayback_client, force_simple_get=None):
+    def crawl_resource(self,
+                       start_url: str,
+                       wayback_client: WaybackClient,
+                       force_simple_get: Optional[int] = None) -> ResourceResult:
         """
         Runs a SPN2 crawl, then fetches body.
@@ -1083,7 +1110,7 @@ class SavePageNowClient:
                 revisit_cdx=None,
             )
 
-        cdx_row = None
+        cdx_row: Optional[CdxRow] = None
         # hack to work around elsevier weirdness
         if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
             elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
@@ -1135,6 +1162,7 @@ class SavePageNowClient:
         #print(cdx_row, file=sys.stderr)
 
         revisit_cdx = None
+        final_cdx: Union[CdxRow, CdxPartial] = cdx_row
         if '/' in cdx_row.warc_path:
             # Usually can't do this kind of direct fetch because CDX result is recent/live
             resource = wayback_client.fetch_petabox(
@@ -1166,7 +1194,7 @@ class SavePageNowClient:
                     revisit_cdx=None,
                 )
             # warc_path etc will change, so strip them out
-            cdx_row = cdx_partial_from_row(cdx_row)
+            final_cdx = cdx_partial_from_row(cdx_row)
 
         assert cdx_row.status_code
         if cdx_row.status_code in (200, 226):
@@ -1178,7 +1206,7 @@ class SavePageNowClient:
                 terminal_dt=cdx_row.datetime,
                 terminal_status_code=cdx_row.status_code,
                 body=body,
-                cdx=cdx_row,
+                cdx=final_cdx,
                 revisit_cdx=revisit_cdx,
             )
         else:
@@ -1190,7 +1218,7 @@ class SavePageNowClient:
                 terminal_dt=cdx_row.datetime,
                 terminal_status_code=cdx_row.status_code,
                 body=body,
-                cdx=cdx_row,
+                cdx=final_cdx,
                 revisit_cdx=revisit_cdx,
             )
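End to end, the annotated `crawl_resource()` runs an SPN2 capture and then fetches the body via petabox or the replay interface, returning a `ResourceResult` whose `cdx` field may now be either a full `CdxRow` or a stripped `CdxPartial`. A hedged driver sketch (the URL is invented; `IA_ACCESS_KEY`/`IA_SECRET_KEY` are assumed to be set in the environment):

```python
# Hypothetical end-to-end driver; assumes IA SPN2 credentials and network
# access to web.archive.org. Not part of the commit.
from sandcrawler.ia import SavePageNowClient, WaybackClient

spn_client = SavePageNowClient()
wayback_client = WaybackClient()

result = spn_client.crawl_resource(
    start_url="https://example.com/paper.pdf",
    wayback_client=wayback_client,
)
if result.hit:
    # result.cdx is a CdxRow when fetched directly from petabox, or a
    # CdxPartial when the body came via the replay interface
    print(result.terminal_status_code, len(result.body), type(result.cdx))
```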
