From 648f04bfdcf441ce4a396d09bdd0443b2a2ca51e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 14 Jan 2020 15:30:42 -0800
Subject: basic FTP ingest support; revisit record resolution

- supporting revisit records means more wayback hits (fewer crawls) => faster
- ... but this is only partial support; will also need to work through the
  sandcrawler db schema, etc. Current status should be safe to merge/use.
- ftp support via treating an ftp hit (CDX status "-") as a success: mapped
  to 226 and then handled like a 200
---
 python/sandcrawler/ia.py     | 111 ++++++++++++++++++++++++++++++-------------
 python/sandcrawler/ingest.py |   2 +-
 2 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 4b4875d..02e71be 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -115,12 +115,21 @@ class CdxApiClient:
         for raw in rj[1:]:
             assert len(raw) == 11    # JSON is short
             #print(raw, file=sys.stderr)
+
+            # transform "-" ftp status code to a 226
+            status_code = None
+            if raw[4] == "-":
+                if raw[2].startswith("ftp://"):
+                    status_code = 226
+            else:
+                status_code = int(raw[4])
+
             row = CdxRow(
                 surt=raw[0],
                 datetime=raw[1],
                 url=raw[2],
                 mimetype=raw[3],
-                status_code=int(raw[4]),
+                status_code=status_code,
                 sha1b32=raw[5],
                 sha1hex=b32_hex(raw[5]),
                 warc_csize=int(raw[8]),
@@ -171,6 +180,23 @@ class CdxApiClient:
         Fetches multiple CDX rows for the given URL, tries to find the most recent.
 
         If no matching row is found, return None. Note this is different from fetch.
+
+        Preference order by status code looks like:
+
+            200 or 226
+                mimetype match
+                    not-liveweb
+                        most-recent
+                no match
+                    not-liveweb
+                        most-recent
+            3xx
+                most-recent
+            4xx
+                most-recent
+            5xx
+                most-recent
+
         """
         params = {
             'url': url,
@@ -180,7 +206,9 @@ class CdxApiClient:
             # Collapsing seems efficient, but is complex; would need to include
             # other filters and status code in filter
             #'collapse': 'timestamp:6',
+
+            # Revisits now allowed and resolved!
-            'filter': '!mimetype:warc/revisit',
+            #'filter': '!mimetype:warc/revisit',
         }
         if max_age_days:
             since = datetime.date.today() - datetime.timedelta(days=max_age_days)
@@ -189,35 +217,22 @@ class CdxApiClient:
         if not rows:
             return None
 
-        def cdx_sort_key(r):
+        def _cdx_sort_key(r):
             """
-            Preference order by status code looks like:
-
-            200
-              mimetype match
-                not-liveweb
-                  most-recent
-              no match
-                not-liveweb
-                  most-recent
-            3xx
-              most-recent
-            4xx
-              most-recent
-            5xx
-              most-recent
-
-            This function will create a tuple that can be used to sort in *reverse* order.
+            This is a function, not a lambda, because it captures
+            best_mimetype. Will create a tuple that can be used to sort in
+            *reverse* order.
             """
             return (
-                int(r.status_code == 200),
+                int(r.status_code in (200, 226)),
                 int(0 - r.status_code),
                 int(r.mimetype == best_mimetype),
+                int(r.mimetype != "warc/revisit"),
                 int('/' in r.warc_path),
                 int(r.datetime),
             )
 
-        rows = sorted(rows, key=cdx_sort_key)
+        rows = sorted(rows, key=_cdx_sort_key)
         return rows[-1]
 
@@ -247,7 +262,7 @@ class WaybackClient:
         self.max_redirects = 25
         self.wayback_endpoint = "https://web.archive.org/web/"
 
-    def fetch_petabox(self, csize, offset, warc_path):
+    def fetch_petabox(self, csize, offset, warc_path, resolve_revisit=True):
         """
         Fetches wayback resource directly from petabox using WARC path/offset/csize.
@@ -262,6 +277,10 @@ class WaybackClient:
         - location: eg, for redirects
         - body: raw bytes
 
+        resolve_revisit does what it sounds like: tries to follow a revisit
+        record by looking it up in the CDX API and doing another fetch. Refuses
+        to recurse more than one hop (eg, won't follow a chain of revisits).
+
         Requires (and uses) a secret token.
         """
         if not self.petabox_webdata_secret:
@@ -292,20 +311,40 @@ class WaybackClient:
         status_code = gwb_record.get_status()[0]
         location = gwb_record.get_location() or None
 
+        if status_code is None and gwb_record.target_uri.startswith(b"ftp://"):
+            # TODO: some additional verification here?
+            status_code = 226
+
         body = None
-        if status_code == 200:
-            try:
-                body = gwb_record.open_raw_content().read()
-            except IncompleteRead as ire:
-                raise WaybackError(
-                    "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+        if status_code in (200, 226):
+            if gwb_record.is_revisit():
+                if not resolve_revisit:
+                    raise WaybackError(
+                        "found revisit record, but won't resolve (loop?)")
+                revisit_uri, revisit_dt = gwb_record.refers_to
+                # convert revisit_dt
+                assert len(revisit_dt) == len("2018-07-24T11:56:49")
+                revisit_uri = revisit_uri.decode('utf-8')
+                revisit_dt = revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '')
+                revisit_cdx = self.cdx_client.fetch(revisit_uri, revisit_dt)
+                body = self.fetch_petabox_body(
+                    csize=revisit_cdx.warc_csize,
+                    offset=revisit_cdx.warc_offset,
+                    warc_path=revisit_cdx.warc_path,
+                    resolve_revisit=False,
+                )
+            else:
+                try:
+                    body = gwb_record.open_raw_content().read()
+                except IncompleteRead as ire:
+                    raise WaybackError(
+                        "failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
         return WarcResource(
             status_code=status_code,
             location=location,
             body=body,
         )
 
-    def fetch_petabox_body(self, csize, offset, warc_path):
+    def fetch_petabox_body(self, csize, offset, warc_path, resolve_revisit=True):
         """
         Fetches HTTP 200 WARC resource directly from petabox using WARC path/offset/csize.
@@ -317,11 +356,12 @@ class WaybackClient:
             csize=csize,
             offset=offset,
             warc_path=warc_path,
+            resolve_revisit=resolve_revisit,
         )
 
-        if resource.status_code != 200:
+        if resource.status_code not in (200, 226):
             raise KeyError("archived HTTP response (WARC) was not 200: {}".format(
-                gwb_record.get_status()[0]),
+                resource.status_code)
             )
 
         return resource.body
@@ -463,7 +503,7 @@ class WaybackClient:
                     body=None,
                     cdx=None,
                 )
-            if cdx_row.status_code == 200:
+            if cdx_row.status_code in (200, 226):
                 if '/' in cdx_row.warc_path:
                     body = self.fetch_petabox_body(
                         csize=cdx_row.warc_csize,
@@ -724,10 +764,13 @@ class SavePageNowClient:
         if not cdx_row:
             # lookup exact
             try:
+                filter_status_code = 200
+                if spn_result.terminal_url.startswith("ftp://"):
+                    filter_status_code = 226
                 cdx_row = wayback_client.cdx_client.fetch(
                     url=spn_result.terminal_url,
                     datetime=spn_result.terminal_dt,
-                    filter_status_code=200,
+                    filter_status_code=filter_status_code,
                     retry_sleep=10.0,
                 )
             except KeyError as ke:
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 8c77d65..11b8a4c 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -242,7 +242,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
         assert resource.hit == True
-        assert resource.terminal_status_code == 200
+        assert resource.terminal_status_code in (200, 226)
 
         result['file_meta'] = file_meta
         result['cdx'] = cdx_to_dict(resource.cdx)
-- 
cgit v1.2.3
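
A note on the CDX status handling in the first hunk: the CDX API reports "-"
rather than an HTTP status code for FTP captures, and the patch maps that to
FTP status 226 ("transfer complete"), which the rest of the pipeline then
treats like an HTTP 200. A minimal standalone sketch of that mapping, with a
hypothetical helper name (the patch does this inline while building CdxRow):

    def normalize_cdx_status(url, raw_status):
        # CDX uses "-" when there is no HTTP status; for ftp:// captures,
        # treat that as FTP 226 ("transfer complete"); otherwise unknown
        if raw_status == "-":
            if url.startswith("ftp://"):
                return 226
            return None
        return int(raw_status)

    assert normalize_cdx_status("ftp://ftp.example.org/pub/paper.pdf", "-") == 226
    assert normalize_cdx_status("http://example.org/paper.pdf", "200") == 200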
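
The new _cdx_sort_key() encodes the preference order from the lookup_best()
docstring as a tuple and sorts in *reverse*, so rows[-1] is the best capture.
A runnable sketch under the assumption of a reduced CdxRow carrying only the
fields the key touches (field names match the patch; the example rows are
made up):

    from collections import namedtuple

    CdxRow = namedtuple('CdxRow', ['url', 'datetime', 'status_code', 'mimetype', 'warc_path'])

    def pick_best(rows, best_mimetype='application/pdf'):
        def _cdx_sort_key(r):
            # larger tuples sort later, so rows[-1] wins: success status
            # (200/226) first, then lower status codes, then mimetype match,
            # then non-revisit records, then petabox WARC paths (which
            # contain '/'), then most-recent capture datetime
            return (
                int(r.status_code in (200, 226)),
                int(0 - r.status_code),
                int(r.mimetype == best_mimetype),
                int(r.mimetype != "warc/revisit"),
                int('/' in r.warc_path),
                int(r.datetime),
            )
        return sorted(rows, key=_cdx_sort_key)[-1]

    rows = [
        CdxRow("http://example.com/paper.pdf", "20190101000000", 302, "text/html", "liveweb"),
        CdxRow("http://example.com/paper.pdf", "20190601000000", 200, "warc/revisit", "X/Y.warc.gz"),
        CdxRow("http://example.com/paper.pdf", "20190301000000", 200, "application/pdf", "X/Y.warc.gz"),
    ]
    assert pick_best(rows).mimetype == "application/pdf"

Note the revisit row loses to the direct 200 capture even though it is more
recent; a revisit only wins when no better capture exists, and is then
resolved by fetch_petabox().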
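
Revisit resolution in fetch_petabox() converts the record's refers_to
datetime (bytes like b"2018-07-24T11:56:49") into the 14-digit CDX timestamp
form, looks that capture up via the CDX API, and re-fetches the body with
resolve_revisit=False so a chain of revisits can't recurse past one hop. A
sketch of just the timestamp conversion and the one-hop guard; the record
structure below is a stand-in dict for illustration, not the real GWB record
API:

    def revisit_dt_to_cdx(revisit_dt: bytes) -> str:
        # b"2018-07-24T11:56:49" -> "20180724115649"
        assert len(revisit_dt) == len("2018-07-24T11:56:49")
        return revisit_dt.decode('utf-8').replace('-', '').replace(':', '').replace('T', '')

    def fetch(record, resolve_revisit=True):
        # stand-in for fetch_petabox(): follow at most one revisit hop
        if record.get('refers_to') is not None:
            if not resolve_revisit:
                raise RuntimeError("found revisit record, but won't resolve (loop?)")
            return fetch(record['refers_to'], resolve_revisit=False)
        return record['body']

    assert revisit_dt_to_cdx(b"2018-07-24T11:56:49") == "20180724115649"
    original = {'refers_to': None, 'body': b"%PDF-1.4 ..."}
    assert fetch({'refers_to': original, 'body': None}) == b"%PDF-1.4 ..."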