From 2866ba252389ac9f3c595e7e7b6c9b4f6cf64663 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 10 Jan 2020 16:07:52 -0800
Subject: more general ingest tweaks and affordances

---
 python/sandcrawler/ia.py     | 23 ++++++++++++++++++-----
 python/sandcrawler/ingest.py | 34 ++++++++++++++++++++++++----------
 2 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 27bbc34..cbf901a 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -266,9 +266,8 @@ class WaybackClient:
         """
         if not self.petabox_webdata_secret:
             raise Exception("WaybackClient needs petabox secret to do direct WARC fetches")
-        # TODO:
-        #if not "/" in warc_path:
-        #    raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
+        if not "/" in warc_path:
+            raise ValueError("what looks like a liveweb/SPN temporary warc path: {}".format(warc_path))
         warc_uri = self.warc_uri_prefix + warc_path
         if not self.rstore:
             self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
@@ -278,6 +277,7 @@ class WaybackClient:
             #print("offset: {} csize: {} uri: {}".format(offset, csize, warc_uri), file=sys.stderr)
             gwb_record = self.rstore.load_resource(warc_uri, offset, csize)
         except wayback.exception.ResourceUnavailable:
+            print("Failed to fetch from warc_path:{}".format(warc_path), file=sys.stderr)
             raise PetaboxError("failed to load file contents from wayback/petabox (ResourceUnavailable)")
         except ValueError as ve:
             raise PetaboxError("failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
@@ -347,8 +347,10 @@ class WaybackClient:
         assert datetime.isdigit()
 
         try:
-            # TODO: don't follow redirects?
-            resp = requests.get(self.wayback_endpoint + datetime + "id_/" + url)
+            resp = requests.get(
+                self.wayback_endpoint + datetime + "id_/" + url,
+                allow_redirects=False,
+            )
         except requests.exceptions.TooManyRedirects:
             raise WaybackError("redirect loop (wayback replay fetch)")
         try:
@@ -543,6 +545,7 @@ class SavePageNowClient:
             data={
                 'url': request_url,
                 'capture_all': 1,
+                'capture_screenshot': 0,
                 'if_not_archived_within': '1d',
             },
         )
@@ -619,6 +622,16 @@ class SavePageNowClient:
 
         spn_result = self.save_url_now_v2(start_url)
         if not spn_result.success:
+            status = spn_result.status
+            if status in ("error:invalid-url", "error:not-found",
+                          "error:invalid-host-resolution", "error:gateway-timeout"):
+                status = status.replace("error:", "")
+            elif status == "error:no-access":
+                status = "forbidden"
+            elif status == "error:user-session-limit":
+                raise Exception("SPNv2 user-session-limit, need to backoff")
+            elif status.startswith("error:"):
+                status = "spn2-" + status
             return ResourceResult(
                 start_url=start_url,
                 hit=False,
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5dde205..f618f1b 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -85,13 +85,20 @@ class IngestFileWorker(SandcrawlerWorker):
         if self.try_wayback:
             via = "wayback"
             resource = self.wayback_client.lookup_resource(url, best_mimetype)
-        if self.try_spn2 and (not resource or not resource.hit):
+
+        # check for "soft 404" conditions, where we should retry with live SPNv2
+        # TODO: could refactor these into the resource fetch things themselves?
+        soft404 = False
+        if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
+            soft404 = True
+
+        if self.try_spn2 and (not resource or not resource.hit or soft404):
             via = "spn2"
             resource = self.spn_client.crawl_resource(url, self.wayback_client)
-        print("[FETCH {}\t] {}\turl:{}".format(
+        print("[FETCH {}\t] {}\t{}".format(
             via,
             resource.status,
-            url),
+            resource.terminal_url or url),
             file=sys.stderr)
         return resource
 
@@ -141,10 +148,14 @@ class IngestFileWorker(SandcrawlerWorker):
             reqeust['ingest_type'] = 'pdf'
 
         # for now, only pdf ingest is implemented
+        if not 'ingest_type' in request:
+            request['ingest_type'] = "pdf"
         assert request.get('ingest_type') == "pdf"
         ingest_type = request.get('ingest_type')
         base_url = request['base_url']
 
+        print("[INGEST {}\t] {}".format(ingest_type, base_url), file=sys.stderr)
+
         best_mimetype = None
         if ingest_type == "pdf":
             best_mimetype = "application/pdf"
@@ -157,7 +168,7 @@ class IngestFileWorker(SandcrawlerWorker):
 
         next_url = base_url
         hops = [base_url]
-        self.max_hops = 4
+        self.max_hops = 6
 
         while len(hops) <= self.max_hops:
 
@@ -166,7 +177,7 @@ class IngestFileWorker(SandcrawlerWorker):
             try:
                 resource = self.find_resource(next_url, best_mimetype)
             except SavePageNowError as e:
-                result['status'] = 'spn-error'
+                result['status'] = 'spn2-error'
                 result['error_message'] = str(e)
                 return result
             except PetaboxError as e:
@@ -187,16 +198,14 @@ class IngestFileWorker(SandcrawlerWorker):
                 return result
 
             file_meta = gen_file_metadata(resource.body)
-            if "html" in file_meta['mimetype']:
-
-                # got landing page or similar
+            if "html" in file_meta['mimetype'] or "xml" in file_meta['mimetype']:
+                # Got landing page or similar. Some XHTML detected as "application/xml"
                 if resource.terminal_dt:
                     result['terminal'] = {
                         "terminal_url": resource.terminal_url,
                         "terminal_dt": resource.terminal_dt,
                         "terminal_status_code": resource.terminal_status_code,
                     }
-
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
                 result['html'] = fulltext_url
 
@@ -205,7 +214,7 @@ class IngestFileWorker(SandcrawlerWorker):
                     return result
                 next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
                 assert next_url
-                print("\tnext hop extracted ({}): {}".format(
+                print("[EXTRACT\t] {}\t{}".format(
                     fulltext_url.get('technique'),
                     next_url,
                 ),
                 file=sys.stderr)
@@ -252,6 +261,11 @@ class IngestFileWorker(SandcrawlerWorker):
 
         result['status'] = "success"
         result['hit'] = True
+        print("[SUCCESS\t] sha1:{} grobid:{}".format(
+            result.get('file_meta', {}).get('sha1hex'),
+            result.get('grobid', {}).get('status_code'),
+        ),
+            file=sys.stderr)
         return result
--
cgit v1.2.3
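
For reference, the SPNv2 error handling added in the crawl_resource() hunk above boils
down to a small mapping from raw SPN "error:*" statuses to ingest-result statuses. A
minimal sketch follows; the helper name simplify_spn2_status is illustrative only (the
commit inlines this logic rather than defining a function), but the status strings are
exactly the ones listed in the diff.

    def simplify_spn2_status(status):
        """Collapse raw SPNv2 'error:*' statuses into ingest-result statuses,
        mirroring the branch added to SavePageNowClient.crawl_resource()."""
        if status in ("error:invalid-url", "error:not-found",
                      "error:invalid-host-resolution", "error:gateway-timeout"):
            # well-known errors: strip the prefix and report as-is
            return status.replace("error:", "")
        elif status == "error:no-access":
            return "forbidden"
        elif status == "error:user-session-limit":
            # not a per-URL outcome; the worker should back off instead
            raise Exception("SPNv2 user-session-limit, need to backoff")
        elif status.startswith("error:"):
            # anything else gets a generic spn2- prefix
            return "spn2-" + status
        return status

    # examples:
    #   simplify_spn2_status("error:no-access")   -> "forbidden"
    #   simplify_spn2_status("error:not-found")   -> "not-found"
    #   simplify_spn2_status("error:proxy-error") -> "spn2-error:proxy-error"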
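Likewise, the new "soft 404" retry in IngestFileWorker.find_resource() hinges on a single
predicate over the wayback lookup result. Sketched here with a hypothetical helper name
(is_cookie_wall is not in the commit), assuming a resource object with hit and
terminal_url attributes as used in the diff; a None check on terminal_url is added here
for safety beyond the diff's condition.

    def is_cookie_wall(resource):
        """True when a wayback capture "hit" actually landed on a '/cookieAbsent'
        cookie-wall page, meaning the URL should be re-crawled live via SPNv2."""
        return bool(
            resource
            and resource.hit
            and resource.terminal_url
            and resource.terminal_url.endswith('/cookieAbsent')
        )

With that predicate, the fetch decision reads: try wayback first; if there is no hit, or
the hit is just a cookie wall, fall through to spn_client.crawl_resource().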