From 38e635105a658850399847aa23a5bd5325b0d616 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 9 Jan 2020 15:37:42 -0800 Subject: lots of progress on wayback refactoring - too much to list - canonical flags to control crawling - cdx_to_dict helper --- python/sandcrawler/ingest.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'python/sandcrawler/ingest.py') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index f085fc5..4b6c587 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -6,7 +6,7 @@ import requests from http.server import BaseHTTPRequestHandler, HTTPServer from collections import namedtuple -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError +from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict from sandcrawler.grobid import GrobidClient from sandcrawler.misc import gen_file_metadata from sandcrawler.html import extract_fulltext_url @@ -54,7 +54,9 @@ class IngestFileWorker(SandcrawlerWorker): if not self.grobid_client: self.grobid_client = GrobidClient() - self.attempt_existing = False + self.try_existing_ingest = False + self.try_wayback = True + self.try_spn2 = True def check_existing_ingest(self, base_url): """ @@ -66,7 +68,7 @@ class IngestFileWorker(SandcrawlerWorker): Looks at existing ingest results and makes a decision based on, eg, status and timestamp. """ - if not self.attempt_existing: + if not self.try_existing_ingest: return None raise NotImplementedError @@ -78,10 +80,16 @@ class IngestFileWorker(SandcrawlerWorker): Looks in wayback for a resource starting at the URL, following any redirects. If a hit isn't found, try crawling with SPN. """ - resource = self.wayback_client.lookup_resource(url, best_mimetype) - if not resource or not resource.hit: + via = "none" + resource = None + if self.try_wayback: + via = "wayback" + resource = self.wayback_client.lookup_resource(url, best_mimetype) + if self.try_spn2 and (not resource or not resource.hit): + via = "spn2" resource = self.spn_client.crawl_resource(url, self.wayback_client) - print("[FETCH] {} url:{}".format( + print("[FETCH {}\t] {}\turl:{}".format( + via, resource.status, url), file=sys.stderr) @@ -146,11 +154,6 @@ class IngestFileWorker(SandcrawlerWorker): try: # first hop resource = self.find_resource(base_url, best_mimetype) - print("[{}] fetch status: {} url: {}".format( - resource.status, - ingest_type, - base_url), - file=sys.stderr) if not resource.hit: result['status'] = resource.status return result @@ -218,6 +221,7 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = "success" result['hit'] = True + result['cdx'] = cdx_to_dict(resource.cdx) return result -- cgit v1.2.3