From 6e1b28166db996492736d22cfeba564156ce74fe Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 7 Jan 2020 16:52:35 -0800
Subject: remove SPNv1 code paths

---
 python/sandcrawler/ia.py     | 36 +----------------------------
 python/sandcrawler/ingest.py | 54 ++++++++++++++++++++------------------------
 2 files changed, 25 insertions(+), 65 deletions(-)

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e945474..f4e4aae 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -23,7 +23,7 @@ class CdxApiClient:
         self.host_url = host_url
         self.http_session = requests_retry_session(retries=3, backoff_factor=3)
         self.http_session.headers.update({
-            'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
+            'User-Agent': 'Mozilla/5.0 sandcrawler.CdxApiClient',
         })
         self.wayback_endpoint = "https://web.archive.org/web/"
 
@@ -150,12 +150,7 @@ class SavePageNowClient:
         self.cdx_client = CdxApiClient()
         self.ia_access_key = os.environ.get('IA_ACCESS_KEY')
         self.ia_secret_key = os.environ.get('IA_SECRET_KEY')
-        self.v1endpoint = v1endpoint
         self.v2endpoint = v2endpoint
-        self.v1_session = requests_retry_session(retries=5, backoff_factor=3, status_forcelist=())
-        self.v1_session.headers.update({
-            'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
-        })
         self.v2_session = requests_retry_session(retries=5, backoff_factor=3)
         self.v2_session.headers.update({
             'User-Agent': 'Mozilla/5.0 sandcrawler.SavePageNowClient',
@@ -163,35 +158,6 @@ class SavePageNowClient:
             'Authorization': 'LOW {}:{}'.format(self.ia_access_key, self.ia_secret_key),
         })
 
-    def save_url_now_v1(self, url):
-        """
-        Returns a tuple (cdx, blob) on success of single fetch, or raises an
-        error on non-success.
-        """
-        try:
-            resp = self.v1_session.get(self.v1endpoint + url)
-        except requests.exceptions.RetryError as re:
-            # could have been any number of issues...
-            raise SavePageNowError(str(re))
-        except requests.exceptions.TooManyRedirects as tmr:
-            raise SavePageNowRemoteError(str(tmr))
-
-        if resp.status_code != 200 and resp.headers.get('X-Archive-Wayback-Runtime-Error'):
-            # looks like a weird remote error; would not expect a CDX reply so bailing here
-            raise SavePageNowRemoteError(resp.headers['X-Archive-Wayback-Runtime-Error'])
-        if resp.status_code != 200 and not resp.headers.get('X-Archive-Orig-Location'):
-            # looks like an error which was *not* just a remote server HTTP
-            # status code, or one of the handled wayback runtime errors. Some
-            # of these are remote server errors that wayback doesn't detect?
-            raise SavePageNowError("HTTP status: {}, url: {}".format(resp.status_code, url))
-
-        terminal_url = '/'.join(resp.url.split('/')[5:])
-        body = resp.content
-        cdx = self.cdx_client.lookup_latest(terminal_url)
-        if not cdx:
-            raise SavePageNowError("SPN was successful, but CDX lookup then failed. URL: {}".format(terminal_url))
-        return (cdx, body)
-
     def save_url_now_v2(self, url):
         """
         Returns a list of URLs, or raises an error on non-success.
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 077469a..58d3517 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -6,7 +6,7 @@ import requests
 from http.server import BaseHTTPRequestHandler, HTTPServer
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, SavePageNowRemoteError, CdxApiError
+from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.misc import gen_file_metadata
 from sandcrawler.html import extract_fulltext_url
 from sandcrawler.workers import SandcrawlerWorker
@@ -45,35 +45,29 @@ class IngestFileWorker(SandcrawlerWorker):
 
         cdx = self.cdx_client.lookup_latest(url, follow_redirects=True)
         if not cdx:
-            # TODO: refactor this to make adding new domains/patterns easier
-            # sciencedirect.com (Elsevier) requires browser crawling (SPNv2)
-            if ('sciencedirect.com' in url and '.pdf' in url) or ('osapublishing.org' in url) or ('pubs.acs.org/doi/' in url) or ('ieeexplore.ieee.org' in url and ('.pdf' in url or '/stamp/stamp.jsp' in url)) or ('hrmars.com' in url):
-                #print(url)
-                cdx_list = self.spn_client.save_url_now_v2(url)
-                for cdx_url in cdx_list:
-                    if 'pdf.sciencedirectassets.com' in cdx_url and '.pdf' in cdx_url:
-                        cdx = self.cdx_client.lookup_latest(cdx_url)
-                        break
-                    if 'osapublishing.org' in cdx_url and 'abstract.cfm' in cdx_url:
-                        cdx = self.cdx_client.lookup_latest(cdx_url)
-                        break
-                    if 'pubs.acs.org' in cdx_url and '/doi/pdf/' in cdx_url:
-                        cdx = self.cdx_client.lookup_latest(cdx_url)
-                        break
-                    if 'ieeexplore.ieee.org' in cdx_url and '.pdf' in cdx_url and 'arnumber=' in cdx_url:
-                        cdx = self.cdx_client.lookup_latest(cdx_url)
-                        break
-                    if 'hrmars.com' in cdx_url and 'journals/papers' in cdx_url:
-                        cdx = self.cdx_client.lookup_latest(cdx_url)
-                        break
-                if not cdx:
-                    # extraction didn't work as expected; fetch whatever SPN2 got
-                    cdx = self.cdx_client.lookup_latest(url, follow_redirects=True)
-                if not cdx:
-                    print("{}".format(cdx_list), file=sys.stderr)
-                    raise SavePageNowError("Failed to find terminal capture from SPNv2")
-            else:
-                return self.spn_client.save_url_now_v1(url)
+            cdx_list = self.spn_client.save_url_now_v2(url)
+            for cdx_url in cdx_list:
+                if 'pdf.sciencedirectassets.com' in cdx_url and '.pdf' in cdx_url:
+                    cdx = self.cdx_client.lookup_latest(cdx_url)
+                    break
+                if 'osapublishing.org' in cdx_url and 'abstract.cfm' in cdx_url:
+                    cdx = self.cdx_client.lookup_latest(cdx_url)
+                    break
+                if 'pubs.acs.org' in cdx_url and '/doi/pdf/' in cdx_url:
+                    cdx = self.cdx_client.lookup_latest(cdx_url)
+                    break
+                if 'ieeexplore.ieee.org' in cdx_url and '.pdf' in cdx_url and 'arnumber=' in cdx_url:
+                    cdx = self.cdx_client.lookup_latest(cdx_url)
+                    break
+                if 'hrmars.com' in cdx_url and 'journals/papers' in cdx_url:
+                    cdx = self.cdx_client.lookup_latest(cdx_url)
+                    break
+            if not cdx:
+                # extraction didn't work as expected; fetch whatever SPN2 got
+                cdx = self.cdx_client.lookup_latest(url, follow_redirects=True)
+            if not cdx:
+                print("{}".format(cdx_list), file=sys.stderr)
+                raise SavePageNowError("Failed to find terminal capture from SPNv2")
 
         try:
             resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url'])
--
cgit v1.2.3
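
A minimal sketch of the SPNv2-only flow that remains after this patch. This is
illustrative, not code from the patch: the example URL and the simplified
'.pdf' match stand in for the per-domain rules in the hunk above, and
SavePageNowClient is assumed to be constructed with its default endpoint and
IA_ACCESS_KEY / IA_SECRET_KEY set in the environment.

    from sandcrawler.ia import SavePageNowClient, CdxApiClient, SavePageNowError

    spn_client = SavePageNowClient()   # reads IA_ACCESS_KEY / IA_SECRET_KEY from env
    cdx_client = CdxApiClient()

    url = "https://example.com/article.pdf"   # illustrative URL, not from the patch

    # SPNv2 returns the list of URLs it crawled, or raises SavePageNowError
    cdx_list = spn_client.save_url_now_v2(url)

    cdx = None
    for cdx_url in cdx_list:
        if '.pdf' in cdx_url:   # simplified stand-in for the per-domain checks
            cdx = cdx_client.lookup_latest(cdx_url)
            break
    if not cdx:
        # same fallback as the patched ingest.py: take whatever SPNv2 captured
        cdx = cdx_client.lookup_latest(url, follow_redirects=True)
    if not cdx:
        raise SavePageNowError("Failed to find terminal capture from SPNv2")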