diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 16:03:02 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 16:03:02 -0800 |
commit | 0897e3a5714ff8549e8ab68f44f0c49ce3f0405d (patch) | |
tree | 93d359fc711a4bc3813e3020ef790e323c3516eb | |
parent | 9529cbb2660897ce3ffe3986f60eafbf3596495d (diff) | |
download | sandcrawler-0897e3a5714ff8549e8ab68f44f0c49ce3f0405d.tar.gz sandcrawler-0897e3a5714ff8549e8ab68f44f0c49ce3f0405d.zip |
more progress on file ingest
-rwxr-xr-x | python/ingest_file.py | 3 |
-rw-r--r-- | python/sandcrawler/html.py | 19 |
-rw-r--r-- | python/sandcrawler/ia.py | 23 |
-rw-r--r-- | python/sandcrawler/ingest.py | 47 |
4 files changed, 75 insertions, 17 deletions
diff --git a/python/ingest_file.py b/python/ingest_file.py index 40eee4d..4fd44ca 100755 --- a/python/ingest_file.py +++ b/python/ingest_file.py @@ -20,9 +20,10 @@ def run_single_ingest(args): return result def run_requests(args): + ingester = FileIngester() for l in args.json_file: request = json.loads(l.strip()) - result = ingest_file(request) + result = ingester.ingest_file(request) print(json.dumps(result)) def run_api(args): diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 3191b66..858e02a 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -6,6 +6,7 @@ import urllib.parse from bs4 import BeautifulSoup RESEARCHSQUARE_REGEX = re.compile(r'"url":"(https://assets.researchsquare.com/files/.{1,50}/v\d+/Manuscript.pdf)"') +IEEEXPLORE_REGEX = re.compile(r'"pdfPath":"(/.*?\.pdf)"') def extract_fulltext_url(html_url, html_body): """ @@ -70,4 +71,22 @@ def extract_fulltext_url(html_url, html_body): url = urllib.parse.unquote(url) return dict(next_url=url) + # ieeexplore.ieee.org + # https://ieeexplore.ieee.org/document/8730316 + if '://ieeexplore.ieee.org/document/' in html_url: + # JSON in body with a field like: + # "pdfPath":"/iel7/6287639/8600701/08730316.pdf", + m = IEEEXPLORE_REGEX.search(html_body.decode('utf-8')) + if m: + url = m.group(1) + assert len(url) < 1024 + return dict(release_stage="published", pdf_url=host_prefix+url) + # https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=8730313 + if '://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber' in html_url: + # HTML iframe like: + # <iframe src="http://web.archive.org/web/20191026011528if_/https://ieeexplore.ieee.org/ielx7/6287639/8600701/08730313.pdf?tp=&arnumber=8730313&isnumber=8600701&ref=" frameborder="0"></iframe> + iframe = soup.find("iframe") + if iframe and '.pdf' in iframe['src']: + return dict(pdf_url=iframe['src']) + return dict() diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index a772bd4..455c9f6 100644 --- 
a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -5,6 +5,7 @@ import os, sys, time import requests +import datetime import wayback.exception from http.client import IncompleteRead @@ -21,7 +22,7 @@ class CdxApiClient: def __init__(self, host_url="https://web.archive.org/cdx/search/cdx"): self.host_url = host_url - def lookup_latest(self, url, follow_redirects=False): + def lookup_latest(self, url, recent_only=True, follow_redirects=False): """ Looks up most recent HTTP 200 record for the given URL. @@ -29,21 +30,22 @@ class CdxApiClient: XXX: should do authorized lookup using cookie to get all fields """ + WAYBACK_ENDPOINT = "https://web.archive.org/web/" + since = datetime.date.today() - datetime.timedelta(weeks=4) params = { 'url': url, 'matchType': 'exact', 'limit': -1, 'output': 'json', } + if recent_only: + params['from'] = '%04d%02d%02d' % (since.year, since.month, since.day), if not follow_redirects: params['filter'] = 'statuscode:200' resp = requests.get(self.host_url, params=params) - if follow_redirects: - raise NotImplementedError - else: - if resp.status_code != 200: - raise CdxApiError(resp.text) + if resp.status_code != 200: + raise CdxApiError(resp.text) rj = resp.json() if len(rj) <= 1: return None @@ -58,6 +60,12 @@ class CdxApiClient: sha1b32=cdx[5], sha1hex=b32_hex(cdx[5]), ) + if follow_redirects and cdx['http_status'] in (301, 302): + resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url']) + assert resp.status_code == 200 + next_url = '/'.join(resp.url.split('/')[5:]) + assert next_url != url + return self.lookup_latest(next_url) return cdx @@ -183,11 +191,14 @@ class SavePageNowClient: status = resp.json()['status'] if status == 'success': resp = resp.json() + if resp.get('message', '').startswith('The same snapshot had been made'): + raise SavePageNowError("SPN2 re-snapshot withing short time window") break elif status == 'pending': time.sleep(1.0) else: raise SavePageNowError("SPN2 status:{} 
url:{}".format(status, url)) + #print(resp) return resp['resources'] diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 2469df6..e2dd44c 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -5,7 +5,7 @@ import base64 import requests from http.server import BaseHTTPRequestHandler, HTTPServer -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient +from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError from sandcrawler.grobid import GrobidClient from sandcrawler.misc import gen_file_metadata from sandcrawler.html import extract_fulltext_url @@ -39,10 +39,12 @@ class FileIngester: WAYBACK_ENDPOINT = "https://web.archive.org/web/" - cdx = self.cdx_client.lookup_latest(url) + cdx = self.cdx_client.lookup_latest(url, follow_redirects=True) if not cdx: + # TODO: refactor this to make adding new domains/patterns easier # sciencedirect.com (Elsevier) requires browser crawling (SPNv2) - if ('sciencedirect.com' in url and '.pdf' in url): + if ('sciencedirect.com' in url and '.pdf' in url) or ('osapublishing.org' in url) or ('pubs.acs.org/doi/' in url) or ('ieeexplore.ieee.org' in url and ('.pdf' in url or '/stamp/stamp.jsp' in url)): + #print(url) cdx_list = self.spn_client.save_url_now_v2(url) for cdx_url in cdx_list: if 'pdf.sciencedirectassets.com' in cdx_url and '.pdf' in cdx_url: @@ -51,8 +53,19 @@ class FileIngester: if 'osapublishing.org' in cdx_url and 'abstract.cfm' in cdx_url: cdx = self.cdx_client.lookup_latest(cdx_url) break + if 'pubs.acs.org' in cdx_url and '/doi/pdf/' in cdx_url: + cdx = self.cdx_client.lookup_latest(cdx_url) + break + if 'ieeexplore.ieee.org' in cdx_url and '.pdf' in cdx_url and 'arnumber=' in cdx_url: + cdx = self.cdx_client.lookup_latest(cdx_url) + break if not cdx: - raise Exception("Failed to crawl sciencedirect.com PDF URL") + # extraction didn't work as expected; fetch whatever SPN2 got + cdx = self.cdx_client.lookup_latest(url, 
follow_redirects=True) + if not cdx: + raise SavePageNowError("") + sys.stderr.write("{}\n".format(cdx_list)) + raise Exception("Failed to crawl PDF URL") else: return self.spn_client.save_url_now_v1(url) @@ -87,18 +100,31 @@ class FileIngester: sys.stderr.write("CDX hit: {}\n".format(cdx_dict)) response['cdx'] = cdx_dict - response['terminal'] = dict() + # TODO: populate terminal + response['terminal'] = dict(url=cdx_dict['url'], http_status=cdx_dict['http_status']) + if not body: + response['status'] = 'null-body' + return response file_meta = gen_file_metadata(body) mimetype = cdx_dict['mimetype'] if mimetype in ('warc/revisit', 'binary/octet-stream', 'application/octet-stream'): mimetype = file_meta['mimetype'] + response['file_meta'] = file_meta if 'html' in mimetype: page_metadata = extract_fulltext_url(response['cdx']['url'], body) if page_metadata and page_metadata.get('pdf_url'): - url = page_metadata.get('pdf_url') + next_url = page_metadata.get('pdf_url') + if next_url == url: + response['status'] = 'link-loop' + return response + url = next_url continue elif page_metadata and page_metadata.get('next_url'): - url = page_metadata.get('next_url') + next_url = page_metadata.get('next_url') + if next_url == url: + response['status'] = 'link-loop' + return response + url = next_url continue else: response['terminal']['html'] = page_metadata @@ -116,7 +142,7 @@ class FileIngester: # do GROBID response['grobid'] = self.grobid_client.process_fulltext(body) - sys.stderr.write("GROBID status: {}\n".format(response['grobid']['status'])) + #sys.stderr.write("GROBID status: {}\n".format(response['grobid']['status'])) # TODO: optionally publish to Kafka here, but continue on failure (but # send a sentry exception?) @@ -129,7 +155,7 @@ class FileIngester: response['grobid'].pop('tei_xml') # Ok, now what? 
- sys.stderr.write("GOT TO END\n") + #sys.stderr.write("GOT TO END\n") response['status'] = "success" response['hit'] = True return response @@ -144,7 +170,8 @@ class IngestFileRequestHandler(BaseHTTPRequestHandler): length = int(self.headers.get('content-length')) request = json.loads(self.rfile.read(length).decode('utf-8')) print("Got request: {}".format(request)) - result = ingest_file(request) + ingester = FileIngester() + result = ingester.ingest_file(request) self.send_response(200) self.end_headers() self.wfile.write(json.dumps(result)) |