From 24185837a47f305757a5c783b95ca25b709f66e3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 9 Jan 2020 17:31:08 -0800 Subject: refactor ingest to a loop, allowing multiple hops --- python/sandcrawler/ingest.py | 73 +++++++++++++++++++++++++++++--------------- python/tests/test_ingest.py | 11 +++++-- 2 files changed, 57 insertions(+), 27 deletions(-) (limited to 'python') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 4b6c587..591c971 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -151,47 +151,70 @@ class IngestFileWorker(SandcrawlerWorker): result = dict(request=request, hit=False) - try: - # first hop - resource = self.find_resource(base_url, best_mimetype) + next_url = base_url + hops = [base_url] + self.max_hops = 4 + + + while len(hops) <= self.max_hops: + + result['hops'] = hops + try: + resource = self.find_resource(next_url, best_mimetype) + except SavePageNowError as e: + result['status'] = 'spn-error' + result['error_message'] = str(e) + return result + except PetaboxError as e: + result['status'] = 'petabox-error' + result['error_message'] = str(e) + return result + except CdxApiError as e: + result['status'] = 'cdx-error' + result['error_message'] = str(e) + return result + except WaybackError as e: + result['status'] = 'wayback-error' + result['error_message'] = str(e) + return result + if not resource.hit: result['status'] = resource.status return result file_meta = gen_file_metadata(resource.body) if "html" in file_meta['mimetype']: - # got landing page, try another hop + # got landing page or similar fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) result['html'] = fulltext_url if not fulltext_url or not 'pdf_url' in fulltext_url: result['status'] = 'no-pdf-link' + if resource.terminal_dt: + result['terminal'] = { + "terminal_url": resource.terminal_url, + "terminal_dt": resource.terminal_dt, + "terminal_status_code": resource.terminal_status_code, + } return result print("\tlanding page URL extracted ({}): {}".format( fulltext_url.get('technique'), fulltext_url['pdf_url'], ), file=sys.stderr) - resource = self.find_resource(fulltext_url['pdf_url'], best_mimetype) - if not resource.hit: - result['status'] = resource.status + next_url = fulltext_url['pdf_url'] + if next_url in hops: + result['status'] = 'link-loop' + result['error_message'] = "repeated: {}".format(next_url) return result - file_meta = gen_file_metadata(resource.body) - except SavePageNowError as e: - result['status'] = 'spn-error' - result['error_message'] = str(e) - return result - except PetaboxError as e: - result['status'] = 'petabox-error' - result['error_message'] = str(e) - return result - except CdxApiError as e: - result['status'] = 'cdx-error' - result['error_message'] = str(e) - return result - except WaybackError as e: - result['status'] = 'wayback-error' - result['error_message'] = str(e) + hops.append(next_url) + continue + + # default is to NOT keep hopping + break + + if len(hops) >= self.max_hops: + result['status'] = "max-hops-exceeded" return result if resource.terminal_dt: @@ -201,11 +224,12 @@ class IngestFileWorker(SandcrawlerWorker): "terminal_status_code": resource.terminal_status_code, } - # must be a hit if we got this far + # fetch must be a hit if we got this far (though not necessarily an ingest hit!) assert resource.hit == True assert resource.terminal_status_code == 200 result['file_meta'] = file_meta + result['cdx'] = cdx_to_dict(resource.cdx) # other failure cases if not resource.body or file_meta['size_bytes'] == 0: @@ -221,7 +245,6 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = "success" result['hit'] = True - result['cdx'] = cdx_to_dict(resource.cdx) return result diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index 8692b21..f5599e9 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -109,12 +109,19 @@ def test_ingest_landing(ingest_worker): headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, body=WARC_BODY) + # this is for second time around; don't want to fetch same landing page + # HTML again and result in a loop + responses.add(responses.GET, + 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"), + status=200, + headers={"X-Archive-Src": "liveweb-whatever.warc.gz"}, + body="") + resp = ingest_worker.process(request) print(resp) assert resp['hit'] == False - assert resp['status'] == "wrong-mimetype" + assert resp['status'] == "no-pdf-link" assert resp['request'] == request assert 'grobid' not in resp - assert resp['terminal'] -- cgit v1.2.3