about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
commit 24185837a47f305757a5c783b95ca25b709f66e3 (patch)
tree e71e179e93932ad04ba14dbc6308d3e4deb3eeb7 /python
parent 00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (diff)
download sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.tar.gz
download sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.zip
refactor ingest to a loop, allowing multiple hops
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 73
-rw-r--r-- python/tests/test_ingest.py 11
2 files changed, 57 insertions, 27 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 4b6c587..591c971 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -151,47 +151,70 @@ class IngestFileWorker(SandcrawlerWorker):
result = dict(request=request, hit=False)
- try:
- # first hop
- resource = self.find_resource(base_url, best_mimetype)
+ next_url = base_url
+ hops = [base_url]
+ self.max_hops = 4
+
+
+ while len(hops) <= self.max_hops:
+
+ result['hops'] = hops
+ try:
+ resource = self.find_resource(next_url, best_mimetype)
+ except SavePageNowError as e:
+ result['status'] = 'spn-error'
+ result['error_message'] = str(e)
+ return result
+ except PetaboxError as e:
+ result['status'] = 'petabox-error'
+ result['error_message'] = str(e)
+ return result
+ except CdxApiError as e:
+ result['status'] = 'cdx-error'
+ result['error_message'] = str(e)
+ return result
+ except WaybackError as e:
+ result['status'] = 'wayback-error'
+ result['error_message'] = str(e)
+ return result
+
if not resource.hit:
result['status'] = resource.status
return result
file_meta = gen_file_metadata(resource.body)
if "html" in file_meta['mimetype']:
- # got landing page, try another hop
+ # got landing page or similar
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['html'] = fulltext_url
if not fulltext_url or not 'pdf_url' in fulltext_url:
result['status'] = 'no-pdf-link'
+ if resource.terminal_dt:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
return result
print("\tlanding page URL extracted ({}): {}".format(
fulltext_url.get('technique'),
fulltext_url['pdf_url'],
),
file=sys.stderr)
- resource = self.find_resource(fulltext_url['pdf_url'], best_mimetype)
- if not resource.hit:
- result['status'] = resource.status
+ next_url = fulltext_url['pdf_url']
+ if next_url in hops:
+ result['status'] = 'link-loop'
+ result['error_message'] = "repeated: {}".format(next_url)
return result
- file_meta = gen_file_metadata(resource.body)
- except SavePageNowError as e:
- result['status'] = 'spn-error'
- result['error_message'] = str(e)
- return result
- except PetaboxError as e:
- result['status'] = 'petabox-error'
- result['error_message'] = str(e)
- return result
- except CdxApiError as e:
- result['status'] = 'cdx-error'
- result['error_message'] = str(e)
- return result
- except WaybackError as e:
- result['status'] = 'wayback-error'
- result['error_message'] = str(e)
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ if len(hops) >= self.max_hops:
+ result['status'] = "max-hops-exceeded"
return result
if resource.terminal_dt:
@@ -201,11 +224,12 @@ class IngestFileWorker(SandcrawlerWorker):
"terminal_status_code": resource.terminal_status_code,
}
- # must be a hit if we got this far
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
assert resource.hit == True
assert resource.terminal_status_code == 200
result['file_meta'] = file_meta
+ result['cdx'] = cdx_to_dict(resource.cdx)
# other failure cases
if not resource.body or file_meta['size_bytes'] == 0:
@@ -221,7 +245,6 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "success"
result['hit'] = True
- result['cdx'] = cdx_to_dict(resource.cdx)
return result
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 8692b21..f5599e9 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -109,12 +109,19 @@ def test_ingest_landing(ingest_worker):
headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
body=WARC_BODY)
+ # this is for second time around; don't want to fetch same landing page
+ # HTML again and result in a loop
+ responses.add(responses.GET,
+ 'https://web.archive.org/web/{}id_/{}'.format("20180326070330", TARGET + "/redirect"),
+ status=200,
+ headers={"X-Archive-Src": "liveweb-whatever.warc.gz"},
+ body="<html></html>")
+
resp = ingest_worker.process(request)
print(resp)
assert resp['hit'] == False
- assert resp['status'] == "wrong-mimetype"
+ assert resp['status'] == "no-pdf-link"
assert resp['request'] == request
assert 'grobid' not in resp
- assert resp['terminal']