about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-09 17:31:08 -0800
commit 24185837a47f305757a5c783b95ca25b709f66e3 (patch)
tree e71e179e93932ad04ba14dbc6308d3e4deb3eeb7 /python/sandcrawler
parent 00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (diff)
download sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.tar.gz
sandcrawler-24185837a47f305757a5c783b95ca25b709f66e3.zip
refactor ingest to a loop, allowing multiple hops
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/ingest.py 73
1 files changed, 48 insertions, 25 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 4b6c587..591c971 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -151,47 +151,70 @@ class IngestFileWorker(SandcrawlerWorker):
result = dict(request=request, hit=False)
- try:
- # first hop
- resource = self.find_resource(base_url, best_mimetype)
+ next_url = base_url
+ hops = [base_url]
+ self.max_hops = 4
+
+
+ while len(hops) <= self.max_hops:
+
+ result['hops'] = hops
+ try:
+ resource = self.find_resource(next_url, best_mimetype)
+ except SavePageNowError as e:
+ result['status'] = 'spn-error'
+ result['error_message'] = str(e)
+ return result
+ except PetaboxError as e:
+ result['status'] = 'petabox-error'
+ result['error_message'] = str(e)
+ return result
+ except CdxApiError as e:
+ result['status'] = 'cdx-error'
+ result['error_message'] = str(e)
+ return result
+ except WaybackError as e:
+ result['status'] = 'wayback-error'
+ result['error_message'] = str(e)
+ return result
+
if not resource.hit:
result['status'] = resource.status
return result
file_meta = gen_file_metadata(resource.body)
if "html" in file_meta['mimetype']:
- # got landing page, try another hop
+ # got landing page or similar
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['html'] = fulltext_url
if not fulltext_url or not 'pdf_url' in fulltext_url:
result['status'] = 'no-pdf-link'
+ if resource.terminal_dt:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
return result
print("\tlanding page URL extracted ({}): {}".format(
fulltext_url.get('technique'),
fulltext_url['pdf_url'],
),
file=sys.stderr)
- resource = self.find_resource(fulltext_url['pdf_url'], best_mimetype)
- if not resource.hit:
- result['status'] = resource.status
+ next_url = fulltext_url['pdf_url']
+ if next_url in hops:
+ result['status'] = 'link-loop'
+ result['error_message'] = "repeated: {}".format(next_url)
return result
- file_meta = gen_file_metadata(resource.body)
- except SavePageNowError as e:
- result['status'] = 'spn-error'
- result['error_message'] = str(e)
- return result
- except PetaboxError as e:
- result['status'] = 'petabox-error'
- result['error_message'] = str(e)
- return result
- except CdxApiError as e:
- result['status'] = 'cdx-error'
- result['error_message'] = str(e)
- return result
- except WaybackError as e:
- result['status'] = 'wayback-error'
- result['error_message'] = str(e)
+ hops.append(next_url)
+ continue
+
+ # default is to NOT keep hopping
+ break
+
+ if len(hops) >= self.max_hops:
+ result['status'] = "max-hops-exceeded"
return result
if resource.terminal_dt:
@@ -201,11 +224,12 @@ class IngestFileWorker(SandcrawlerWorker):
"terminal_status_code": resource.terminal_status_code,
}
- # must be a hit if we got this far
+ # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
assert resource.hit == True
assert resource.terminal_status_code == 200
result['file_meta'] = file_meta
+ result['cdx'] = cdx_to_dict(resource.cdx)
# other failure cases
if not resource.body or file_meta['size_bytes'] == 0:
@@ -221,7 +245,6 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "success"
result['hit'] = True
- result['cdx'] = cdx_to_dict(resource.cdx)
return result