aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-10 12:59:02 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-10 12:59:02 -0800
commitd31f491f0a297c88594b9e8f2b0baa25fa10fd63 (patch)
tree7d54f9581714ceef1fe4865a6acd5537ca476cb0
parent03bfb530e782603e597368f765adee6db7986a1a (diff)
downloadsandcrawler-d31f491f0a297c88594b9e8f2b0baa25fa10fd63.tar.gz
sandcrawler-d31f491f0a297c88594b9e8f2b0baa25fa10fd63.zip
improve ingest robustness (for legacy requests)
-rw-r--r--python/sandcrawler/ingest.py18
1 files changed, 12 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fe07a89..5dde205 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -136,6 +136,10 @@ class IngestFileWorker(SandcrawlerWorker):
def process(self, request):
+ # backwards compatibility
+ if request.get('ingest_type') in ('file', None):
+ request['ingest_type'] = 'pdf'
+
# for now, only pdf ingest is implemented
assert request.get('ingest_type') == "pdf"
ingest_type = request.get('ingest_type')
@@ -184,18 +188,20 @@ class IngestFileWorker(SandcrawlerWorker):
file_meta = gen_file_metadata(resource.body)
if "html" in file_meta['mimetype']:
+
# got landing page or similar
+ if resource.terminal_dt:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['html'] = fulltext_url
if not fulltext_url:
result['status'] = 'no-pdf-link'
- if resource.terminal_dt:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
return result
next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
assert next_url