about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 18
1 files changed, 12 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fe07a89..5dde205 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -136,6 +136,10 @@ class IngestFileWorker(SandcrawlerWorker):
def process(self, request):
+ # backwards compatibility
+ if request.get('ingest_type') in ('file', None):
+ request['ingest_type'] = 'pdf'
+
# for now, only pdf ingest is implemented
assert request.get('ingest_type') == "pdf"
ingest_type = request.get('ingest_type')
@@ -184,18 +188,20 @@ class IngestFileWorker(SandcrawlerWorker):
file_meta = gen_file_metadata(resource.body)
if "html" in file_meta['mimetype']:
+
# got landing page or similar
+ if resource.terminal_dt:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+
fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
result['html'] = fulltext_url
if not fulltext_url:
result['status'] = 'no-pdf-link'
- if resource.terminal_dt:
- result['terminal'] = {
- "terminal_url": resource.terminal_url,
- "terminal_dt": resource.terminal_dt,
- "terminal_status_code": resource.terminal_status_code,
- }
return result
next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
assert next_url