diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 12:59:02 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 12:59:02 -0800 |
commit | d31f491f0a297c88594b9e8f2b0baa25fa10fd63 (patch) | |
tree | 7d54f9581714ceef1fe4865a6acd5537ca476cb0 | |
parent | 03bfb530e782603e597368f765adee6db7986a1a (diff) | |
download | sandcrawler-d31f491f0a297c88594b9e8f2b0baa25fa10fd63.tar.gz sandcrawler-d31f491f0a297c88594b9e8f2b0baa25fa10fd63.zip |
improve ingest robustness (for legacy requests)
-rw-r--r-- | python/sandcrawler/ingest.py | 18 |
1 files changed, 12 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index fe07a89..5dde205 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -136,6 +136,10 @@ class IngestFileWorker(SandcrawlerWorker): def process(self, request): + # backwards compatibility + if request.get('ingest_type') in ('file', None): + request['ingest_type'] = 'pdf' + # for now, only pdf ingest is implemented assert request.get('ingest_type') == "pdf" ingest_type = request.get('ingest_type') @@ -184,18 +188,20 @@ class IngestFileWorker(SandcrawlerWorker): file_meta = gen_file_metadata(resource.body) if "html" in file_meta['mimetype']: + # got landing page or similar + if resource.terminal_dt: + result['terminal'] = { + "terminal_url": resource.terminal_url, + "terminal_dt": resource.terminal_dt, + "terminal_status_code": resource.terminal_status_code, + } + fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) result['html'] = fulltext_url if not fulltext_url: result['status'] = 'no-pdf-link' - if resource.terminal_dt: - result['terminal'] = { - "terminal_url": resource.terminal_url, - "terminal_dt": resource.terminal_dt, - "terminal_status_code": resource.terminal_status_code, - } return result next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') assert next_url |