diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 12:59:02 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 12:59:02 -0800 | 
| commit | d31f491f0a297c88594b9e8f2b0baa25fa10fd63 (patch) | |
| tree | 7d54f9581714ceef1fe4865a6acd5537ca476cb0 /python | |
| parent | 03bfb530e782603e597368f765adee6db7986a1a (diff) | |
| download | sandcrawler-d31f491f0a297c88594b9e8f2b0baa25fa10fd63.tar.gz sandcrawler-d31f491f0a297c88594b9e8f2b0baa25fa10fd63.zip  | |
improve ingest robustness (for legacy requests)
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/ingest.py | 18 | 
1 files changed, 12 insertions, 6 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fe07a89..5dde205 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -136,6 +136,10 @@ class IngestFileWorker(SandcrawlerWorker):
     def process(self, request):
+        # backwards compatibility
+        if request.get('ingest_type') in ('file', None):
+            reqeust['ingest_type'] = 'pdf'
+
         # for now, only pdf ingest is implemented
         assert request.get('ingest_type') == "pdf"
         ingest_type = request.get('ingest_type')
@@ -184,18 +188,20 @@ class IngestFileWorker(SandcrawlerWorker):
             file_meta = gen_file_metadata(resource.body)
             if "html" in file_meta['mimetype']:
+
                 # got landing page or similar
+                if resource.terminal_dt:
+                    result['terminal'] = {
+                        "terminal_url": resource.terminal_url,
+                        "terminal_dt": resource.terminal_dt,
+                        "terminal_status_code": resource.terminal_status_code,
+                    }
+
                 fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
                 result['html'] = fulltext_url
                 if not fulltext_url:
                     result['status'] = 'no-pdf-link'
-                    if resource.terminal_dt:
-                        result['terminal'] = {
-                            "terminal_url": resource.terminal_url,
-                            "terminal_dt": resource.terminal_dt,
-                            "terminal_status_code": resource.terminal_status_code,
-                        }
                     return result
                 next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
                 assert next_url
```
