diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-02-22 14:05:24 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-22 15:53:26 -0800 | 
| commit | 24ef8310c106ea020a34a6cb48e2ccca4b2c3c18 (patch) | |
| tree | 592aa7a97bb64d5f770abfa06e6638827f420ffe /python | |
| parent | 04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44 (diff) | |
| download | sandcrawler-24ef8310c106ea020a34a6cb48e2ccca4b2c3c18.tar.gz sandcrawler-24ef8310c106ea020a34a6cb48e2ccca4b2c3c18.zip  | |
ingest: include better terminal URL/status_code/dt
Previously, a lot of "last hit" metadata was ending up in these columns.
Diffstat (limited to 'python')
| -rw-r--r-- | python/sandcrawler/ingest.py | 8 | 
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6f9ea45..6ec54f6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -275,6 +275,14 @@ class IngestFileWorker(SandcrawlerWorker):
 
             if not resource.hit:
                 result['status'] = resource.status
+                if resource.terminal_dt and resource.terminal_status_code:
+                    result['terminal'] = {
+                        "terminal_url": resource.terminal_url,
+                        "terminal_dt": resource.terminal_dt,
+                        "terminal_status_code": resource.terminal_status_code,
+                    }
+                    if resource.terminal_url not in result['hops']:
+                        result['hops'].append(resource.terminal_url)
                 return result
 
             if not resource.body:
