diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-22 14:05:24 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-22 15:53:26 -0800 |
commit | 24ef8310c106ea020a34a6cb48e2ccca4b2c3c18 (patch) | |
tree | 592aa7a97bb64d5f770abfa06e6638827f420ffe | |
parent | 04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44 (diff) | |
download | sandcrawler-24ef8310c106ea020a34a6cb48e2ccca4b2c3c18.tar.gz sandcrawler-24ef8310c106ea020a34a6cb48e2ccca4b2c3c18.zip |
ingest: include better terminal URL/status_code/dt
Was getting a lot of "last hit" metadata for these columns.
-rw-r--r-- | python/sandcrawler/ingest.py | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6f9ea45..6ec54f6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -275,6 +275,14 @@ class IngestFileWorker(SandcrawlerWorker):
 
         if not resource.hit:
             result['status'] = resource.status
+            if resource.terminal_dt and resource.terminal_status_code:
+                result['terminal'] = {
+                    "terminal_url": resource.terminal_url,
+                    "terminal_dt": resource.terminal_dt,
+                    "terminal_status_code": resource.terminal_status_code,
+                }
+                if resource.terminal_url not in result['hops']:
+                    result['hops'].append(resource.terminal_url)
             return result
 
         if not resource.body: