From 24ef8310c106ea020a34a6cb48e2ccca4b2c3c18 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 22 Feb 2020 14:05:24 -0800 Subject: ingest: include better terminal URL/status_code/dt Was getting a lot of "last hit" metadata for these columns. --- python/sandcrawler/ingest.py | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'python') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 6f9ea45..6ec54f6 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -275,6 +275,14 @@ class IngestFileWorker(SandcrawlerWorker): if not resource.hit: result['status'] = resource.status + if resource.terminal_dt and resource.terminal_status_code: + result['terminal'] = { + "terminal_url": resource.terminal_url, + "terminal_dt": resource.terminal_dt, + "terminal_status_code": resource.terminal_status_code, + } + if resource.terminal_url not in result['hops']: + result['hops'].append(resource.terminal_url) return result if not resource.body: -- cgit v1.2.3