about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-22 14:05:24 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-22 15:53:26 -0800
commit: 24ef8310c106ea020a34a6cb48e2ccca4b2c3c18 (patch)
tree: 592aa7a97bb64d5f770abfa06e6638827f420ffe
parent: 04cb1a01cbd1bc4f017ebd61d8b6732ea060ee44 (diff)
download: sandcrawler-24ef8310c106ea020a34a6cb48e2ccca4b2c3c18.tar.gz
sandcrawler-24ef8310c106ea020a34a6cb48e2ccca4b2c3c18.zip
ingest: include better terminal URL/status_code/dt
We were getting a lot of "last hit" metadata in these columns.
-rw-r--r-- python/sandcrawler/ingest.py 8
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 6f9ea45..6ec54f6 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -275,6 +275,14 @@ class IngestFileWorker(SandcrawlerWorker):
if not resource.hit:
result['status'] = resource.status
+ if resource.terminal_dt and resource.terminal_status_code:
+ result['terminal'] = {
+ "terminal_url": resource.terminal_url,
+ "terminal_dt": resource.terminal_dt,
+ "terminal_status_code": resource.terminal_status_code,
+ }
+ if resource.terminal_url not in result['hops']:
+ result['hops'].append(resource.terminal_url)
return result
if not resource.body: