From dc0841329257f037260b225b66ef80a73fbebea7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 13 Apr 2020 13:20:47 -0700
Subject: MAG import notes

---
 notes/ingest/2020-03-04_mag.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'notes/ingest')

diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
index a5624c2..97594c8 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03-04_mag.md
@@ -393,3 +393,16 @@ heritrix):
 
     # in sandcrawler pipenv
     ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json
+
+## Bulk Ingest of Heritrix Content
+
+Small sample:
+
+    head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full run:
+
+    cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+    2020-04-07 12:19 (pacific): 11,703,871
+
-- 
cgit v1.2.3