diff options
Diffstat (limited to 'notes')
-rw-r--r-- | notes/ingest/2020-03-04_mag.md | 13 |
1 files changed, 13 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md index a5624c2..97594c8 100644 --- a/notes/ingest/2020-03-04_mag.md +++ b/notes/ingest/2020-03-04_mag.md @@ -393,3 +393,16 @@ heritrix): # in sandcrawler pipenv ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json + +## Bulk Ingest of Heritrix Content + +Small sample: + + head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full run: + + cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + 2020-04-07 12:19 (pacific): 11,703,871 + |