From e1b3edd7af59fe0fd4272a4696387ea09a22a6c0 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 17 Mar 2020 16:36:42 -0700
Subject: unpaywall large ingest notes

---
 notes/ingest/2020-02-14_unpaywall_ingest.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'notes')

diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02-14_unpaywall_ingest.md
index 0bedfdb..24779df 100644
--- a/notes/ingest/2020-02-14_unpaywall_ingest.md
+++ b/notes/ingest/2020-02-14_unpaywall_ingest.md
@@ -474,3 +474,13 @@ Note: will probably end up re-running the below after crawling+ingesting the abo
     ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json';
     => 654,885
 
+## Batch Ingest
+
+Test small batch:
+
+    head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Full batch:
+
+    cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
-- 
cgit v1.2.3