From d3638a9fd9ed11fb4484038852f8e02b2f5a7b41 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 22 Mar 2022 16:03:46 -0700 Subject: various ingest/task notes --- notes/ingest/2022-03_doaj.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'notes/ingest/2022-03_doaj.md') diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md index bace480..9722459 100644 --- a/notes/ingest/2022-03_doaj.md +++ b/notes/ingest/2022-03_doaj.md @@ -264,3 +264,15 @@ Create seedlist: Sent off and added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will re-ingest when that completes (a week or two?). + + +## Bulk Ingest + +After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up. + + # 2022-03-22 + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + -- cgit v1.2.3