From e2a98a0d47f307e59e2b50f0a3945a2a4f9caaea Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 4 Apr 2022 17:12:00 -0700
Subject: .ua ingest notes

---
 notes/tasks/2022-03-07_ukraine_firedrill.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
index 18bce5b..222f9b7 100644
--- a/notes/tasks/2022-03-07_ukraine_firedrill.md
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -189,6 +189,35 @@ x URL list filtered from new OAI-PMH feed
     zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
     # 276k 0:00:03 [72.9k/s]
 
+
+### Landing Page Bulk Ingest
+
+Running these 2022-03-24, after targeted crawl completed:
+
+    zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 103k 0:00:02 [36.1k/s]
+
+    zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 1.29k 0:00:00 [15.8k/s]
+
+    zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
 ## Outreach
 
 - openalex
-- 
cgit v1.2.3