diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-04-04 17:12:00 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-04-04 17:12:00 -0700 |
commit | e2a98a0d47f307e59e2b50f0a3945a2a4f9caaea (patch) | |
tree | d17938ee5f895b26f30d95b36391f7e7c648bb18 | |
parent | dadb26935c4d255c5a662f1e758bcf53864f7f95 (diff) | |
download | sandcrawler-e2a98a0d47f307e59e2b50f0a3945a2a4f9caaea.tar.gz sandcrawler-e2a98a0d47f307e59e2b50f0a3945a2a4f9caaea.zip |
.ua ingest notes
-rw-r--r-- | notes/tasks/2022-03-07_ukraine_firedrill.md | 29 |
1 file changed, 29 insertions, 0 deletions
```diff
diff --git a/notes/tasks/2022-03-07_ukraine_firedrill.md b/notes/tasks/2022-03-07_ukraine_firedrill.md
index 18bce5b..222f9b7 100644
--- a/notes/tasks/2022-03-07_ukraine_firedrill.md
+++ b/notes/tasks/2022-03-07_ukraine_firedrill.md
@@ -189,6 +189,35 @@ x URL list filtered from new OAI-PMH feed
     zstdcat oai_pmh_partial_dump_2022_03_01_urls.txt.zst | rg '\.ru/' | pv -l > oai_pmh_partial_dump_2022_03_01_urls.ru_tld.txt
     # 276k 0:00:03 [72.9k/s]
 
+
+### Landing Page Bulk Ingest
+
+Running these 2022-03-24, after targeted crawl completed:
+
+    zcat /srv/fatcat/tasks/ingest_ua_pdfs.2022-03-08.requests.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 103k 0:00:02 [36.1k/s]
+
+    zcat /srv/fatcat/tasks/ingest_by_pdfs.2022-03-09.requests.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 1.29k 0:00:00 [15.8k/s]
+
+    zcat /srv/fatcat/tasks/ingest_ru_pdfs.2022-03-09.requests.partial.json.gz \
+        | rg -v "\\\\" \
+        | jq . -c \
+        | pv -l \
+        | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    # 546k 0:00:13 [40.6k/s]
+
+It will probably take a week or more for these to complete.
+
+
 ## Outreach
 
 - openalex
```