From c3cbab57fc5b27a5add399dd27dff0a91c9d9fa1 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 3 Sep 2021 18:32:39 -0700 Subject: commit old arxiv ingest notes --- notes/ingest/2020-11-04_arxiv.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 notes/ingest/2020-11-04_arxiv.md (limited to 'notes') diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md new file mode 100644 index 0000000..f9abe09 --- /dev/null +++ b/notes/ingest/2020-11-04_arxiv.md @@ -0,0 +1,12 @@ + +Ran a bulk dump using the fatcat ingest tool several months ago, and had Martin run +a crawl. + +The crawl is now done, so going to ingest the results, hoping to get the majority of the +millions of remaining arxiv.org PDFs. + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l + => 1,288,559 + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + -- cgit v1.2.3