From b150a62569a972b2719da71403b744bafa4f3fb6 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 25 Jun 2020 16:33:47 -0700
Subject: 2020-05_pubmed ingest notes (short)

---
 notes/ingest/2020-05_pubmed.md | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 notes/ingest/2020-05_pubmed.md

diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md
new file mode 100644
index 0000000..36d00a1
--- /dev/null
+++ b/notes/ingest/2020-05_pubmed.md
@@ -0,0 +1,10 @@
+
+From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1.
+
+Test small batch:
+
+    zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+    zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
-- 
cgit v1.2.3