diff options
-rw-r--r-- | notes/ingest/2020-05_pubmed.md | 10 |
1 file changed, 10 insertions, 0 deletions
diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md
new file mode 100644
index 0000000..36d00a1
--- /dev/null
+++ b/notes/ingest/2020-05_pubmed.md
@@ -0,0 +1,10 @@
+
+From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1.
+
+Test small batch:
+
+    zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Run the whole batch:
+
+    zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1