From 568da1363626d07d0956fb8a60bb7d7fd9054d83 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 8 Dec 2020 16:38:56 -0800 Subject: commit sept 2020 scielo ingest notes --- notes/ingest/2020-09_scielo.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 notes/ingest/2020-09_scielo.md diff --git a/notes/ingest/2020-09_scielo.md b/notes/ingest/2020-09_scielo.md new file mode 100644 index 0000000..4ec6fbd --- /dev/null +++ b/notes/ingest/2020-09_scielo.md @@ -0,0 +1,21 @@ + +As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing +fatcat releases with no IA copy and with `publisher_type:scielo`. There are +200k+ such releases. + +It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008 + +Could try XML ingest of these! + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json + Expecting 212529 release objects in search queries + +Enqueue + + cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done 2020-09-14 + -- cgit v1.2.3