diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 12:54:59 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-24 12:54:59 -0800 |
commit | 1a42a73e8c494d13e671d0fc6729665d85010669 (patch) | |
tree | 1f6fcad2a15e859183bf54d2ab678cb5c212dd74 /notes/tasks/2020-02-21_ingest_backfills.md | |
parent | cfe04b60a28e90c694d2558eadb71c0b280c40e8 (diff) | |
download | sandcrawler-1a42a73e8c494d13e671d0fc6729665d85010669.tar.gz sandcrawler-1a42a73e8c494d13e671d0fc6729665d85010669.zip |
ingest backfill notes
Diffstat (limited to 'notes/tasks/2020-02-21_ingest_backfills.md')
-rw-r--r-- | notes/tasks/2020-02-21_ingest_backfills.md | 104 |
1 files changed, 104 insertions, 0 deletions
diff --git a/notes/tasks/2020-02-21_ingest_backfills.md b/notes/tasks/2020-02-21_ingest_backfills.md new file mode 100644 index 0000000..48df910 --- /dev/null +++ b/notes/tasks/2020-02-21_ingest_backfills.md @@ -0,0 +1,104 @@ + +Follow-ups to last ingest backfill. Only run these when ingest request topic is +empty, and full persist chain has run successfully. + +## Corona virus stuff + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV + +## Large OA Publishers + +Should probably check domain stats/success for all of these first. + +Would also be good to have a "randomize" option. Could fake that by dumping to +disk first. + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier + + ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer + + # ??? + ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + +## Fixed OA Publishers (small tests) + + # american archivist + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + => Expecting 2920 release objects in search queries + => Counter({'estimate': 2920, 'elasticsearch_release': 26, 'ingest_request': 25, 'kafka': 25}) + => good + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + => Expecting 42897 release objects in search queries + => Counter({'estimate': 42897, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher frontiers + => Expecting 35427 release objects in search queries + => Counter({'estimate': 35427, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) + => mixed results? + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + => Expecting 43111 release objects in search queries + => Counter({'estimate': 43111, 'elasticsearch_release': 25, 'ingest_request': 25, 'kafka': 25}) + => success, fast + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "American Heart Association" + => Expecting 185240 release objects in search queries + => Counter({'estimate': 185240, 'kafka': 25, 'ingest_request': 25, 'elasticsearch_release': 25}) + => no success? or mixed? skip for now + + # Environmental Health Perspectives (NIH) + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky + => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] + => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] + => FIXED + => good (but slow?) + + ./fatcat_ingest.py --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "Tomsk State University" + => Expecting 578057 release objects in search queries + => Counter({'estimate': 578057, 'elasticsearch_release': 50, 'kafka': 50, 'ingest_request': 50}) + => nothing from tsu.ru? skip for now + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" + => Expecting 4602 release objects in search queries + => Counter({'estimate': 4602, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) + => good + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + => Expecting 5690 release objects in search queries + => Counter({'estimate': 5690, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) + => good + + +## Fixed OA Publishers (full runs) + + # american archivist + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Expecting 2920 release objects in search queries + Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + Expecting 42986 release objects in search queries + Counter({'estimate': 42986, 'elasticsearch_release': 42986, 'kafka': 42935, 'ingest_request': 42935}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + Expecting 43108 release objects in search queries + Counter({'estimate': 43108, 'elasticsearch_release': 43108, 'ingest_request': 41262, 'kafka': 41262}) + + # Environmental Health Perspectives (NIH) + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky + Expecting 12699 release objects in search queries + Counter({'elasticsearch_release': 12699, 'estimate': 12699, 'kafka': 12615, 'ingest_request': 12615}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" + Expecting 4602 release objects in search queries + Counter({'estimate': 4602, 'ingest_request': 4602, 'kafka': 4602, 'elasticsearch_release': 4602}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + Expecting 5690 release objects in search queries + Counter({'ingest_request': 5690, 'kafka': 5690, 'estimate': 5690, 'elasticsearch_release': 5690}) + |