From 720a45a1d9eea673e0f10d3a7dac0ca85fb913d3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 3 Mar 2020 10:24:43 -0800 Subject: update (and move) ingest notes --- notes/ingest/2020-02-04_ingest_backfills.md | 148 +++++++++++++++++ notes/ingest/2020-02-14_unpaywall_ingest.md | 60 +++++++ notes/ingest/2020-02-18_ingest_backfills.md | 42 +++++ notes/ingest/2020-02-21_ingest_backfills.md | 104 ++++++++++++ notes/ingest/2020-02-22_fixed_domain.txt | 246 ++++++++++++++++++++++++++++ notes/ingest/2020-03-02_ingests.txt | 174 ++++++++++++++++++++ notes/tasks/2020-02-04_ingest_backfills.md | 148 ----------------- notes/tasks/2020-02-18_ingest_backfills.md | 42 ----- notes/tasks/2020-02-21_ingest_backfills.md | 104 ------------ 9 files changed, 774 insertions(+), 294 deletions(-) create mode 100644 notes/ingest/2020-02-04_ingest_backfills.md create mode 100644 notes/ingest/2020-02-14_unpaywall_ingest.md create mode 100644 notes/ingest/2020-02-18_ingest_backfills.md create mode 100644 notes/ingest/2020-02-21_ingest_backfills.md create mode 100644 notes/ingest/2020-02-22_fixed_domain.txt create mode 100644 notes/ingest/2020-03-02_ingests.txt delete mode 100644 notes/tasks/2020-02-04_ingest_backfills.md delete mode 100644 notes/tasks/2020-02-18_ingest_backfills.md delete mode 100644 notes/tasks/2020-02-21_ingest_backfills.md (limited to 'notes') diff --git a/notes/ingest/2020-02-04_ingest_backfills.md b/notes/ingest/2020-02-04_ingest_backfills.md new file mode 100644 index 0000000..73a42ef --- /dev/null +++ b/notes/ingest/2020-02-04_ingest_backfills.md @@ -0,0 +1,148 @@ + + +## Using Fatcat Tool + +Want to enqueue some backfill URLs to crawl, now that SPNv2 is on the mend. 
+ +Example dry-run: + + ./fatcat_ingest.py --dry-run --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife + +Big OA from 2020 (past month): + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 158 release objects in search queries + Counter({'ingest_request': 158, 'estimate': 158, 'kafka': 158, 'elasticsearch_release': 158}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name elife + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 2312 release objects in search queries + Counter({'kafka': 2312, 'ingest_request': 2312, 'elasticsearch_release': 2312, 'estimate': 2312}) + + # note: did 100 first to test + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name plos + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 1185 release objects in search queries + Counter({'estimate': 1185, 'ingest_request': 1185, 'elasticsearch_release': 1185, 'kafka': 1185}) + + ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 89 release objects in search queries + Counter({'elasticsearch_release': 89, 'estimate': 89, 'ingest_request': 89, 'kafka': 89}) + + ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher ieee + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 499 release objects in search queries + Counter({'kafka': 499, 
'ingest_request': 499, 'estimate': 499, 'elasticsearch_release': 499}) + + ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name bmj + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 28 release objects in search queries + Counter({'elasticsearch_release': 28, 'ingest_request': 28, 'kafka': 28, 'estimate': 28}) + + ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 6225 release objects in search queries + Counter({'estimate': 6225, 'kafka': 500, 'elasticsearch_release': 500, 'ingest_request': 500}) + + ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 2920 release objects in search queries + Counter({'estimate': 2920, 'elasticsearch_release': 1001, 'ingest_request': 1000, 'kafka': 1000}) + +Hip corona virus papers: + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 5332 release objects in search queries + Counter({'estimate': 5332, 'elasticsearch_release': 2159, 'ingest_request': 2000, 'kafka': 2000}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 110 release objects in search queries + Counter({'ingest_request': 110, 'kafka': 110, 'elasticsearch_release': 110, 'estimate': 110}) + + 
./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 589 release objects in search queries + Counter({'estimate': 589, 'elasticsearch_release': 589, 'ingest_request': 552, 'kafka': 552}) + + +Mixed eLife results: + + ["wrong-mimetype",null,"https://elifesciences.org/articles/54551"] + ["success",null,"https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNTE2OTEvZWxpZmUtNTE2OTEtdjEucGRm/elife-51691-v1.pdf?_hash=Jp1cLog1NzIlU%2BvjgLdbM%2BuphOwe5QWUn%2F97tbQBNG4%3D"] + +## Re-Request Failed + +Select some failed ingest request rows to re-enqueue: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) TO '/grande/snapshots/reingest_spn2cdx_20200205.rows.json'; + -- 1536 rows + +Transform back to full requests: + + ./scripts/ingestrequest_row2json.py reingest_spn2cdx_20200205.rows.json > reingest_spn2cdx_20200205.json + +Push into kafka (on a kafka broker node): + + cat ~/reingest_spn2cdx_20200205.json | jq . 
-c | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests -p -1 + +More: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'error:%' + ) TO '/grande/snapshots/reingest_spn2err1_20200205.rows.json'; + -- COPY 1516 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-error%' + ) TO '/grande/snapshots/reingest_spn2err2_20200205.rows.json'; + -- COPY 16678 + +The next large ones to try would be `wayback-error` and `cdx-error`, though +these are pretty generic. Could go through kafka output to try and understand those +error classes better. + +Oof, as a mistake enqueued to partition 1 instead of -1 (random), so these will +take a week or more to actually process. Re-enqueued as -1; ingesting from +wayback is pretty fast, this should result in mostly wayback ingests. Caught up by +end of weekend? 
+ +## Check Coverages + +As follow-ups: + + elife: https://fatcat.wiki/container/en4qj5ijrbf5djxx7p5zzpjyoq/coverage + => 2020-02-24: 7187 / 8101 = 88% preserved + archivist: https://fatcat.wiki/container/zpobyv4vbranllc7oob56tgci4/coverage + => 85 preserved + => 2020-02-24: 85 / 3005 preserved (TODO) + jcancer: https://fatcat.wiki/container/nkkzpwht7jd3zdftc6gq4eoeey/coverage + => 2020 preserved + => 2520 preserved + => 2020-02-24: 2700 / 2766 preserved + plos: https://fatcat.wiki/container/23nqq3odsjhmbi5tqavvcn7cfm/coverage + => 2020-02-24: 7580 / 7730 = 98% preserved + diff --git a/notes/ingest/2020-02-14_unpaywall_ingest.md b/notes/ingest/2020-02-14_unpaywall_ingest.md new file mode 100644 index 0000000..df4795b --- /dev/null +++ b/notes/ingest/2020-02-14_unpaywall_ingest.md @@ -0,0 +1,60 @@ + +## Stats and Things + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt + +## Transform + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null + => 22M 1:31:25 [ 4k/s] + +Shard it into batches of roughly 1 million (all are 1098096 +/- 1): + + zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json + +Test ingest: + + head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Add a single batch like: + + cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Progress/Status + +There are 21,961,928 lines total, in batches of 1,098,097. 
+ + unpaywall_snapshot_2019-11-22.ingest_request.split_00.json + => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined) + => 2020-02-25 10:35 local: 0 + unpaywall_snapshot_2019-11-22.ingest_request.split_01.json + unpaywall_snapshot_2019-11-22.ingest_request.split_02.json + unpaywall_snapshot_2019-11-22.ingest_request.split_03.json + unpaywall_snapshot_2019-11-22.ingest_request.split_04.json + => 2020-02-25 11:26 local: 4,388,997 + => 2020-02-25 10:14 local: 1,115,821 + => 2020-02-26 16:00 local: 265,116 + unpaywall_snapshot_2019-11-22.ingest_request.split_05.json + unpaywall_snapshot_2019-11-22.ingest_request.split_06.json + unpaywall_snapshot_2019-11-22.ingest_request.split_07.json + unpaywall_snapshot_2019-11-22.ingest_request.split_08.json + unpaywall_snapshot_2019-11-22.ingest_request.split_09.json + => 2020-02-26 16:01 local: 6,843,708 + => 2020-02-26 16:31 local: 4,839,618 + => 2020-02-28 10:30 local: 2,619,319 + unpaywall_snapshot_2019-11-22.ingest_request.split_10.json + unpaywall_snapshot_2019-11-22.ingest_request.split_11.json + unpaywall_snapshot_2019-11-22.ingest_request.split_12.json + unpaywall_snapshot_2019-11-22.ingest_request.split_13.json + unpaywall_snapshot_2019-11-22.ingest_request.split_14.json + unpaywall_snapshot_2019-11-22.ingest_request.split_15.json + unpaywall_snapshot_2019-11-22.ingest_request.split_16.json + unpaywall_snapshot_2019-11-22.ingest_request.split_17.json + unpaywall_snapshot_2019-11-22.ingest_request.split_18.json + unpaywall_snapshot_2019-11-22.ingest_request.split_19.json + => 2020-02-28 10:50 local: 13,551,887 + => 2020-03-01 23:38 local: 4,521,076 + => 2020-03-02 10:45 local: 2,827,071 + => 2020-03-02 21:06 local: 1,257,176 + added about 500k bulk re-ingest to try and work around cdx errors + => 2020-03-02 21:30 local: 1,733,654 diff --git a/notes/ingest/2020-02-18_ingest_backfills.md b/notes/ingest/2020-02-18_ingest_backfills.md new file mode 100644 index 0000000..1ab18f4 --- /dev/null +++ 
b/notes/ingest/2020-02-18_ingest_backfills.md @@ -0,0 +1,42 @@ + +Select: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-error%' + ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json'; + => COPY 6537 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'wayback-error' + ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json'; + => COPY 33022 + +Transform: + + ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json + ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json + +Push to kafka: + + cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_waybackerr_20200218.json | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +Many had null `ingest_request_source`, so won't actually import into fatcat: + + bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n + 1 "savepapernow-web" + 112 "fatcat-ingest-container" + 11750 "fatcat-changelog" + 21159 null + diff --git a/notes/ingest/2020-02-21_ingest_backfills.md b/notes/ingest/2020-02-21_ingest_backfills.md new file mode 100644 index 0000000..48df910 --- /dev/null +++ b/notes/ingest/2020-02-21_ingest_backfills.md @@ -0,0 +1,104 @@ + +Follow-ups to last ingest backfill. Only run these when ingest request topic is +empty, and full persist chain has run successfully. + +## Corona virus stuff + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV + +## Large OA Publishers + +Should probably check domain stats/success for all of these first. + +Would also be good to have a "randomize" option. Could fake that by dumping to +disk first. + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier + + ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer + + # ??? 
+ ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + +## Fixed OA Publishers (small tests) + + # american archivist + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + => Expecting 2920 release objects in search queries + => Counter({'estimate': 2920, 'elasticsearch_release': 26, 'ingest_request': 25, 'kafka': 25}) + => good + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + => Expecting 42897 release objects in search queries + => Counter({'estimate': 42897, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher frontiers + => Expecting 35427 release objects in search queries + => Counter({'estimate': 35427, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) + => mixed results? + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + => Expecting 43111 release objects in search queries + => Counter({'estimate': 43111, 'elasticsearch_release': 25, 'ingest_request': 25, 'kafka': 25}) + => success, fast + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "American Heart Association" + => Expecting 185240 release objects in search queries + => Counter({'estimate': 185240, 'kafka': 25, 'ingest_request': 25, 'elasticsearch_release': 25}) + => no success? or mixed? 
skip for now + + # Environmental Health Perspectives (NIH) + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky + => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] + => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] + => FIXED + => good (but slow?) + + ./fatcat_ingest.py --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "Tomsk State University" + => Expecting 578057 release objects in search queries + => Counter({'estimate': 578057, 'elasticsearch_release': 50, 'kafka': 50, 'ingest_request': 50}) + => nothing from tsu.ru? skip for now + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" + => Expecting 4602 release objects in search queries + => Counter({'estimate': 4602, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) + => good + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + => Expecting 5690 release objects in search queries + => Counter({'estimate': 5690, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) + => good + + +## Fixed OA Publishers (full runs) + + # american archivist + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Expecting 2920 release objects in search queries + Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + Expecting 42986 release objects in search queries + Counter({'estimate': 42986, 'elasticsearch_release': 42986, 'kafka': 42935, 'ingest_request': 42935}) + + 
./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + Expecting 43108 release objects in search queries + Counter({'estimate': 43108, 'elasticsearch_release': 43108, 'ingest_request': 41262, 'kafka': 41262}) + + # Environmental Health Perspectives (NIH) + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky + Expecting 12699 release objects in search queries + Counter({'elasticsearch_release': 12699, 'estimate': 12699, 'kafka': 12615, 'ingest_request': 12615}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" + Expecting 4602 release objects in search queries + Counter({'estimate': 4602, 'ingest_request': 4602, 'kafka': 4602, 'elasticsearch_release': 4602}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + Expecting 5690 release objects in search queries + Counter({'ingest_request': 5690, 'kafka': 5690, 'estimate': 5690, 'elasticsearch_release': 5690}) + diff --git a/notes/ingest/2020-02-22_fixed_domain.txt b/notes/ingest/2020-02-22_fixed_domain.txt new file mode 100644 index 0000000..a60de42 --- /dev/null +++ b/notes/ingest/2020-02-22_fixed_domain.txt @@ -0,0 +1,246 @@ + +www.degruyter.com + + "/view/books/" didn't have citation_pdf_url, so added custom URL rule. + + Not sure why redirect-loop happening, but isn't with current live ingest + tool? 
+ + domain | status | count + -------------------+-------------------------+------- + www.degruyter.com | redirect-loop | 22023 + www.degruyter.com | no-pdf-link | 8773 + www.degruyter.com | no-capture | 8617 + www.degruyter.com | success | 840 + www.degruyter.com | link-loop | 59 + www.degruyter.com | terminal-bad-status | 23 + www.degruyter.com | wrong-mimetype | 12 + www.degruyter.com | spn-error | 4 + www.degruyter.com | spn2-cdx-lookup-failure | 4 + www.degruyter.com | spn2-error:proxy-error | 1 + www.degruyter.com | spn-remote-error | 1 + www.degruyter.com | gateway-timeout | 1 + www.degruyter.com | petabox-error | 1 + (13 rows) + +www.frontiersin.org + + no pdf link + + seems to live ingest fine? files served from "*.blob.core.windows.net" + no fix, just re-ingest. + + domain | status | count + ---------------------+-------------------------+------- + www.frontiersin.org | no-pdf-link | 17503 + www.frontiersin.org | terminal-bad-status | 6696 + www.frontiersin.org | wayback-error | 203 + www.frontiersin.org | no-capture | 20 + www.frontiersin.org | spn-error | 6 + www.frontiersin.org | gateway-timeout | 3 + www.frontiersin.org | wrong-mimetype | 3 + www.frontiersin.org | spn2-cdx-lookup-failure | 2 + www.frontiersin.org | spn2-error:job-failed | 2 + www.frontiersin.org | spn-remote-error | 1 + www.frontiersin.org | cdx-error | 1 + (11 rows) + +www.mdpi.com + + terminal-bad-status + + Seems to ingest fine live? No fix, just re-ingest. 
+ + domain | status | count + --------------+-------------------------+------- + www.mdpi.com | terminal-bad-status | 13866 + www.mdpi.com | wrong-mimetype | 2693 + www.mdpi.com | wayback-error | 513 + www.mdpi.com | redirect-loop | 505 + www.mdpi.com | success | 436 + www.mdpi.com | no-capture | 214 + www.mdpi.com | no-pdf-link | 43 + www.mdpi.com | spn2-cdx-lookup-failure | 34 + www.mdpi.com | gateway-timeout | 3 + www.mdpi.com | petabox-error | 2 + (10 rows) + +www.ahajournals.org | no-pdf-link | 5727 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.ahajournals.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.ahajournals.org%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ---------------------+----------------+------- + www.ahajournals.org | no-pdf-link | 5738 + www.ahajournals.org | wrong-mimetype | 84 + (2 rows) + + + pdf | https://doi.org/10.1161/circ.110.19.2977 | 2020-02-23 00:28:55.256296+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + pdf | https://doi.org/10.1161/str.49.suppl_1.tp403 | 2020-02-23 00:27:34.950059+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + pdf | https://doi.org/10.1161/str.49.suppl_1.tp168 | 2020-02-23 00:25:54.611271+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + pdf | https://doi.org/10.1161/jaha.119.012131 | 2020-02-23 00:24:44.244511+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + + Ah, the ol' annoying 'cookieAbsent'. Works with live SPNv2 via soft-404 + detection, but that status wasn't coming through, and needed custom + pdf-link detection. 
+ + FIXED: added pdf-link detection + +ehp.niehs.nih.gov | no-pdf-link | 5772 + + simple custom URL format. but are they also blocking? + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'ehp.niehs.nih.gov' + GROUP BY domain, status + ORDER BY COUNT DESC; + + domain | status | count + -------------------+----------------+------- + ehp.niehs.nih.gov | no-pdf-link | 5791 + ehp.niehs.nih.gov | wrong-mimetype | 11 + (2 rows) + + FIXED: mostly just slow, custom URL seems to work + +journals.tsu.ru | no-pdf-link | 4404 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'journals.tsu.ru' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%journals.tsu.ru%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + -----------------+----------------+------- + journals.tsu.ru | no-pdf-link | 4409 + journals.tsu.ru | success | 1 + journals.tsu.ru | wrong-mimetype | 1 + (3 rows) + + + pdf | https://doi.org/10.17223/18572685/57/3 | 2020-02-23 00:45:49.003593+00 | f | no-pdf-link | http://journals.tsu.ru/rusin/&journal_page=archive&id=1907&article_id=42847 | 20200213132322 | 200 | + pdf | https://doi.org/10.17223/17267080/71/4 | 2020-02-23 00:31:25.715416+00 | f | no-pdf-link | http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405 | 20200211151825 | 200 | + pdf | https://doi.org/10.17223/15617793/399/33 | 2020-02-23 00:29:45.414865+00 | f | no-pdf-link | http://journals.tsu.ru/vestnik/&journal_page=archive&id=1322&article_id=24619 | 20200208152715 | 200 | + pdf | https://doi.org/10.17223/19988613/58/15 | 2020-02-23 00:25:24.402838+00 | f | no-pdf-link | 
http://journals.tsu.ru//history/&journal_page=archive&id=1827&article_id=40501 | 20200212200320 | 200 | + + FIXED: simple new custom PDF link pattern + +www.cogentoa.com | no-pdf-link | 4282 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.cogentoa.com' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.cogentoa.com%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ------------------+-------------+------- + www.cogentoa.com | no-pdf-link | 4296 + (1 row) + + pdf | https://doi.org/10.1080/23311932.2015.1022632 | 2020-02-23 01:06:14.040013+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311932.2015.1022632 | 20200208054228 | 200 | + pdf | https://doi.org/10.1080/23322039.2020.1730079 | 2020-02-23 01:04:53.754117+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23322039.2020.1730079 | 20200223010431 | 200 | + pdf | https://doi.org/10.1080/2331186x.2018.1460901 | 2020-02-23 01:04:03.47563+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/2331186X.2018.1460901 | 20200207200958 | 200 | + pdf | https://doi.org/10.1080/23311975.2017.1412873 | 2020-02-23 01:03:08.063545+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311975.2017.1412873 | 20200209034602 | 200 | + pdf | https://doi.org/10.1080/23311916.2017.1293481 | 2020-02-23 01:02:42.868424+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311916.2017.1293481 | 20200208101623 | 200 | + + FIXED: simple custom URL-based pattern + +chemrxiv.org | no-pdf-link | 4186 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'chemrxiv.org' + GROUP BY domain, status + ORDER BY COUNT 
DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%chemrxiv.org%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + --------------+-------------------------+------- + chemrxiv.org | no-pdf-link | 4202 + chemrxiv.org | wrong-mimetype | 64 + chemrxiv.org | wayback-error | 14 + chemrxiv.org | success | 12 + chemrxiv.org | terminal-bad-status | 4 + chemrxiv.org | spn2-cdx-lookup-failure | 1 + + pdf | https://doi.org/10.26434/chemrxiv.9912812.v1 | 2020-02-23 01:08:34.585084+00 | f | no-pdf-link | https://chemrxiv.org/articles/Proximity_Effect_in_Crystalline_Framework_Materials_Stacking-Induced_Functionality_in_MOFs_and_COFs/9912812/1 | 20200215072929 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.7150097 | 2020-02-23 01:05:48.957624+00 | f | no-pdf-link | https://chemrxiv.org/articles/Systematic_Engineering_of_a_Protein_Nanocage_for_High-Yield_Site-Specific_Modification/7150097 | 20200213002430 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.7833500.v1 | 2020-02-23 00:55:41.013109+00 | f | no-pdf-link | https://chemrxiv.org/articles/Formation_of_Neutral_Peptide_Aggregates_Studied_by_Mass_Selective_IR_Action_Spectroscopy/7833500/1 | 20200210131343 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.8146103 | 2020-02-23 00:52:00.193328+00 | f | no-pdf-link | https://chemrxiv.org/articles/On-Demand_Guest_Release_from_MOF-5_Sealed_with_Nitrophenylacetic_Acid_Photocapping_Groups/8146103 | 20200207215449 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.10101419 | 2020-02-23 00:46:14.086913+00 | f | no-pdf-link | https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419 | 20200214044153 | 200 | + + FIXED: complex JSON PDF url extraction; maybe for all figshare? + +TODO: +x many datacite prefixes go to IRs, but have is_oa:false. 
we should probably crawl by default based on release_type + => fatcat branch bnewbold-more-ingest +- re-ingest all degruyter (doi_prefix:10.1515) + 1456169 doi:10.1515\/* + 89942 doi:10.1515\/* is_oa:true + 36350 doi:10.1515\/* in_ia:false is_oa:true + 1290830 publisher:Gruyter + 88944 publisher:Gruyter is_oa:true + 40034 publisher:Gruyter is_oa:true in_ia:false +- re-ingest all frontiersin + 248165 publisher:frontiers + 161996 publisher:frontiers is_oa:true + 36093 publisher:frontiers is_oa:true in_ia:false + 121001 publisher:frontiers in_ia:false +- re-ingest all mdpi + 43114 publisher:mdpi is_oa:true in_ia:false +- re-ingest all ahajournals.org + 132000 doi:10.1161\/* + 6606 doi:10.1161\/* in_ia:false is_oa:true + 81349 publisher:"American Heart Association" + 5986 publisher:"American Heart Association" is_oa:true in_ia:false +- re-ingest all ehp.niehs.nih.gov + 25522 doi:10.1289\/* + 15315 publisher:"Environmental Health Perspectives" + 8779 publisher:"Environmental Health Perspectives" in_ia:false + 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true +- re-ingest all journals.tsu.ru + 12232 publisher:"Tomsk State University" + 11668 doi:10.17223\/* + 4861 publisher:"Tomsk State University" in_ia:false is_oa:true +- re-ingest all www.cogentoa.com + 3421898 doi:10.1080\/* + 4602 journal:cogent is_oa:true in_ia:false + 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain) +- re-ingest chemrxiv + 8281 doi:10.26434\/chemrxiv* + 6918 doi:10.26434\/chemrxiv* in_ia:false + +Submit all the above with limits of 1000, then follow up later to check that +there was success? + diff --git a/notes/ingest/2020-03-02_ingests.txt b/notes/ingest/2020-03-02_ingests.txt new file mode 100644 index 0000000..e98ef33 --- /dev/null +++ b/notes/ingest/2020-03-02_ingests.txt @@ -0,0 +1,174 @@ + +## protocols.io + +Tested that single ingest is working, and they fixed PDF format on their end +recently. 
+ + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --name protocols.io + => Expecting 8448 release objects in search queries + => Counter({'estimate': 8448, 'kafka': 8448, 'ingest_request': 8448, 'elasticsearch_release': 8448}) + +## backfill follow-ups + +- re-ingest all degruyter (doi_prefix:10.1515) + 89942 doi:10.1515\/* is_oa:true + 36350 doi:10.1515\/* in_ia:false is_oa:true + 40034 publisher:Gruyter is_oa:true in_ia:false + => update: + 135926 doi:10.1515\/* is_oa:true + 50544 doi:10.1515\/* in_ia:false is_oa:true + 54880 publisher:Gruyter is_oa:true in_ia:false +- re-ingest all frontiersin + 36093 publisher:frontiers is_oa:true in_ia:false + => update + 22444 publisher:frontiers is_oa:true in_ia:false + 22029 doi_prefix:10.3389 is_oa:true in_ia:false + + select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3389/%' group by status order by count(*) desc; + + status | count + -------------------------------------+------- + success | 34721 + no-pdf-link | 18157 + terminal-bad-status | 6799 + cdx-error | 1805 + wayback-error | 333 + no-capture | 301 + [...] 
+ + select * from ingest_file_result where base_url like 'https://doi.org/10.17723/aarc%' and status = 'no-pdf-link' order by updated desc limit 100; + +- re-ingest all mdpi + 43114 publisher:mdpi is_oa:true in_ia:false + => update + 8548 publisher:mdpi is_oa:true in_ia:false + + select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3390/%' group by status order by count(*) desc; + status | count + -------------------------------------+-------- + success | 108971 + cdx-error | 6655 + wrong-mimetype | 3359 + terminal-bad-status | 1299 + wayback-error | 151 + spn2-cdx-lookup-failure | 87 + + => added hack for gzip content-encoding coming through pdf fetch + => will re-ingest all after pushing fix + +- re-ingest all ahajournals.org + 132000 doi:10.1161\/* + 6606 doi:10.1161\/* in_ia:false is_oa:true + 81349 publisher:"American Heart Association" + 5986 publisher:"American Heart Association" is_oa:true in_ia:false + => update + 1337 publisher:"American Heart Association" is_oa:true in_ia:false + + status | count + -------------------------------------+------- + success | 1480 + cdx-error | 1176 + spn2-cdx-lookup-failure | 514 + no-pdf-link | 85 + wayback-error | 25 + spn2-error:job-failed | 18 + + => will re-run errors +- re-ingest all ehp.niehs.nih.gov + 25522 doi:10.1289\/* + 15315 publisher:"Environmental Health Perspectives" + 8779 publisher:"Environmental Health Perspectives" in_ia:false + 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true + => update + 7547 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true +- re-ingest all journals.tsu.ru + 12232 publisher:"Tomsk State University" + 11668 doi:10.17223\/* + 4861 publisher:"Tomsk State University" in_ia:false is_oa:true + => update + 2605 publisher:"Tomsk State University" in_ia:false is_oa:true + => just need to retry these? 
seem fine +- re-ingest all www.cogentoa.com + 3421898 doi:10.1080\/* + 4602 journal:cogent is_oa:true in_ia:false + 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain) + => update + 254 journal:cogent is_oa:true in_ia:false +- re-ingest chemrxiv + 8281 doi:10.26434\/chemrxiv* + 6918 doi:10.26434\/chemrxiv* in_ia:false + => update + 4890 doi:10.26434\/chemrxiv* in_ia:false + => re-ingest + => allow non-OA + + # american archivist + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911}) + => 2020-02-04: 85 / 3,005 + => 2020-03-02: 2,182 / 3,005 preserved. some no-pdf-link, otherwise just a bunch of spn2-error + => looks like the no-pdf-url due to pinnacle-secure.allenpress.com soft-blocking loop + + +## backfill re-ingests + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl container --container-id zpobyv4vbranllc7oob56tgci4 + => Counter({'elasticsearch_release': 823, 'estimate': 823, 'ingest_request': 814, 'kafka': 814}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + => Counter({'elasticsearch_release': 54880, 'estimate': 54880, 'kafka': 51497, 'ingest_request': 51497}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query 'publisher:"Tomsk State University"' + => Counter({'ingest_request': 2605, 'kafka': 2605, 'elasticsearch_release': 2605, 'estimate': 2605}) + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + => Counter({'estimate': 8548, 
'elasticsearch_release': 8548, 'ingest_request': 6693, 'kafka': 6693}) + => NOTE: about 2k not enqueued + +## re-ingest all broken + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '1 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-%' + ) TO '/grande/snapshots/reingest_spn2_20200302.rows.json'; + => COPY 14849 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + ) TO '/grande/snapshots/reingest_cdxerr_20200302.rows.json'; + => COPY 507610 + + This is a huge number! Re-ingest via bulk? + +Transform: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2_20200302.rows.json > reingest_spn2_20200302.json + ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdxerr_20200302.rows.json > reingest_cdxerr_20200302.json + +Push to kafka: + + cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + # accidentally also piped the above through ingest-file-requests-bulk... + # which could actually be bad + cat reingest_cdxerr_20200302.json | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## biorxiv/medrxiv + + 8026 doi:10.1101\/20* + 2159 doi:10.1101\/20* in_ia:false + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 'doi:10.1101\/20* in_ia:false' + => Counter({'estimate': 2159, 'ingest_request': 2159, 'elasticsearch_release': 2159, 'kafka': 2159}) + diff --git a/notes/tasks/2020-02-04_ingest_backfills.md b/notes/tasks/2020-02-04_ingest_backfills.md deleted file mode 100644 index 73a42ef..0000000 --- a/notes/tasks/2020-02-04_ingest_backfills.md +++ /dev/null @@ -1,148 +0,0 @@ - - -## Using Fatcat Tool - -Want to enqueue some backfill URLs to crawl, now that SPNv2 is on the mend. - -Example dry-run: - - ./fatcat_ingest.py --dry-run --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife - -Big OA from 2020 (past month): - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 158 release objects in search queries - Counter({'ingest_request': 158, 'estimate': 158, 'kafka': 158, 'elasticsearch_release': 158}) - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name elife - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 2312 release objects in search queries - Counter({'kafka': 2312, 'ingest_request': 2312, 'elasticsearch_release': 2312, 'estimate': 2312}) - - # note: did 100 first to test - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name plos - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 1185 release objects in search queries - 
Counter({'estimate': 1185, 'ingest_request': 1185, 'elasticsearch_release': 1185, 'kafka': 1185}) - - ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 89 release objects in search queries - Counter({'elasticsearch_release': 89, 'estimate': 89, 'ingest_request': 89, 'kafka': 89}) - - ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher ieee - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 499 release objects in search queries - Counter({'kafka': 499, 'ingest_request': 499, 'estimate': 499, 'elasticsearch_release': 499}) - - ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name bmj - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 28 release objects in search queries - Counter({'elasticsearch_release': 28, 'ingest_request': 28, 'kafka': 28, 'estimate': 28}) - - ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 6225 release objects in search queries - Counter({'estimate': 6225, 'kafka': 500, 'elasticsearch_release': 500, 'ingest_request': 500}) - - ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 2920 release objects in search queries - Counter({'estimate': 2920, 'elasticsearch_release': 1001, 'ingest_request': 1000, 
'kafka': 1000}) - -Hip corona virus papers: - - ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 5332 release objects in search queries - Counter({'estimate': 5332, 'elasticsearch_release': 2159, 'ingest_request': 2000, 'kafka': 2000}) - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 110 release objects in search queries - Counter({'ingest_request': 110, 'kafka': 110, 'elasticsearch_release': 110, 'estimate': 110}) - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV - Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests - Expecting 589 release objects in search queries - Counter({'estimate': 589, 'elasticsearch_release': 589, 'ingest_request': 552, 'kafka': 552}) - - -Mixed eLife results: - - ["wrong-mimetype",null,"https://elifesciences.org/articles/54551"] - ["success",null,"https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNTE2OTEvZWxpZmUtNTE2OTEtdjEucGRm/elife-51691-v1.pdf?_hash=Jp1cLog1NzIlU%2BvjgLdbM%2BuphOwe5QWUn%2F97tbQBNG4%3D"] - -## Re-Request Failed - -Select some failed injest request rows to re-enqueue: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL - AND ingest_file_result.hit = false - AND ingest_file_result.status = 'spn2-cdx-lookup-failure' - ) TO '/grande/snapshots/reingest_spn2cdx_20200205.rows.json'; - -- 1536 rows - 
-Transform back to full requests: - - ./scripts/ingestrequest_row2json.py reingest_spn2cdx_20200205.rows.json > reingest_spn2cdx_20200205.json - -Push into kafka (on a kafka broker node): - - cat ~/reingest_spn2cdx_20200205.json | jq . -c | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests -p -1 - -More: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'error:%' - ) TO '/grande/snapshots/reingest_spn2err1_20200205.rows.json'; - -- COPY 1516 - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'spn2-error%' - ) TO '/grande/snapshots/reingest_spn2err2_20200205.rows.json'; - -- COPY 16678 - -The next large ones to try would be `wayback-error` and `cdx-error`, though -these are pretty generic. Could go kafka output to try and understand those -error classes better. - -Oof, as a mistake enqueued to partition 1 instead of -1 (random), so these will -take a week or more to actually process. Re-enqueued as -1; ingesting from -wayback is pretty fast, this should result mostly wayback ingests. Caught up by -end of weekend? 
- -## Check Coverages - -As follow-ups: - - elife: https://fatcat.wiki/container/en4qj5ijrbf5djxx7p5zzpjyoq/coverage - => 2020-02-24: 7187 / 8101 = 88% preserved - archivist: https://fatcat.wiki/container/zpobyv4vbranllc7oob56tgci4/coverage - => 85 preserved - => 2020-02-24: 85 / 3005 preserved (TODO) - jcancer: https://fatcat.wiki/container/nkkzpwht7jd3zdftc6gq4eoeey/coverage - => 2020 preserved - => 2520 preserved - => 2020-02-24: 2700 / 2766 preserved - plos: https://fatcat.wiki/container/23nqq3odsjhmbi5tqavvcn7cfm/coverage - => 2020-02-24: 7580 / 7730 = 98% preserved - diff --git a/notes/tasks/2020-02-18_ingest_backfills.md b/notes/tasks/2020-02-18_ingest_backfills.md deleted file mode 100644 index 1ab18f4..0000000 --- a/notes/tasks/2020-02-18_ingest_backfills.md +++ /dev/null @@ -1,42 +0,0 @@ - -Select: - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'spn2-error%' - ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json'; - => COPY 6537 - - COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'wayback-error' - ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json'; - => COPY 33022 - -Transform: - - ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json - ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json - -Push to kafka: - - cat 
reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 - cat reingest_waybackerr_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 - -Many had null `ingest_request_source`, so won't actually import into fatcat: - - bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n - 1 "savepapernow-web" - 112 "fatcat-ingest-container" - 11750 "fatcat-changelog" - 21159 null - diff --git a/notes/tasks/2020-02-21_ingest_backfills.md b/notes/tasks/2020-02-21_ingest_backfills.md deleted file mode 100644 index 48df910..0000000 --- a/notes/tasks/2020-02-21_ingest_backfills.md +++ /dev/null @@ -1,104 +0,0 @@ - -Follow-ups to last ingest backfill. Only run these when ingest request topic is -empty, and full persist chain has run successfully. - -## Corona virus stuff - - ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV - -## Large OA Publishers - -Should probably check domain stats/success for all of these first. - -Would also be good to have a "randomize" option. Could fake that by dumping to -disk first. - - ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier - - ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer - - # ??? 
- ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 - -## Fixed OA Publishers (small tests) - - # american archivist - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 - => Expecting 2920 release objects in search queries - => Counter({'estimate': 2920, 'elasticsearch_release': 26, 'ingest_request': 25, 'kafka': 25}) - => good - - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter - => Expecting 42897 release objects in search queries - => Counter({'estimate': 42897, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) - - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher frontiers - => Expecting 35427 release objects in search queries - => Counter({'estimate': 35427, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) - => mixed results? - - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi - => Expecting 43111 release objects in search queries - => Counter({'estimate': 43111, 'elasticsearch_release': 25, 'ingest_request': 25, 'kafka': 25}) - => success, fast - - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "American Heart Association" - => Expecting 185240 release objects in search queries - => Counter({'estimate': 185240, 'kafka': 25, 'ingest_request': 25, 'elasticsearch_release': 25}) - => no success? or mixed? 
skip for now - - # Environmental Health Perspectives (NIH) - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky - => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] - => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] - => FIXED - => good (but slow?) - - ./fatcat_ingest.py --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "Tomsk State University" - => Expecting 578057 release objects in search queries - => Counter({'estimate': 578057, 'elasticsearch_release': 50, 'kafka': 50, 'ingest_request': 50}) - => nothing from tsu.ru? skip for now - - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" - => Expecting 4602 release objects in search queries - => Counter({'estimate': 4602, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) - => good - - ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" - => Expecting 5690 release objects in search queries - => Counter({'estimate': 5690, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) - => good - - -## Fixed OA Publishers (full runs) - - # american archivist - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 - Expecting 2920 release objects in search queries - Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911}) - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter - Expecting 42986 release objects in search queries - Counter({'estimate': 42986, 'elasticsearch_release': 42986, 'kafka': 42935, 'ingest_request': 42935}) - - 
./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi - Expecting 43108 release objects in search queries - Counter({'estimate': 43108, 'elasticsearch_release': 43108, 'ingest_request': 41262, 'kafka': 41262}) - - # Environmental Health Perspectives (NIH) - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky - Expecting 12699 release objects in search queries - Counter({'elasticsearch_release': 12699, 'estimate': 12699, 'kafka': 12615, 'ingest_request': 12615}) - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" - Expecting 4602 release objects in search queries - Counter({'estimate': 4602, 'ingest_request': 4602, 'kafka': 4602, 'elasticsearch_release': 4602}) - - ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" - Expecting 5690 release objects in search queries - Counter({'ingest_request': 5690, 'kafka': 5690, 'estimate': 5690, 'elasticsearch_release': 5690}) - -- cgit v1.2.3