diff options
Diffstat (limited to 'notes/ingest')
35 files changed, 9232 insertions, 0 deletions
diff --git a/notes/ingest/.gitignore b/notes/ingest/.gitignore new file mode 100644 index 0000000..343a25c --- /dev/null +++ b/notes/ingest/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.json diff --git a/notes/ingest/2019-10-23_testing.md b/notes/ingest/2019-10-23_testing.md new file mode 100644 index 0000000..481c4e2 --- /dev/null +++ b/notes/ingest/2019-10-23_testing.md @@ -0,0 +1,8 @@ + +exported not-archived DOIs for elife, as well as general list. + + wc -l recent\ missing\ oa\ releases.csv + 161828 recent missing oa releases.csv + + wc -l missing\ elife\ DOIs.csv + 1779 missing elife DOIs.csv diff --git a/notes/ingest/2020-01-14_bulk.md b/notes/ingest/2020-01-14_bulk.md new file mode 100644 index 0000000..9d05cda --- /dev/null +++ b/notes/ingest/2020-01-14_bulk.md @@ -0,0 +1,26 @@ + +Generate ingest requests from arabesque: + + zcat /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source arxiv --extid-type arxiv --release-stage submitted - | shuf > /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json + + zcat /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.json.gz | ./arabesque2ingestrequest.py --link-source pmc --extid-type pmcid - | shuf > /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json + + +Quick tests locally: + + time head -n100 /data/arabesque/ARXIV-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_arxiv.json + time head -n100 /data/arabesque/PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json |./ingest_file.py requests - > sample_pubmed.json + +These are all wayback success; looking good! Single threaded, from home laptop +(over tunnel), took about 9 minutes, or 5.5sec/pdf. That's pretty slow even +with 30x parallelism. Should re-test on actual server. GROBID pre-check should +help? 
+ +With new bulk topic: + + head PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json -n1000 | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Ok, let them rip: + + cat PUBMEDCENTRAL-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ARXIV-CRAWL-2019-10.arabesque.ingest_request.json | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2020-02-04_ingest_backfills.md b/notes/ingest/2020-02-04_ingest_backfills.md new file mode 100644 index 0000000..73a42ef --- /dev/null +++ b/notes/ingest/2020-02-04_ingest_backfills.md @@ -0,0 +1,148 @@ + + +## Using Fatcat Tool + +Want to enqueue some backfill URLs to crawl, now that SPNv2 is on the mend. + +Example dry-run: + + ./fatcat_ingest.py --dry-run --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife + +Big OA from 2020 (past month): + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name elife + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 158 release objects in search queries + Counter({'ingest_request': 158, 'estimate': 158, 'kafka': 158, 'elasticsearch_release': 158}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name elife + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 2312 release objects in search queries + Counter({'kafka': 2312, 'ingest_request': 2312, 'elasticsearch_release': 2312, 'estimate': 2312}) + + # note: did 100 first to test + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name plos + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 1185 
release objects in search queries + Counter({'estimate': 1185, 'ingest_request': 1185, 'elasticsearch_release': 1185, 'kafka': 1185}) + + ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 89 release objects in search queries + Counter({'elasticsearch_release': 89, 'estimate': 89, 'ingest_request': 89, 'kafka': 89}) + + ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher ieee + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 499 release objects in search queries + Counter({'kafka': 499, 'ingest_request': 499, 'estimate': 499, 'elasticsearch_release': 499}) + + ./fatcat_ingest.py --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --name bmj + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 28 release objects in search queries + Counter({'elasticsearch_release': 28, 'ingest_request': 28, 'kafka': 28, 'estimate': 28}) + + ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 6225 release objects in search queries + Counter({'estimate': 6225, 'kafka': 500, 'elasticsearch_release': 500, 'ingest_request': 500}) + + ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 2920 release objects in search queries + Counter({'estimate': 2920, 
'elasticsearch_release': 1001, 'ingest_request': 1000, 'kafka': 1000}) + +Hip corona virus papers: + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 5332 release objects in search queries + Counter({'estimate': 5332, 'elasticsearch_release': 2159, 'ingest_request': 2000, 'kafka': 2000}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 110 release objects in search queries + Counter({'ingest_request': 110, 'kafka': 110, 'elasticsearch_release': 110, 'estimate': 110}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV + Will send ingest requests to kafka topic: sandcrawler-prod.ingest-file-requests + Expecting 589 release objects in search queries + Counter({'estimate': 589, 'elasticsearch_release': 589, 'ingest_request': 552, 'kafka': 552}) + + +Mixed eLife results: + + ["wrong-mimetype",null,"https://elifesciences.org/articles/54551"] + ["success",null,"https://elifesciences.org/download/aHR0cHM6Ly9jZG4uZWxpZmVzY2llbmNlcy5vcmcvYXJ0aWNsZXMvNTE2OTEvZWxpZmUtNTE2OTEtdjEucGRm/elife-51691-v1.pdf?_hash=Jp1cLog1NzIlU%2BvjgLdbM%2BuphOwe5QWUn%2F97tbQBNG4%3D"] + +## Re-Request Failed + +Select some failed ingest request rows to re-enqueue: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) TO 
'/grande/snapshots/reingest_spn2cdx_20200205.rows.json'; + -- 1536 rows + +Transform back to full requests: + + ./scripts/ingestrequest_row2json.py reingest_spn2cdx_20200205.rows.json > reingest_spn2cdx_20200205.json + +Push into kafka (on a kafka broker node): + + cat ~/reingest_spn2cdx_20200205.json | jq . -c | kafkacat -P -b localhost -t sandcrawler-prod.ingest-file-requests -p -1 + +More: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'error:%' + ) TO '/grande/snapshots/reingest_spn2err1_20200205.rows.json'; + -- COPY 1516 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-error%' + ) TO '/grande/snapshots/reingest_spn2err2_20200205.rows.json'; + -- COPY 16678 + +The next large ones to try would be `wayback-error` and `cdx-error`, though +these are pretty generic. Could go through kafka output to try and understand those +error classes better. + +Oof, by mistake enqueued to partition 1 instead of -1 (random), so these will +take a week or more to actually process. Re-enqueued as -1; ingesting from +wayback is pretty fast, this should result in mostly wayback ingests. Caught up by +end of weekend? 
+ +## Check Coverages + +As follow-ups: + + elife: https://fatcat.wiki/container/en4qj5ijrbf5djxx7p5zzpjyoq/coverage + => 2020-02-24: 7187 / 8101 = 88% preserved + archivist: https://fatcat.wiki/container/zpobyv4vbranllc7oob56tgci4/coverage + => 85 preserved + => 2020-02-24: 85 / 3005 preserved (TODO) + jcancer: https://fatcat.wiki/container/nkkzpwht7jd3zdftc6gq4eoeey/coverage + => 2020 preserved + => 2520 preserved + => 2020-02-24: 2700 / 2766 preserved + plos: https://fatcat.wiki/container/23nqq3odsjhmbi5tqavvcn7cfm/coverage + => 2020-02-24: 7580 / 7730 = 98% preserved + diff --git a/notes/ingest/2020-02-18_ingest_backfills.md b/notes/ingest/2020-02-18_ingest_backfills.md new file mode 100644 index 0000000..1ab18f4 --- /dev/null +++ b/notes/ingest/2020-02-18_ingest_backfills.md @@ -0,0 +1,42 @@ + +Select: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '2 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-error%' + ) TO '/grande/snapshots/reingest_spn2err_20200218.rows.json'; + => COPY 6537 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'wayback-error' + ) TO '/grande/snapshots/reingest_waybackerr_20200218.rows.json'; + => COPY 33022 + +Transform: + + ./scripts/ingestrequest_row2json.py reingest_spn2err_20200218.rows.json > reingest_spn2err_20200218.json + ./scripts/ingestrequest_row2json.py reingest_waybackerr_20200218.rows.json > reingest_waybackerr_20200218.json + +Push to kafka: + + cat 
reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_waybackerr_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +Many had null `ingest_request_source`, so won't actually import into fatcat: + + bnewbold@ia601101$ cat reingest_waybackerr_20200218.json | jq .ingest_request_source | sort | uniq -c | sort -n + 1 "savepapernow-web" + 112 "fatcat-ingest-container" + 11750 "fatcat-changelog" + 21159 null + diff --git a/notes/ingest/2020-02-21_ingest_backfills.md b/notes/ingest/2020-02-21_ingest_backfills.md new file mode 100644 index 0000000..48df910 --- /dev/null +++ b/notes/ingest/2020-02-21_ingest_backfills.md @@ -0,0 +1,104 @@ + +Follow-ups to last ingest backfill. Only run these when ingest request topic is +empty, and full persist chain has run successfully. + +## Corona virus stuff + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query coronavirus + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 2019-nCoV + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query MERS-CoV + +## Large OA Publishers + +Should probably check domain stats/success for all of these first. + +Would also be good to have a "randomize" option. Could fake that by dumping to +disk first. + + ./fatcat_ingest.py --limit 2000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher elsevier + + ./fatcat_ingest.py --dry-run --limit 500 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --after-year 2020 container --publisher springer + + # ??? 
+ ./fatcat_ingest.py --limit 1000 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + +## Fixed OA Publishers (small tests) + + # american archivist + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + => Expecting 2920 release objects in search queries + => Counter({'estimate': 2920, 'elasticsearch_release': 26, 'ingest_request': 25, 'kafka': 25}) + => good + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + => Expecting 42897 release objects in search queries + => Counter({'estimate': 42897, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher frontiers + => Expecting 35427 release objects in search queries + => Counter({'estimate': 35427, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) + => mixed results? + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + => Expecting 43111 release objects in search queries + => Counter({'estimate': 43111, 'elasticsearch_release': 25, 'ingest_request': 25, 'kafka': 25}) + => success, fast + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "American Heart Association" + => Expecting 185240 release objects in search queries + => Counter({'estimate': 185240, 'kafka': 25, 'ingest_request': 25, 'elasticsearch_release': 25}) + => no success? or mixed? 
skip for now + + # Environmental Health Perspectives (NIH) + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky + => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] + => ["no-pdf-link",null,"https://ehp.niehs.nih.gov/doi/10.1289/ehp.113-a51"] + => FIXED + => good (but slow?) + + ./fatcat_ingest.py --limit 50 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher "Tomsk State University" + => Expecting 578057 release objects in search queries + => Counter({'estimate': 578057, 'elasticsearch_release': 50, 'kafka': 50, 'ingest_request': 50}) + => nothing from tsu.ru? skip for now + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" + => Expecting 4602 release objects in search queries + => Counter({'estimate': 4602, 'kafka': 25, 'elasticsearch_release': 25, 'ingest_request': 25}) + => good + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + => Expecting 5690 release objects in search queries + => Counter({'estimate': 5690, 'ingest_request': 25, 'kafka': 25, 'elasticsearch_release': 25}) + => good + + +## Fixed OA Publishers (full runs) + + # american archivist + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Expecting 2920 release objects in search queries + Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + Expecting 42986 release objects in search queries + Counter({'estimate': 42986, 'elasticsearch_release': 42986, 'kafka': 42935, 'ingest_request': 42935}) + + 
./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + Expecting 43108 release objects in search queries + Counter({'estimate': 43108, 'elasticsearch_release': 43108, 'ingest_request': 41262, 'kafka': 41262}) + + # Environmental Health Perspectives (NIH) + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --container-id 3w6amv3ecja7fa3ext35ndpiky + Expecting 12699 release objects in search queries + Counter({'elasticsearch_release': 12699, 'estimate': 12699, 'kafka': 12615, 'ingest_request': 12615}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --name "cogent" + Expecting 4602 release objects in search queries + Counter({'estimate': 4602, 'ingest_request': 4602, 'kafka': 4602, 'elasticsearch_release': 4602}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + Expecting 5690 release objects in search queries + Counter({'ingest_request': 5690, 'kafka': 5690, 'estimate': 5690, 'elasticsearch_release': 5690}) + diff --git a/notes/ingest/2020-02-22_fixed_domain.txt b/notes/ingest/2020-02-22_fixed_domain.txt new file mode 100644 index 0000000..a60de42 --- /dev/null +++ b/notes/ingest/2020-02-22_fixed_domain.txt @@ -0,0 +1,246 @@ + +www.degruyter.com + + "/view/books/" didn't have citation_pdf_url, so added custom URL rule. + + Not sure why redirect-loop happening, but isn't with current live ingest + tool? 
+ + domain | status | count + -------------------+-------------------------+------- + www.degruyter.com | redirect-loop | 22023 + www.degruyter.com | no-pdf-link | 8773 + www.degruyter.com | no-capture | 8617 + www.degruyter.com | success | 840 + www.degruyter.com | link-loop | 59 + www.degruyter.com | terminal-bad-status | 23 + www.degruyter.com | wrong-mimetype | 12 + www.degruyter.com | spn-error | 4 + www.degruyter.com | spn2-cdx-lookup-failure | 4 + www.degruyter.com | spn2-error:proxy-error | 1 + www.degruyter.com | spn-remote-error | 1 + www.degruyter.com | gateway-timeout | 1 + www.degruyter.com | petabox-error | 1 + (13 rows) + +www.frontiersin.org + + no pdf link + + seems to live ingest fine? files served from "*.blob.core.windows.net" + no fix, just re-ingest. + + domain | status | count + ---------------------+-------------------------+------- + www.frontiersin.org | no-pdf-link | 17503 + www.frontiersin.org | terminal-bad-status | 6696 + www.frontiersin.org | wayback-error | 203 + www.frontiersin.org | no-capture | 20 + www.frontiersin.org | spn-error | 6 + www.frontiersin.org | gateway-timeout | 3 + www.frontiersin.org | wrong-mimetype | 3 + www.frontiersin.org | spn2-cdx-lookup-failure | 2 + www.frontiersin.org | spn2-error:job-failed | 2 + www.frontiersin.org | spn-remote-error | 1 + www.frontiersin.org | cdx-error | 1 + (11 rows) + +www.mdpi.com + + terminal-bad-status + + Seems to ingest fine live? No fix, just re-ingest. 
+ + domain | status | count + --------------+-------------------------+------- + www.mdpi.com | terminal-bad-status | 13866 + www.mdpi.com | wrong-mimetype | 2693 + www.mdpi.com | wayback-error | 513 + www.mdpi.com | redirect-loop | 505 + www.mdpi.com | success | 436 + www.mdpi.com | no-capture | 214 + www.mdpi.com | no-pdf-link | 43 + www.mdpi.com | spn2-cdx-lookup-failure | 34 + www.mdpi.com | gateway-timeout | 3 + www.mdpi.com | petabox-error | 2 + (10 rows) + +www.ahajournals.org | no-pdf-link | 5727 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.ahajournals.org' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.ahajournals.org%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ---------------------+----------------+------- + www.ahajournals.org | no-pdf-link | 5738 + www.ahajournals.org | wrong-mimetype | 84 + (2 rows) + + + pdf | https://doi.org/10.1161/circ.110.19.2977 | 2020-02-23 00:28:55.256296+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + pdf | https://doi.org/10.1161/str.49.suppl_1.tp403 | 2020-02-23 00:27:34.950059+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + pdf | https://doi.org/10.1161/str.49.suppl_1.tp168 | 2020-02-23 00:25:54.611271+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + pdf | https://doi.org/10.1161/jaha.119.012131 | 2020-02-23 00:24:44.244511+00 | f | no-pdf-link | https://www.ahajournals.org/action/cookieAbsent | 20200217122952 | 200 | + + Ah, the ol' annoying 'cookieAbsent'. Works with live SPNv2 via soft-404 + detection, but that status wasn't coming through, and needed custom + pdf-link detection. 
+ + FIXED: added pdf-link detection + +ehp.niehs.nih.gov | no-pdf-link | 5772 + + simple custom URL format. but are they also blocking? + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'ehp.niehs.nih.gov' + GROUP BY domain, status + ORDER BY COUNT DESC; + + domain | status | count + -------------------+----------------+------- + ehp.niehs.nih.gov | no-pdf-link | 5791 + ehp.niehs.nih.gov | wrong-mimetype | 11 + (2 rows) + + FIXED: mostly just slow, custom URL seems to work + +journals.tsu.ru | no-pdf-link | 4404 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'journals.tsu.ru' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%journals.tsu.ru%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + -----------------+----------------+------- + journals.tsu.ru | no-pdf-link | 4409 + journals.tsu.ru | success | 1 + journals.tsu.ru | wrong-mimetype | 1 + (3 rows) + + + pdf | https://doi.org/10.17223/18572685/57/3 | 2020-02-23 00:45:49.003593+00 | f | no-pdf-link | http://journals.tsu.ru/rusin/&journal_page=archive&id=1907&article_id=42847 | 20200213132322 | 200 | + pdf | https://doi.org/10.17223/17267080/71/4 | 2020-02-23 00:31:25.715416+00 | f | no-pdf-link | http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405 | 20200211151825 | 200 | + pdf | https://doi.org/10.17223/15617793/399/33 | 2020-02-23 00:29:45.414865+00 | f | no-pdf-link | http://journals.tsu.ru/vestnik/&journal_page=archive&id=1322&article_id=24619 | 20200208152715 | 200 | + pdf | https://doi.org/10.17223/19988613/58/15 | 2020-02-23 00:25:24.402838+00 | f | no-pdf-link | 
http://journals.tsu.ru//history/&journal_page=archive&id=1827&article_id=40501 | 20200212200320 | 200 | + + FIXED: simple new custom PDF link pattern + +www.cogentoa.com | no-pdf-link | 4282 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'www.cogentoa.com' + GROUP BY domain, status + ORDER BY COUNT DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%www.cogentoa.com%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + ------------------+-------------+------- + www.cogentoa.com | no-pdf-link | 4296 + (1 row) + + pdf | https://doi.org/10.1080/23311932.2015.1022632 | 2020-02-23 01:06:14.040013+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311932.2015.1022632 | 20200208054228 | 200 | + pdf | https://doi.org/10.1080/23322039.2020.1730079 | 2020-02-23 01:04:53.754117+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23322039.2020.1730079 | 20200223010431 | 200 | + pdf | https://doi.org/10.1080/2331186x.2018.1460901 | 2020-02-23 01:04:03.47563+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/2331186X.2018.1460901 | 20200207200958 | 200 | + pdf | https://doi.org/10.1080/23311975.2017.1412873 | 2020-02-23 01:03:08.063545+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311975.2017.1412873 | 20200209034602 | 200 | + pdf | https://doi.org/10.1080/23311916.2017.1293481 | 2020-02-23 01:02:42.868424+00 | f | no-pdf-link | https://www.cogentoa.com/article/10.1080/23311916.2017.1293481 | 20200208101623 | 200 | + + FIXED: simple custom URL-based pattern + +chemrxiv.org | no-pdf-link | 4186 + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain = 'chemrxiv.org' + GROUP BY domain, status + ORDER BY COUNT 
DESC; + + SELECT * FROM ingest_file_result + WHERE terminal_url LIKE '%chemrxiv.org%' + AND status = 'no-pdf-link' + ORDER BY updated DESC + LIMIT 10; + + domain | status | count + --------------+-------------------------+------- + chemrxiv.org | no-pdf-link | 4202 + chemrxiv.org | wrong-mimetype | 64 + chemrxiv.org | wayback-error | 14 + chemrxiv.org | success | 12 + chemrxiv.org | terminal-bad-status | 4 + chemrxiv.org | spn2-cdx-lookup-failure | 1 + + pdf | https://doi.org/10.26434/chemrxiv.9912812.v1 | 2020-02-23 01:08:34.585084+00 | f | no-pdf-link | https://chemrxiv.org/articles/Proximity_Effect_in_Crystalline_Framework_Materials_Stacking-Induced_Functionality_in_MOFs_and_COFs/9912812/1 | 20200215072929 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.7150097 | 2020-02-23 01:05:48.957624+00 | f | no-pdf-link | https://chemrxiv.org/articles/Systematic_Engineering_of_a_Protein_Nanocage_for_High-Yield_Site-Specific_Modification/7150097 | 20200213002430 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.7833500.v1 | 2020-02-23 00:55:41.013109+00 | f | no-pdf-link | https://chemrxiv.org/articles/Formation_of_Neutral_Peptide_Aggregates_Studied_by_Mass_Selective_IR_Action_Spectroscopy/7833500/1 | 20200210131343 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.8146103 | 2020-02-23 00:52:00.193328+00 | f | no-pdf-link | https://chemrxiv.org/articles/On-Demand_Guest_Release_from_MOF-5_Sealed_with_Nitrophenylacetic_Acid_Photocapping_Groups/8146103 | 20200207215449 | 200 | + pdf | https://doi.org/10.26434/chemrxiv.10101419 | 2020-02-23 00:46:14.086913+00 | f | no-pdf-link | https://chemrxiv.org/articles/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives_The_Hidden_Nature_of_Dasatinib/10101419 | 20200214044153 | 200 | + + FIXED: complex JSON PDF url extraction; maybe for all figshare? + +TODO: +x many datacite prefixes go to IRs, but have is_oa:false. 
we should probably crawl by default based on release_type + => fatcat branch bnewbold-more-ingest +- re-ingest all degruyter (doi_prefix:10.1515) + 1456169 doi:10.1515\/* + 89942 doi:10.1515\/* is_oa:true + 36350 doi:10.1515\/* in_ia:false is_oa:true + 1290830 publisher:Gruyter + 88944 publisher:Gruyter is_oa:true + 40034 publisher:Gruyter is_oa:true in_ia:false +- re-ingest all frontiersin + 248165 publisher:frontiers + 161996 publisher:frontiers is_oa:true + 36093 publisher:frontiers is_oa:true in_ia:false + 121001 publisher:frontiers in_ia:false +- re-ingest all mdpi + 43114 publisher:mdpi is_oa:true in_ia:false +- re-ingest all ahajournals.org + 132000 doi:10.1161\/* + 6606 doi:10.1161\/* in_ia:false is_oa:true + 81349 publisher:"American Heart Association" + 5986 publisher:"American Heart Association" is_oa:true in_ia:false +- re-ingest all ehp.niehs.nih.gov + 25522 doi:10.1289\/* + 15315 publisher:"Environmental Health Perspectives" + 8779 publisher:"Environmental Health Perspectives" in_ia:false + 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true +- re-ingest all journals.tsu.ru + 12232 publisher:"Tomsk State University" + 11668 doi:10.17223\/* + 4861 publisher:"Tomsk State University" in_ia:false is_oa:true +- re-ingest all www.cogentoa.com + 3421898 doi:10.1080\/* + 4602 journal:cogent is_oa:true in_ia:false + 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain) +- re-ingest chemrxiv + 8281 doi:10.26434\/chemrxiv* + 6918 doi:10.26434\/chemrxiv* in_ia:false + +Submit all the above with limits of 1000, then follow up later to check that +there was success? 
+ diff --git a/notes/ingest/2020-02_unpaywall.md b/notes/ingest/2020-02_unpaywall.md new file mode 100644 index 0000000..e18a2ff --- /dev/null +++ b/notes/ingest/2020-02_unpaywall.md @@ -0,0 +1,624 @@ + +## Stats and Things + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | jq .oa_locations[].url_for_pdf -r | rg -v ^null | cut -f3 -d/ | sort | uniq -c | sort -nr > top_domains.txt + +## Transform + + zcat unpaywall_snapshot_2019-11-22T074546.jsonl.gz | ./unpaywall2ingestrequest.py - | pv -l > /dev/null + => 22M 1:31:25 [ 4k/s] + +Shard it into batches of roughly 1 million (all are 1098096 +/- 1): + + zcat unpaywall_snapshot_2019-11-22.ingest_request.shuf.json.gz | split -n r/20 -d - unpaywall_snapshot_2019-11-22.ingest_request.split_ --additional-suffix=.json + +Test ingest: + + head -n200 unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Add a single batch like: + + cat unpaywall_snapshot_2019-11-22.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Progress/Status + +There are 21,961,928 lines total, in batches of 1,098,097. 
+ + unpaywall_snapshot_2019-11-22.ingest_request.split_00.json + => 2020-02-24 21:05 local: 1,097,523 ~22 results/sec (combined) + => 2020-02-25 10:35 local: 0 + unpaywall_snapshot_2019-11-22.ingest_request.split_01.json + unpaywall_snapshot_2019-11-22.ingest_request.split_02.json + unpaywall_snapshot_2019-11-22.ingest_request.split_03.json + unpaywall_snapshot_2019-11-22.ingest_request.split_04.json + => 2020-02-25 11:26 local: 4,388,997 + => 2020-02-25 10:14 local: 1,115,821 + => 2020-02-26 16:00 local: 265,116 + unpaywall_snapshot_2019-11-22.ingest_request.split_05.json + unpaywall_snapshot_2019-11-22.ingest_request.split_06.json + unpaywall_snapshot_2019-11-22.ingest_request.split_07.json + unpaywall_snapshot_2019-11-22.ingest_request.split_08.json + unpaywall_snapshot_2019-11-22.ingest_request.split_09.json + => 2020-02-26 16:01 local: 6,843,708 + => 2020-02-26 16:31 local: 4,839,618 + => 2020-02-28 10:30 local: 2,619,319 + unpaywall_snapshot_2019-11-22.ingest_request.split_10.json + unpaywall_snapshot_2019-11-22.ingest_request.split_11.json + unpaywall_snapshot_2019-11-22.ingest_request.split_12.json + unpaywall_snapshot_2019-11-22.ingest_request.split_13.json + unpaywall_snapshot_2019-11-22.ingest_request.split_14.json + unpaywall_snapshot_2019-11-22.ingest_request.split_15.json + unpaywall_snapshot_2019-11-22.ingest_request.split_16.json + unpaywall_snapshot_2019-11-22.ingest_request.split_17.json + unpaywall_snapshot_2019-11-22.ingest_request.split_18.json + unpaywall_snapshot_2019-11-22.ingest_request.split_19.json + => 2020-02-28 10:50 local: 13,551,887 + => 2020-03-01 23:38 local: 4,521,076 + => 2020-03-02 10:45 local: 2,827,071 + => 2020-03-02 21:06 local: 1,257,176 + added about 500k bulk re-ingest to try and work around cdx errors + => 2020-03-02 21:30 local: 1,733,654 + +## Investigate Failures + +Guessing that some domains are ultimately going to need direct "recrawl" via +SPNv2. 
+ + -- top domain failures for unpaywall GWB history ingest + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + watermark.silverchair.com | terminal-bad-status | 258432 + www.tandfonline.com | no-pdf-link | 203873 + journals.sagepub.com | no-pdf-link | 126317 + iopscience.iop.org | terminal-bad-status | 112526 + files-journal-api.frontiersin.org | terminal-bad-status | 112499 + pubs.acs.org | no-pdf-link | 94772 + www.degruyter.com | redirect-loop | 89801 + www.ahajournals.org | no-pdf-link | 84025 + society.kisti.re.kr | no-pdf-link | 72849 + www.nature.com | redirect-loop | 53575 + babel.hathitrust.org | terminal-bad-status | 41063 + www.ncbi.nlm.nih.gov | redirect-loop | 40363 + scialert.net | no-pdf-link | 38340 + www.degruyter.com | terminal-bad-status | 34913 + www.journal.csj.jp | no-pdf-link | 30881 + espace.library.uq.edu.au | redirect-loop | 24570 + www.jci.org | redirect-loop | 24409 + aip.scitation.org | wrong-mimetype | 22144 + www.vr-elibrary.de | no-pdf-link | 17436 + www.biorxiv.org | wrong-mimetype | 15524 + ajph.aphapublications.org | no-pdf-link | 15083 + zookeys.pensoft.net | redirect-loop | 14867 + dialnet.unirioja.es | redirect-loop | 14486 + asa.scitation.org | wrong-mimetype | 14261 + www.nrcresearchpress.com | no-pdf-link | 14254 + dl.acm.org | redirect-loop | 14223 + 
osf.io | redirect-loop | 14103 + www.oecd-ilibrary.org | redirect-loop | 12835 + journals.sagepub.com | redirect-loop | 12229 + iopscience.iop.org | redirect-loop | 11825 + (30 rows) + + -- top no-capture terminal domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + => very few from any domain, interesting. Guess many of these are URLs that have truly never been crawled + + -- top no-capture base domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+------------+-------- + academic.oup.com | no-capture | 429888 + www.nature.com | no-capture | 273825 + dergipark.org.tr | no-capture | 119847 + www.biodiversitylibrary.org | no-capture | 110220 + escholarship.org | no-capture | 106307 + onlinelibrary.wiley.com | no-capture | 89771 + journals.sagepub.com | no-capture | 79297 + www.cell.com | no-capture | 64242 + 
deepblue.lib.umich.edu | no-capture | 58080 + babel.hathitrust.org | no-capture | 52286 + hal.archives-ouvertes.fr | no-capture | 48549 + iopscience.iop.org | no-capture | 42591 + dash.harvard.edu | no-capture | 40767 + www.tandfonline.com | no-capture | 40638 + discovery.ucl.ac.uk | no-capture | 40633 + www.jstage.jst.go.jp | no-capture | 39780 + www.doiserbia.nb.rs | no-capture | 39261 + dspace.mit.edu | no-capture | 37703 + zookeys.pensoft.net | no-capture | 34562 + repositorio.unesp.br | no-capture | 34437 + ashpublications.org | no-capture | 34112 + www.cambridge.org | no-capture | 33959 + kclpure.kcl.ac.uk | no-capture | 31455 + society.kisti.re.kr | no-capture | 30427 + pure.mpg.de | no-capture | 27650 + download.atlantis-press.com | no-capture | 27253 + dialnet.unirioja.es | no-capture | 26886 + link.springer.com | no-capture | 26257 + www.valueinhealthjournal.com | no-capture | 24798 + dspace.library.uu.nl | no-capture | 23234 + (30 rows) + + -- top no-capture base domains + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+------------+-------- + academic.oup.com | no-capture | 429888 + www.nature.com | no-capture | 273825 + dergipark.org.tr | no-capture | 119847 + www.biodiversitylibrary.org | no-capture | 110220 + escholarship.org | no-capture | 106307 + onlinelibrary.wiley.com | no-capture | 89771 + journals.sagepub.com | no-capture | 79297 + www.cell.com 
| no-capture | 64242 + deepblue.lib.umich.edu | no-capture | 58080 + babel.hathitrust.org | no-capture | 52286 + hal.archives-ouvertes.fr | no-capture | 48549 + iopscience.iop.org | no-capture | 42591 + dash.harvard.edu | no-capture | 40767 + www.tandfonline.com | no-capture | 40638 + discovery.ucl.ac.uk | no-capture | 40633 + www.jstage.jst.go.jp | no-capture | 39780 + www.doiserbia.nb.rs | no-capture | 39261 + dspace.mit.edu | no-capture | 37703 + zookeys.pensoft.net | no-capture | 34562 + repositorio.unesp.br | no-capture | 34437 + ashpublications.org | no-capture | 34112 + www.cambridge.org | no-capture | 33959 + kclpure.kcl.ac.uk | no-capture | 31455 + society.kisti.re.kr | no-capture | 30427 + pure.mpg.de | no-capture | 27650 + download.atlantis-press.com | no-capture | 27253 + dialnet.unirioja.es | no-capture | 26886 + link.springer.com | no-capture | 26257 + www.valueinhealthjournal.com | no-capture | 24798 + dspace.library.uu.nl | no-capture | 23234 + (30 rows) + + -- how many ingest requests not crawled at all? 
+ SELECT count(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status IS NULL; + => 0 + + -- "cookie absent" terminal pages, by domain + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + --------------------------------+----------------+-------- + journals.sagepub.com | no-pdf-link | 126295 + www.tandfonline.com | no-pdf-link | 116690 + pubs.acs.org | no-pdf-link | 94619 + www.ahajournals.org | no-pdf-link | 84016 + www.journal.csj.jp | no-pdf-link | 30881 + aip.scitation.org | wrong-mimetype | 22143 + www.vr-elibrary.de | no-pdf-link | 17436 + ajph.aphapublications.org | no-pdf-link | 15080 + asa.scitation.org | wrong-mimetype | 14261 + www.nrcresearchpress.com | no-pdf-link | 14253 + journals.ametsoc.org | no-pdf-link | 10500 + www.journals.uchicago.edu | no-pdf-link | 6917 + www.icevirtuallibrary.com | no-pdf-link | 6484 + www.journals.uchicago.edu | wrong-mimetype | 6191 + www.healthaffairs.org | no-pdf-link | 5732 + pubsonline.informs.org | no-pdf-link | 5672 + pinnacle-secure.allenpress.com | no-pdf-link | 5013 + www.worldscientific.com | no-pdf-link | 
4560 + www.ajronline.org | wrong-mimetype | 4523 + ehp.niehs.nih.gov | no-pdf-link | 4514 + www.future-science.com | no-pdf-link | 4091 + pubs.acs.org | wrong-mimetype | 4015 + aip.scitation.org | no-pdf-link | 3916 + www.futuremedicine.com | no-pdf-link | 3821 + asa.scitation.org | no-pdf-link | 3644 + www.liebertpub.com | no-pdf-link | 3345 + physicstoday.scitation.org | no-pdf-link | 3005 + pubs.cif-ifc.org | no-pdf-link | 2761 + epubs.siam.org | wrong-mimetype | 2583 + www.ajronline.org | no-pdf-link | 2563 + (30 rows) + + -- "cookie absent" terminal page failures, total count + SELECT count(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent'; + + => 654885 + + -- NOT "cookie absent" terminal page failures, total count + SELECT count(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent'; + + => 1403837 + +Looks like these domains are almost all "cookieAbsent" blocking: +- journals.sagepub.com +- pubs.acs.org +- ahajournals.org +- www.journal.csj.jp +- aip.scitation.org + +Grab some individual URLs to test: + + SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 
'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ORDER BY updated DESC + LIMIT 25; + +NOT cookieAbsent testing with regular ingest tool: +- iopscience.iop.org, terminal-bad-status, SPNv2 fetch, success +- academic.oup.com => silverchair, terminal-bad-status, SPNv2 fetch, success +- osf.io success + + SELECT ingest_file_result.status, ingest_file_result.base_url, ingest_file_result.terminal_url + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ORDER BY updated DESC + LIMIT 25; + +cookieAbsent testing with regular ingest tool: +- www.tandfonline.com failure (no-pdf-link via wayback), but force-recrawl works + +The main distinguisher is status. terminal-bad-status can be ingested (live) +successfully, while no-pdf-link, redirect-loop, etc need to be re-crawled. + +## Heritrix Plan + +Generate following ingest request batches: + +- no-capture status from unpaywall +- all other failures except /cookieAbsent +- /cookieAbsent failures + +Plan will be to crawl no-capture first (to completion), then try the other +non-/cookieAbsent failures. /cookieAbsent means we'll need to use SPNv2. + +Because there are so few "no-capture on second hop" cases, will not enqueue +both terminal urls and base urls, only base urls. 
+ +Should definitely skip/filter: + +- www.ncbi.nlm.nih.gov + +## Ingest Request Export + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_20200304.rows.json'; + => 4,855,142 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json'; + => 1,403,837 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200304.rows.json > unpaywall_nocapture_20200304.json + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_fail_nocookie_20200304.rows.json > unpaywall_fail_nocookie_20200304.json + +Note: will probably end up re-running the below after crawling+ingesting the above: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.status = 'terminal-bad-status' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_cookie_badstatus_20200304.rows.json'; + => 0 + + COPY ( + SELECT 
row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.status != 'terminal-bad-status' + AND ingest_file_result.terminal_url LIKE '%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_cookie_other_20200304.rows.json'; + => 654,885 + +## Batch Ingest + +Test small batch: + + head -n200 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full batch: + + cat /grande/snapshots/unpaywall_nocapture_20200304.rows.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + # there was a broken line in there, so... + # parse error: Expected separator between values at line 1367873, column 175 + # tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c > /dev/null + tail -n+1367875 /grande/snapshots/unpaywall_nocapture_20200304.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Note that the crawl is not entirely complete and not all CDX seem to have been +loaded, so may need to iterate. About 10% are still "no capture". May want or +need to additionally crawl the terminal URLs, not the base URLs. 
+ +## Post-ingest stats + +Overall status: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 17354494 + no-pdf-link | 1471076 + no-capture | 1135992 + redirect-loop | 837842 + terminal-bad-status | 803081 + cdx-error | 219746 + wrong-mimetype | 100723 + link-loop | 16013 + wayback-error | 12448 + null-body | 9444 + redirects-exceeded | 600 + petabox-error | 411 + bad-redirect | 17 + bad-gzip-encoding | 4 + spn2-cdx-lookup-failure | 3 + gateway-timeout | 1 + spn2-error:job-failed | 1 + spn2-error | 1 + (18 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + academic.oup.com | no-pdf-link | 330211 + watermark.silverchair.com | terminal-bad-status | 324599 + www.tandfonline.com | no-pdf-link | 242724 + journals.sagepub.com | no-pdf-link | 202050 + iopscience.iop.org | terminal-bad-status | 144063 + files-journal-api.frontiersin.org | terminal-bad-status | 121719 + pubs.acs.org | 
no-pdf-link | 104535 + www.ahajournals.org | no-pdf-link | 102653 + society.kisti.re.kr | no-pdf-link | 101787 + www.degruyter.com | redirect-loop | 95130 + www.nature.com | redirect-loop | 87534 + onlinelibrary.wiley.com | no-pdf-link | 84432 + www.cell.com | redirect-loop | 61496 + www.degruyter.com | terminal-bad-status | 42919 + babel.hathitrust.org | terminal-bad-status | 41813 + www.ncbi.nlm.nih.gov | redirect-loop | 40488 + scialert.net | no-pdf-link | 38341 + ashpublications.org | no-pdf-link | 34889 + dialnet.unirioja.es | terminal-bad-status | 32076 + www.journal.csj.jp | no-pdf-link | 30881 + pure.mpg.de | redirect-loop | 26163 + www.jci.org | redirect-loop | 24701 + espace.library.uq.edu.au | redirect-loop | 24591 + www.valueinhealthjournal.com | redirect-loop | 23740 + www.vr-elibrary.de | no-pdf-link | 23332 + aip.scitation.org | wrong-mimetype | 22144 + osf.io | redirect-loop | 18513 + www.journals.elsevier.com | no-pdf-link | 16710 + www.spandidos-publications.com | redirect-loop | 15711 + www.biorxiv.org | wrong-mimetype | 15513 + (30 rows) + +Dump lists for another iteration of bulk ingest: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_20200323.rows.json'; + => 278,876 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ingest_file_result.status != 'success' + AND ingest_file_result.terminal_url NOT LIKE 
'%/cookieAbsent' + ) TO '/grande/snapshots/unpaywall_fail_nocookie_20200323.rows.json'; + => + + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nocapture_20200323.rows.json > unpaywall_nocapture_20200323.json + + cat unpaywall_nocapture_20200323.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-03-02_ingests.txt b/notes/ingest/2020-03-02_ingests.txt new file mode 100644 index 0000000..e98ef33 --- /dev/null +++ b/notes/ingest/2020-03-02_ingests.txt @@ -0,0 +1,174 @@ + +## protocols.io + +Tested that single ingest is working, and they fixed PDF format on their end +recently. + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --name protocols.io + => Expecting 8448 release objects in search queries + => Counter({'estimate': 8448, 'kafka': 8448, 'ingest_request': 8448, 'elasticsearch_release': 8448}) + +## backfill follow-ups + +- re-ingest all degruyter (doi_prefix:10.1515) + 89942 doi:10.1515\/* is_oa:true + 36350 doi:10.1515\/* in_ia:false is_oa:true + 40034 publisher:Gruyter is_oa:true in_ia:false + => update: + 135926 doi:10.1515\/* is_oa:true + 50544 doi:10.1515\/* in_ia:false is_oa:true + 54880 publisher:Gruyter is_oa:true in_ia:false +- re-ingest all frontiersin + 36093 publisher:frontiers is_oa:true in_ia:false + => update + 22444 publisher:frontiers is_oa:true in_ia:false + 22029 doi_prefix:10.3389 is_oa:true in_ia:false + + select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3389/%' group by status order by count(*) desc; + + status | count + -------------------------------------+------- + success | 34721 + no-pdf-link | 18157 + terminal-bad-status | 6799 + cdx-error | 1805 + wayback-error | 333 + no-capture | 301 + [...] 
+ + select * from ingest_file_result where base_url like 'https://doi.org/10.17723/aarc%' and status = 'no-pdf-link' order by updated desc limit 100; + +- re-ingest all mdpi + 43114 publisher:mdpi is_oa:true in_ia:false + => update + 8548 publisher:mdpi is_oa:true in_ia:false + + select status, count(*) from ingest_file_result where base_url like 'https://doi.org/10.3390/%' group by status order by count(*) desc; + status | count + -------------------------------------+-------- + success | 108971 + cdx-error | 6655 + wrong-mimetype | 3359 + terminal-bad-status | 1299 + wayback-error | 151 + spn2-cdx-lookup-failure | 87 + + => added hack for gzip content-encoding coming through pdf fetch + => will re-ingest all after pushing fix + +- re-ingest all ahajournals.org + 132000 doi:10.1161\/* + 6606 doi:10.1161\/* in_ia:false is_oa:true + 81349 publisher:"American Heart Association" + 5986 publisher:"American Heart Association" is_oa:true in_ia:false + => update + 1337 publisher:"American Heart Association" is_oa:true in_ia:false + + status | count + -------------------------------------+------- + success | 1480 + cdx-error | 1176 + spn2-cdx-lookup-failure | 514 + no-pdf-link | 85 + wayback-error | 25 + spn2-error:job-failed | 18 + + => will re-run errors +- re-ingest all ehp.niehs.nih.gov + 25522 doi:10.1289\/* + 15315 publisher:"Environmental Health Perspectives" + 8779 publisher:"Environmental Health Perspectives" in_ia:false + 12707 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true + => update + 7547 container_id:3w6amv3ecja7fa3ext35ndpiky in_ia:false is_oa:true +- re-ingest all journals.tsu.ru + 12232 publisher:"Tomsk State University" + 11668 doi:10.17223\/* + 4861 publisher:"Tomsk State University" in_ia:false is_oa:true + => update + 2605 publisher:"Tomsk State University" in_ia:false is_oa:true + => just need to retry these? 
seem fine +- re-ingest all www.cogentoa.com + 3421898 doi:10.1080\/* + 4602 journal:cogent is_oa:true in_ia:false + 5631 journal:cogent is_oa:true (let's recrawl all from publisher domain) + => update + 254 journal:cogent is_oa:true in_ia:false +- re-ingest chemrxiv + 8281 doi:10.26434\/chemrxiv* + 6918 doi:10.26434\/chemrxiv* in_ia:false + => update + 4890 doi:10.26434\/chemrxiv* in_ia:false + => re-ingest + => allow non-OA + + # american archivist + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa container --container-id zpobyv4vbranllc7oob56tgci4 + Counter({'estimate': 2920, 'elasticsearch_release': 2920, 'kafka': 2911, 'ingest_request': 2911}) + => 2020-02-04: 85 / 3,005 + => 2020-03-02: 2,182 / 3,005 preserved. some no-pdf-link, otherwise just a bunch of spn2-error + => looks like the no-pdf-link results are due to the pinnacle-secure.allenpress.com soft-blocking loop + + +## backfill re-ingests + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa --force-recrawl container --container-id zpobyv4vbranllc7oob56tgci4 + => Counter({'elasticsearch_release': 823, 'estimate': 823, 'ingest_request': 814, 'kafka': 814}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher Gruyter + => Counter({'elasticsearch_release': 54880, 'estimate': 54880, 'kafka': 51497, 'ingest_request': 51497}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query 'publisher:"Tomsk State University"' + => Counter({'ingest_request': 2605, 'kafka': 2605, 'elasticsearch_release': 2605, 'estimate': 2605}) + + ./fatcat_ingest.py --limit 25 --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi:10.26434\/chemrxiv*" + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org container --publisher mdpi + => Counter({'estimate': 8548, 
'elasticsearch_release': 8548, 'ingest_request': 6693, 'kafka': 6693}) + => NOTE: about 2k not enqueued + +## re-ingest all broken + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '1 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-%' + ) TO '/grande/snapshots/reingest_spn2_20200302.rows.json'; + => COPY 14849 + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + ) TO '/grande/snapshots/reingest_cdxerr_20200302.rows.json'; + => COPY 507610 + + This is a huge number! Re-ingest via bulk? + +Transform: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2_20200302.rows.json > reingest_spn2_20200302.json + ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdxerr_20200302.rows.json > reingest_cdxerr_20200302.json + +Push to kafka: + + cat reingest_spn2err_20200218.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + # accidentally also piped the above through ingest-file-requests-bulk... + # which could actually be bad + cat reingest_cdxerr_20200302.json | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## biorxiv/medrxiv + + 8026 doi:10.1101\/20* + 2159 doi:10.1101\/20* in_ia:false + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query 'doi:10.1101\/20* in_ia:false' + => Counter({'estimate': 2159, 'ingest_request': 2159, 'elasticsearch_release': 2159, 'kafka': 2159}) + diff --git a/notes/ingest/2020-03-oa_but_not_marked.md b/notes/ingest/2020-03-oa_but_not_marked.md new file mode 100644 index 0000000..73396bd --- /dev/null +++ b/notes/ingest/2020-03-oa_but_not_marked.md @@ -0,0 +1,25 @@ + +These are large journals with a high fraction of "in IA", but not marked as OA +so not crawling regularly. + +TODO: add things like list of unpaywall ISSN / OA status to try and find more +"practical" / bronze OA + +## First Run + +https://fatcat.wiki/container/vmv647omwrhzzgeclyrnpc4him +https://fatcat.wiki/container/waxwzq3cnbet3cmwccpuk4bel4 +https://fatcat.wiki/container/hjoli2j6qffdpaalkszryuidk4 +https://fatcat.wiki/container/fci57bxfsffvzllbssocnfsr3e +https://fatcat.wiki/container/hd23c57sunhcnar5fbgxsn36lm +https://fatcat.wiki/container/bliguyxhonfb7ghuykxgtg3oqe + +## TODO + +https://fatcat.wiki/container/kn6dhptylrb77b5atyiom5ysjm no-pdf-link (but accessible) +https://fatcat.wiki/container/s7bticdwizdmhll4taefg57jde no-pdf-link (easy?) + +https://fatcat.wiki/container/zm56axre7rgihh5sznxp65np5i large; no-pdf-link? +https://fatcat.wiki/container/eb2lcnpf2zeezkmfckcvxw2pgi huge (20k+), not all OA? +https://fatcat.wiki/container/adgy773dtra3xmrsynghcednqm broken? +https://fatcat.wiki/container/w3gj5mynrnbtndalcc5jnhymym not OA? link-loop diff --git a/notes/ingest/2020-03_mag.md b/notes/ingest/2020-03_mag.md new file mode 100644 index 0000000..428ce05 --- /dev/null +++ b/notes/ingest/2020-03_mag.md @@ -0,0 +1,576 @@ + +Rough plan: + +- run bulk and/or regular ingest requests for just those of AIT partners (200k?) 
+- persist ingest requests (22 million or so) +- run bulk ingest over 'no status' / 'no match' requests (aka, those not in unpaywall) +- crawl those which are no-capture + + +## Generate Requests + +Newer version of `mag_ingest_request.sh` script requires venv with urlcanon +installed. + +Starting with the 2020-01-23 MAG dump, will generate a full ingest request set +(including DOI `ext_id` when available), with any dominant domains removed (eg, +arxiv.org): + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 | pv -l > ingest_requests_mag-2020-01-23.doi.json + => previously 25.6M + => 25.6M 2:29:43 [2.85k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-01-23 --pmid | pv -l > ingest_requests_mag-2020-01-23.pmid.json + => 4.3M 0:25:45 [2.78k/s] + + export LC_ALL=C + cat ingest_requests_mag-2020-01-23.json | jq -r "[.base_url, .ext_ids.doi] | @tsv" | sort -u -S 4G > ingest_requests_mag-2020-01-23.full.seed_id + + zcat PaperUrls_PaperExtendedAttributes_pdf.txt.gz | wc -l + => 6,504,907 + + zcat PaperUrls_mag_url_pmid.txt.gz | wc -l + => 4,369,832 + + cat ingest_requests_mag-2020-01-23.json | jq .ext_ids.doi -r | rg -a -v '^null$' | wc -l + => previously 15,707,405 + => 15,702,581 + + cat ingest_requests_mag-2020-01-23.pmid.json | jq .base_url -r | rg ' ' | wc -l + => 0 + URL encoding seems to be working + +## Persist Ingest Requests + +First pmid ingest requests, then the all/doi file. The reason to do this order +is that the all/doi file will have some rows with no DOI (and thus no +`ext_id`), while the PMID file will not. 
+ + # small sample + head /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 10, 'skip-result-fields': 10}) + JSON lines pushed: Counter({'total': 10, 'pushed': 10}) + + cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.pmid.json | ./persist_tool.py ingest-request - + => 4.3M 0:16:46 [4.27k/s] + Worker: Counter({'total': 4295026, 'insert-requests': 4241862, 'update-requests': 0}) + JSON lines pushed: Counter({'total': 4295026, 'pushed': 4295026}) + => hit a bug on first attempt, which is why total/insert results don't match + + cat /schnell/mag/20200123/ingest_requests_mag-2020-01-23.doi.json | ./persist_tool.py ingest-request - + => 25.6M 2:21:54 [3.01k/s] + Worker: Counter({'total': 25596559, 'insert-requests': 21348393, 'update-requests': 0}) + JSON lines pushed: Counter({'pushed': 25596559, 'total': 25596559}) + + +## Crawl/Dupe Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +After just PMID links: + + status | count + ---------------------+--------- + | 3000115 + success | 1126881 + no-capture | 69459 + terminal-bad-status | 30259 + redirect-loop | 11656 + no-pdf-link | 2836 + wrong-mimetype | 1456 + link-loop | 1259 + wayback-error | 1232 + cdx-error | 932 + null-body | 85 + petabox-error | 50 + bad-redirect | 1 + (13 rows) + +After all links: + + SELECT COUNT(*) + FROM ingest_request + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag'; + => 25596563 + + + status | count + ---------------------+---------- + | 21130841 + success | 3915682 + no-capture | 391813 + terminal-bad-status | 76488 + redirect-loop | 44202 + 
wrong-mimetype | 16418 + no-pdf-link | 10995 + wayback-error | 3679 + cdx-error | 3414 + link-loop | 2098 + null-body | 709 + petabox-error | 221 + bad-gzip-encoding | 2 + bad-redirect | 1 + (14 rows) + +Somewhat more un-ingested than expected. + +Dump requests: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/mag_noingest_20200305.rows.json'; + => COPY 21,130,841 + +Transform and shuf: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_noingest_20200305.rows.json | pv -l | shuf | gzip > /grande/snapshots/mag_noingest_20200305.shuf.json.gz + => 21.1M 0:18:57 [18.6k/s] + +## Bulk Ingest Partner Output + +These are subsets of the full list from potential AIT-S partners; want to run +these through the pipeline before the full batch. Duplication against the full +batch should be minimal. 
+ +Size: + + bnewbold@ia601101$ cat ingest_requests_mag-2020-01-23.cornell.json | jq .ext_ids.doi | rg -v '^null$' | wc -l + 29007 + bnewbold@ia601101$ wc -l ingest_requests_mag-2020-01-23.cornell.json + 34265 ingest_requests_mag-2020-01-23.cornell.json + +Test ingest: + + head -n200 ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full ingests: + + cat ingest_requests_mag-2020-01-23.cornell.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.alberta.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.columbia.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.emory.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat ingest_requests_mag-2020-01-23.stanford.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Bulk Ingest + +Shard it into batches of roughly 1 million: + + cd /grande/snapshots/ + zcat /grande/snapshots/mag_noingest_20200305.shuf.json.gz | split -n r/20 -d - mag_noingest_20200305.ingest_request.split_ --additional-suffix=.json + +Add a single batch like: + + cat mag_noingest_20200305.ingest_request.split_00.json | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + partner ingests (see above) + => 2020-03-05 12:49: 118,396 + 1056543 mag_noingest_20200305.ingest_request.split_00.json + => 2020-03-05 14:34: 1,055,224 + => check on stats/ratios; filter by ingest update time? 
+ 1056542 mag_noingest_20200305.ingest_request.split_01.json + 1056542 mag_noingest_20200305.ingest_request.split_02.json + 1056542 mag_noingest_20200305.ingest_request.split_03.json + 1056542 mag_noingest_20200305.ingest_request.split_04.json + 1056542 mag_noingest_20200305.ingest_request.split_05.json + 1056542 mag_noingest_20200305.ingest_request.split_06.json + 1056542 mag_noingest_20200305.ingest_request.split_07.json + 1056542 mag_noingest_20200305.ingest_request.split_08.json + 1056542 mag_noingest_20200305.ingest_request.split_09.json + => 2020-03-05 18:04: 10,009,297 + => 2020-03-06 16:53: 6,553,946 + 1056542 mag_noingest_20200305.ingest_request.split_10.json + 1056542 mag_noingest_20200305.ingest_request.split_11.json + 1056542 mag_noingest_20200305.ingest_request.split_12.json + 1056542 mag_noingest_20200305.ingest_request.split_13.json + 1056542 mag_noingest_20200305.ingest_request.split_14.json + 1056542 mag_noingest_20200305.ingest_request.split_15.json + 1056542 mag_noingest_20200305.ingest_request.split_16.json + 1056542 mag_noingest_20200305.ingest_request.split_17.json + 1056542 mag_noingest_20200305.ingest_request.split_18.json + 1056542 mag_noingest_20200305.ingest_request.split_19.json + => 2020-03-06 16:59: 17,001,032 + +Stats from bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + ---------------------+---------- + no-capture | 12237193 + success | 11991293 + no-pdf-link | 521691 + redirect-loop | 437192 + terminal-bad-status | 231181 + link-loop | 92633 + cdx-error | 33631 + wrong-mimetype | 28638 + wayback-error | 19651 + null-body | 2682 + petabox-error | 727 + | 47 + bad-redirect | 44 + 
bad-gzip-encoding | 7 + (14 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + --------------------------------------+---------------------+-------- + dialnet.unirioja.es | redirect-loop | 240967 + onlinelibrary.wiley.com | no-pdf-link | 147696 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 72639 + iopscience.iop.org | terminal-bad-status | 69591 + febs.onlinelibrary.wiley.com | no-pdf-link | 49874 + www.researchgate.net | redirect-loop | 42859 + journals.sagepub.com | no-pdf-link | 27448 + papers.ssrn.com | redirect-loop | 27328 + dialnet.unirioja.es | terminal-bad-status | 20320 + physoc.onlinelibrary.wiley.com | no-pdf-link | 20232 + science.sciencemag.org | link-loop | 17811 + espace.library.uq.edu.au | redirect-loop | 17185 + bpspubs.onlinelibrary.wiley.com | no-pdf-link | 15785 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 15301 + anthrosource.onlinelibrary.wiley.com | no-pdf-link | 13746 + www.tandfonline.com | no-pdf-link | 13303 + aasldpubs.onlinelibrary.wiley.com | no-pdf-link | 11070 + link.springer.com | redirect-loop | 10594 + www.redalyc.org:9081 | no-pdf-link | 10515 + watermark.silverchair.com | terminal-bad-status | 9739 + www.bmj.com | link-loop | 9389 + www.repository.naturalis.nl | redirect-loop | 8213 + bjp.rcpsych.org | link-loop | 8045 + aslopubs.onlinelibrary.wiley.com | no-pdf-link | 7814 + 
nph.onlinelibrary.wiley.com | no-pdf-link | 7801 + iopscience.iop.org | redirect-loop | 7697 + journals.tubitak.gov.tr | wrong-mimetype | 7159 + www.biorxiv.org | wrong-mimetype | 7067 + www.erudit.org | redirect-loop | 6819 + besjournals.onlinelibrary.wiley.com | no-pdf-link | 6254 + (30 rows) + +Domains to follow-up (eg, sandcrawler ingest tests/tweaks): +- dialnet.unirioja.es | redirect-loop | 240967 +- www.researchgate.net | redirect-loop | 42859 +- www.redalyc.org:9081 | no-pdf-link | 10515 +- www.repository.naturalis.nl | redirect-loop | 8213 +- bjp.rcpsych.org | link-loop | 8045 +- journals.tubitak.gov.tr | wrong-mimetype | 7159 +- www.erudit.org | redirect-loop | 6819 + +The dialnet.unirioja.es ones may be worth re-crawling via heritrix? + +Top uncrawled domains: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------+------------+-------- + ieeexplore.ieee.org | no-capture | 957835 + link.springer.com | no-capture | 394121 + www.researchgate.net | no-capture | 376974 + cyberleninka.ru | no-capture | 376012 + iopscience.iop.org | no-capture | 348791 + papers.ssrn.com | no-capture | 286860 + dergipark.org.tr | no-capture | 217556 + dialnet.unirioja.es | no-capture | 214398 + academic.oup.com | no-capture | 212364 + www.tandfonline.com | no-capture | 148940 + journals.sagepub.com | no-capture | 144695 + www.papersearch.net | no-capture | 138986 + absimage.aps.org | 
no-capture | 111976 + apps.dtic.mil | no-capture | 106984 + www.cambridge.org | no-capture | 97533 + www.bmj.com | no-capture | 92437 + bioone.org | no-capture | 87573 + science.sciencemag.org | no-capture | 75723 + shodhganga.inflibnet.ac.in:8080 | no-capture | 75395 + www.jstor.org | no-capture | 73230 + works.bepress.com | no-capture | 68747 + www.scielo.org.co | no-capture | 59650 + hrcak.srce.hr | no-capture | 59332 + muse.jhu.edu | no-capture | 57828 + onlinelibrary.wiley.com | no-capture | 55621 + www.jbc.org | no-capture | 54608 + www.jstage.jst.go.jp | no-capture | 53631 + www.redalyc.org | no-capture | 50406 + lup.lub.lu.se | no-capture | 47469 + www.dtic.mil | no-capture | 41820 + (30 rows) + +## Heritrix Seedlist Generation + +Dump ingest requests (filtered for some domains that don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200313.rows.json'; + => COPY 11714199 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200313.rows.json > /grande/snapshots/mag_nocapture_20200313.json + +## Bulk Ingest of Heritrix Content + +Small sample: + + head -n 1000 mag_nocapture_20200313.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full run: + + cat mag_nocapture_20200313.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + 2020-04-07 12:19 (pacific): 11,703,871 + +## Post-bulk-ingest + +Around 2020-04-28, seems like main wave of bulk ingest is complete. Will need +to re-try things like cdx-error. + +Current status: + + status | count + -------------------------------+---------- + success | 18491799 + redirect-loop | 1968530 + no-capture | 1373657 + no-pdf-link | 1311842 + link-loop | 1296439 + terminal-bad-status | 627577 + cdx-error | 418278 + wrong-mimetype | 50141 + wayback-error | 37159 + petabox-error | 11249 + null-body | 6295 + gateway-timeout | 3051 + spn2-cdx-lookup-failure | 328 + spn2-error:invalid-url-syntax | 93 + bad-redirect | 75 + | 47 + invalid-host-resolution | 28 + spn2-error | 10 + bad-gzip-encoding | 7 + redirects-exceeded | 2 + (20 rows) + +Lots of cdx-error to retry. + +The no-capture links are probably a mix of domain-blocklist and things that +failed in bulk mode. 
Will dump and re-attempt them: + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; + => 859849 + +What domains are these? + + cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 + +Let's filter down more: + + cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json + + wc -l mag_nocapture_20200420.rows.filtered.json + 423085 mag_nocapture_20200420.rows.filtered.json + +Ok, enqueue! + + cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Final Stats + +... 
for this round of ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+---------- + success | 18712849 + redirect-loop | 2008110 + no-pdf-link | 1337012 + link-loop | 1326761 + no-capture | 1030693 + terminal-bad-status | 637143 + gateway-timeout | 193194 + cdx-error | 125907 + spn2-cdx-lookup-failure | 77842 + wrong-mimetype | 50882 + wayback-error | 40278 + invalid-host-resolution | 35201 + petabox-error | 11254 + null-body | 6485 + spn2-error | 1643 + spn2-error:job-failed | 747 + spn2-error:invalid-url-syntax | 325 + spn2-error:soft-time-limit-exceeded | 190 + bad-redirect | 77 + | 47 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + domain | status | count + ---------------------------------+---------------------+-------- + ieeexplore.ieee.org | redirect-loop | 677712 + cyberleninka.ru | link-loop | 308390 + papers.ssrn.com | link-loop | 281804 + ieeexplore.ieee.org | link-loop | 273559 + dialnet.unirioja.es | redirect-loop | 240504 + dialnet.unirioja.es | terminal-bad-status | 232481 + onlinelibrary.wiley.com 
| no-pdf-link | 220932 + iopscience.iop.org | terminal-bad-status | 172480 + validate.perfdrive.com | no-pdf-link | 172312 + link.springer.com | redirect-loop | 130398 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 + iopscience.iop.org | redirect-loop | 105234 + www.bmj.com | link-loop | 100354 + www.researchgate.net | redirect-loop | 84366 + www.cambridge.org | link-loop | 83171 + jamanetwork.com | no-pdf-link | 75053 + febs.onlinelibrary.wiley.com | no-pdf-link | 74872 + www.jstor.org | redirect-loop | 72059 + journals.sagepub.com | no-pdf-link | 63028 + science.sciencemag.org | redirect-loop | 62927 + profile.thieme.de | no-pdf-link | 62406 + cyberleninka.ru | redirect-loop | 56733 + link.springer.com | link-loop | 47608 + physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 + science.sciencemag.org | link-loop | 29908 + papers.ssrn.com | redirect-loop | 27255 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 + www.computer.org | no-pdf-link | 26444 + watermark.silverchair.com | terminal-bad-status | 25934 + www.nature.com | redirect-loop | 25306 + (30 rows) diff --git a/notes/ingest/2020-03_s2.md b/notes/ingest/2020-03_s2.md new file mode 100644 index 0000000..fedaba0 --- /dev/null +++ b/notes/ingest/2020-03_s2.md @@ -0,0 +1,35 @@ + +Crawled some 6 million new PDFs from pdfs.semanticscholar.org. Should get these +ingested, as well as any previous existing content. + +Also, there are a bunch of PDF outlinks to the web; should do S2-specific +matching and ingest of those. + +There are a few categories of paper from pdfs.s.o: + +1. we had previous GWB crawl, didn't re-crawl +2. we had PDF from elsewhere on the web, didn't re-crawl +3. crawled successfully +4. crawl failed + +In this ingest, want to get all of categories 1 and 3. Could try to do this by +dumping sandcrawler CDX table matching pdfs.s.o (which includes recent crawl), +and join that against the ingest request list. 
+ +For other random web URLs, can do the usual persist/backfill/recrawl pipeline. + +## Create Seedlist + + zcat s2-corpus-pdfUrls.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-pdfUrls.2019.ingest_request.json.gz + zcat s2-corpus-s2PdfUrl.json.gz | parallel -j5 --linebuffer --round-robin --pipe ./s2_ingestrequest.py - | pv -l | gzip > s2-corpus-s2PdfUrl.2019.ingest_request.json.gz + + zcat s2-corpus-s2PdfUrl.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-s2PdfUrl.id_list + zcat s2-corpus-pdfUrls.json.gz | jq .id -r | sort -u -S 2G > s2-corpus-pdfUrls.id_list + + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_hosted_ingestrequest.json.gz + zcat s2-corpus-pdfUrls.2019.ingest_request.json.gz s2-corpus-s2PdfUrl.2019.ingest_request.json.gz | rg -v pdfs.semanticscholar.org | sort -u -S 3G | gzip > s2_external_ingestrequest.json.gz + + zcat s2_external_ingestrequest.json.gz | wc -l + 41201427 + zcat s2_hosted_ingestrequest.json.gz | wc -l + 23345761 diff --git a/notes/ingest/2020-04-13_covid19.md b/notes/ingest/2020-04-13_covid19.md new file mode 100644 index 0000000..b442d69 --- /dev/null +++ b/notes/ingest/2020-04-13_covid19.md @@ -0,0 +1,73 @@ + +Want to ensure seedlists from Wanfang and CNKI are captured in wayback. + +Wanfang URLs seem normal. Let's just submit them in a single queue via SPNv2. +They are heterogeneous after redirect. + +CNKI are trickier. The PDF URLs definitely can't be crawled directly... but the +info ones probably can, then crawl on to PDF? At least some seem to capture Ok. + +Need scope and identifiers for ingest requests.
Let's do: + + cnki_covid19 / <ident> + wanfang_covid19 / <ident> + +Source: scrape-covid19 + +## Commands + + # in sandcrawler pipenv + cat ~/code/covid19.fatcat.wiki/extra/scrape/cnki_metadata.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/cnki_ingest_request.2020-04-14.json + cat ~/code/covid19.fatcat.wiki/extra/scrape/wanfang*.2020-04-14.json | ./scripts/covid2ingestrequest.py - > ~/code/covid19.fatcat.wiki/extra/scrape/wanfang_ingest_request.2020-04-14.json + + + cat /tmp/wanfang_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 4 + cat /tmp/cnki_ingest_request.2020-04-14.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 8 + +## Status + + SELECT ingest_request.ingest_type, + ingest_file_result.status, + COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'scrape-covid19' + GROUP BY ingest_request.ingest_type, ingest_file_result.status + ORDER BY COUNT(*) DESC; + +2020-04-15: + + ingest_type | status | count + -------------+-------------------------------------+------- + pdf | spn2-cdx-lookup-failure | 1588 + pdf | success | 671 + pdf | gateway-timeout | 507 + pdf | no-pdf-link | 181 + pdf | wayback-error | 30 + pdf | spn2-error:job-failed | 20 + pdf | spn2-error | 7 + pdf | spn2-error:soft-time-limit-exceeded | 3 + pdf | spn2-error:pending | 2 + (9 rows) + +## Re-Try + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'scrape-covid19' + AND 
ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status != 'no-pdf-link' + AND ingest_file_result.status != 'link-loop' + ) TO '/grande/snapshots/reingest_covid19.rows.json'; + + ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_covid19.rows.json | shuf > reingest_covid19.json + + cat reingest_covid19.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 9 + diff --git a/notes/ingest/2020-04_datacite.md b/notes/ingest/2020-04_datacite.md new file mode 100644 index 0000000..0fc7e67 --- /dev/null +++ b/notes/ingest/2020-04_datacite.md @@ -0,0 +1,121 @@ + +After the broad datacite crawl, want to ingest paper PDFs into fatcat. But many +of the DOIs are for, eg, datasets, and don't want to waste time on those. + +Instead of using full ingest request file from the crawl, will generate a new +ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. + +## Generate Requests + + ./fatcat_ingest.py --allow-non-oa --release-types article-journal,paper-conference,article,report,thesis,book,chapter query "doi_registrar:datacite" | pv -l > /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json + => Expecting 8905453 release objects in search queries + => 8.91M 11:49:50 [ 209 /s] + => Counter({'elasticsearch_release': 8905453, 'ingest_request': 8905453, 'estimate': 8905453}) + +## Bulk Ingest + + cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Ingest Stats + +Note that this will have a small fraction of non-datacite results mixed in (eg, +from COVID-19 targeted crawls): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + no-pdf-link | 4646767 + redirect-loop | 1447229 + no-capture | 860235 + success | 849501 + terminal-bad-status | 174869 + cdx-error | 159805 + wayback-error | 18076 + wrong-mimetype | 11169 + link-loop | 8410 + gateway-timeout | 4034 + spn2-cdx-lookup-failure | 510 + petabox-error | 339 + null-body | 251 + spn2-error | 19 + spn2-error:job-failed | 14 + bad-gzip-encoding | 13 + timeout | 5 + spn2-error:soft-time-limit-exceeded | 4 + invalid-host-resolution | 2 + spn2-error:pending | 1 + (20 rows) + +Top domains/statuses (including success): + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + ) t1 + WHERE t1.domain != '' + AND t1.status 
!= 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------------+---------------------+-------- + ssl.fao.org | no-pdf-link | 862277 + www.e-periodica.ch | no-pdf-link | 746781 + www.researchgate.net | redirect-loop | 664524 + dlc.library.columbia.edu | no-pdf-link | 493111 + www.die-bonn.de | redirect-loop | 352903 + figshare.com | no-pdf-link | 319709 + statisticaldatasets.data-planet.com | no-pdf-link | 309584 + catalog.paradisec.org.au | redirect-loop | 225396 + zenodo.org | no-capture | 193201 + digi.ub.uni-heidelberg.de | no-pdf-link | 184974 + open.library.ubc.ca | no-pdf-link | 167841 + zenodo.org | no-pdf-link | 130617 + www.google.com | no-pdf-link | 111312 + www.e-manuscripta.ch | no-pdf-link | 79192 + ds.iris.edu | no-pdf-link | 77649 + data.inra.fr | no-pdf-link | 69440 + www.tib.eu | no-pdf-link | 63872 + www.egms.de | redirect-loop | 53877 + archaeologydataservice.ac.uk | redirect-loop | 52838 + d.lib.msu.edu | no-pdf-link | 45297 + www.e-rara.ch | no-pdf-link | 45163 + springernature.figshare.com | no-pdf-link | 42527 + boris.unibe.ch | no-pdf-link | 40816 + www.research-collection.ethz.ch | no-capture | 40350 + spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 + repository.dri.ie | terminal-bad-status | 32760 + othes.univie.ac.at | no-pdf-link | 32558 + repositories.lib.utexas.edu | no-capture | 31526 + posterng.netkey.at | no-pdf-link | 30315 + zenodo.org | terminal-bad-status | 29614 + (30 rows) + diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md new file mode 100644 index 0000000..a5e3bb1 --- /dev/null +++ b/notes/ingest/2020-04_unpaywall.md @@ -0,0 +1,312 @@ + +A new snapshot was released in April 2020 (the snapshot is from 2020-02-25, but +not released for more than a month). 
+ +Primary goal is: + +- generate ingest requests for only *new* URLs +- bulk ingest these new URLs +- crawl any no-capture URLs from that batch +- re-bulk-ingest the no-capture batch +- analytics on failed ingests. eg, any particular domains that are failing to crawl + +This ingest pipeline was started on 2020-04-07 by bnewbold. + +Ran through the first two steps again on 2020-05-03 after unpaywall had +released another dump (dated 2020-04-27). + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-02-25T115244.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json + => 24.7M 5:17:03 [ 1.3k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-02-25.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 24.7M + => Worker: Counter({'total': 24712947, 'insert-requests': 4282167, 'update-requests': 0}) + +Second time: + + # in sandcrawler pipenv on aitio + zcat /schnell/UNPAYWALL-PDF-CRAWL-2020-04/unpaywall_snapshot_2020-04-27T153236.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json + => 25.2M 3:16:28 [2.14k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 25189390, 'insert-requests': 1408915, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 25189390, 'total': 25189390}) + + +## Dump new URLs and Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status IS NULL 
+ ) TO '/grande/snapshots/unpaywall_noingest_2020-04-08.rows.json'; + => 3696189 + + WARNING: forgot to transform from rows to ingest requests. + + cat /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Second time: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/unpaywall_noingest_2020-05-03.rows.json'; + => 1799760 + + WARNING: forgot to transform from rows to ingest requests. + + cat /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Dump no-capture, Run Crawl + +Make two ingest request dumps: one with "all" URLs, which we will have heritrix +attempt to crawl, and then one with certain domains filtered out, which we may +or may not bother trying to ingest (due to expectation of failure). 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + ) TO '/grande/snapshots/unpaywall_nocapture_all_2020-05-04.rows.json'; + => 2734145 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + => 2602408 + +NOTE: forgot here to transform from "rows" to ingest requests. + +Not actually a very significant size difference after all. + +See `journal-crawls` repo for details on seedlist generation and crawling. + +## Re-Ingest Post-Crawl + +NOTE: if we *do* want to do cleanup eventually, could look for fatcat edits +between 2020-04-01 and 2020-05-25 which have limited "extra" metadata (eg, no +evidence or `oa_status`). 
+ +The earlier bulk ingests were done wrong (forgot to transform from rows to full +ingest request docs), so going to re-do those, which should be a superset of +the nocapture crawl URLs: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-04-08.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-04-08.json + => 1.26M 0:00:58 [21.5k/s] + => previously: 3,696,189 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-05-03.rows.json | pv -l > /grande/snapshots/unpaywall_noingest_2020-05-03.json + => 1.26M 0:00:56 [22.3k/s] + +Crap, looks like the 2020-04-08 segment got overwritten with 2020-05 data by +accident. Hrm... need to re-ingest *all* recent unpaywall URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + ) TO '/grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json'; + => COPY 5691106 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json + => 5.69M 0:04:26 [21.3k/s] + +Start small: + + cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | head -n200 | rg -v "\\\\" | jq .
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Post-ingest stats (2020-08-28) + +Overall status: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 22063013 + no-pdf-link | 2192606 + redirect-loop | 1471135 + terminal-bad-status | 995106 + no-capture | 359440 + cdx-error | 358909 + wrong-mimetype | 111685 + wayback-error | 50705 + link-loop | 29359 + null-body | 13667 + gateway-timeout | 3689 + spn2-cdx-lookup-failure | 1229 + petabox-error | 1007 + redirects-exceeded | 747 + invalid-host-resolution | 464 + spn2-error | 107 + spn2-error:job-failed | 91 + bad-redirect | 26 + spn2-error:soft-time-limit-exceeded | 9 + bad-gzip-encoding | 5 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + academic.oup.com | no-pdf-link | 415441 + watermark.silverchair.com | terminal-bad-status | 345937 + www.tandfonline.com | 
no-pdf-link | 262488 + journals.sagepub.com | no-pdf-link | 235707 + onlinelibrary.wiley.com | no-pdf-link | 225876 + iopscience.iop.org | terminal-bad-status | 170783 + www.nature.com | redirect-loop | 145522 + www.degruyter.com | redirect-loop | 131898 + files-journal-api.frontiersin.org | terminal-bad-status | 126091 + pubs.acs.org | no-pdf-link | 119223 + society.kisti.re.kr | no-pdf-link | 112401 + www.ahajournals.org | no-pdf-link | 105953 + dialnet.unirioja.es | terminal-bad-status | 96505 + www.cell.com | redirect-loop | 87560 + www.ncbi.nlm.nih.gov | redirect-loop | 49890 + ageconsearch.umn.edu | redirect-loop | 45989 + ashpublications.org | no-pdf-link | 45833 + pure.mpg.de | redirect-loop | 45278 + www.degruyter.com | terminal-bad-status | 43642 + babel.hathitrust.org | terminal-bad-status | 42057 + osf.io | redirect-loop | 41119 + scialert.net | no-pdf-link | 39009 + dialnet.unirioja.es | redirect-loop | 38839 + www.jci.org | redirect-loop | 34209 + www.spandidos-publications.com | redirect-loop | 33167 + www.journal.csj.jp | no-pdf-link | 30915 + journals.openedition.org | redirect-loop | 30409 + www.valueinhealthjournal.com | redirect-loop | 30090 + dergipark.org.tr | no-pdf-link | 29146 + journals.ametsoc.org | no-pdf-link | 29133 + (30 rows) + +Enqueue internal failures for re-ingest: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/unpaywall_errors_2020-08-28.rows.json'; + => 409606 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_errors_2020-08-28.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_errors_2020-08-28.requests.json + + 
 cat /grande/snapshots/unpaywall_errors_2020-08-28.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +And after *that* (which ran quickly): + + status | count + -------------------------------------+---------- + success | 22281874 + no-pdf-link | 2258352 + redirect-loop | 1499251 + terminal-bad-status | 1004781 + no-capture | 401333 + wrong-mimetype | 112068 + cdx-error | 32259 + link-loop | 30137 + null-body | 13886 + wayback-error | 11653 + gateway-timeout | 3689 + spn2-cdx-lookup-failure | 1229 + petabox-error | 1036 + redirects-exceeded | 749 + invalid-host-resolution | 464 + spn2-error | 107 + spn2-error:job-failed | 91 + bad-redirect | 26 + spn2-error:soft-time-limit-exceeded | 9 + bad-gzip-encoding | 5 + (20 rows) + +22063013 -> 22281874 = + 218,861 success, not bad! diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md new file mode 100644 index 0000000..fe22c75 --- /dev/null +++ b/notes/ingest/2020-05_oai_pmh.md @@ -0,0 +1,428 @@ + +Primary Goal: start large crawl of OAI landing pages that we haven't seen + +Fields of interest for ingest: +- oai identifier +- doi +- formats +- urls (maybe also "relations") +- types (type+stage) + +## Other Tasks + +About 150 million total lines. 
+ +Types coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt + +Dump all ISSNs, with counts, quick check how many are in chocula/fatcat + + zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt + +Language coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt + +Format coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt + => 150M 0:56:14 [44.7k/s] + +Have a DOI? + + zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l + => 16,013,503 + + zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt + => 11,940,950 + +## Transform, Load, Bulk Ingest + + zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz + => 80M 6:36:55 [3.36k/s] + + time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request - + => 80M 4:00:21 [5.55k/s] + => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963}) + + => real 240m21.207s + => user 85m12.576s + => sys 3m29.580s + + select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai'; + => 51,185,088 + +Why so many (30 million) skipped? Not unique? 
+ + zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l + => 51,185,088 + + zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt + wc -l request_url.txt + => 50,002,674 request_url.txt + + zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt + wc -l requires_oai.txt + => 34,622,083 requires_oai.txt + +Yup, tons of duplication. And remember this is exact URL, not SURT or similar. + +How many of these are URLs we have seen and ingested already? + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + | 49491452 + success | 1469113 + no-capture | 134611 + redirect-loop | 59666 + no-pdf-link | 8947 + cdx-error | 7561 + terminal-bad-status | 6704 + null-body | 5042 + wrong-mimetype | 879 + wayback-error | 722 + petabox-error | 198 + gateway-timeout | 86 + link-loop | 51 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + spn2-error | 4 + bad-gzip-encoding | 4 + spn2-error:job-failed | 2 + (18 rows) + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/oai_noingest_20200506.rows.json'; + => COPY 49491452 + + WARNING: should have transformed from rows to requests here + + cat 
/grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Crawl and re-ingest + +Updated stats after ingest (NOTE: ingest requests not really formed correctly, +but doesn't matter because fatcat wasn't importing these anyways): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + no-capture | 42565875 + success | 5227609 + no-pdf-link | 2156341 + redirect-loop | 559721 + cdx-error | 260446 + wrong-mimetype | 148871 + terminal-bad-status | 109725 + link-loop | 92792 + null-body | 30688 + | 15287 + petabox-error | 11109 + wayback-error | 6261 + skip-url-blocklist | 184 + gateway-timeout | 86 + bad-gzip-encoding | 25 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + bad-redirect | 15 + spn2-error | 4 + spn2-error:job-failed | 2 + (20 rows) + +Dump again for crawling: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error') + ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json'; + +Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains: + + 4876135 www.kb.dk REMOVE: too large and generic + 3110009 kb-images.kb.dk REMOVE: dead? 
+ 1274638 mdz-nbn-resolving.de REMOVE: maybe broken + 982312 aggr.ukm.um.si REMOVE: maybe broken + +And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so +expecting at least 11,052,439 `no-capture` ingest results (and should probably +filter for these or even delete from the ingest request table). + +Ingest progress: + + 2020-08-05 14:02: 32,571,018 + 2020-08-06 13:49: 31,195,169 + 2020-08-07 10:11: 29,986,169 + 2020-08-10 10:43: 26,497,196 + 2020-08-12 11:02: 23,811,845 + 2020-08-17 13:34: 19,460,502 + 2020-08-20 09:49: 15,069,507 + 2020-08-25 09:56: 9,397,035 + 2020-09-02 15:02: 305,889 (72k longest queue) + 2020-09-03 14:30: done + +## Post-ingest stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + no-capture | 16804277 + no-pdf-link | 14895249 + success | 13898603 + redirect-loop | 2709730 + cdx-error | 827024 + terminal-bad-status | 740037 + wrong-mimetype | 604242 + link-loop | 532553 + null-body | 95721 + wayback-error | 41864 + petabox-error | 19204 + | 15287 + gateway-timeout | 510 + bad-redirect | 318 + skip-url-blocklist | 184 + bad-gzip-encoding | 114 + timeout | 78 + spn2-cdx-lookup-failure | 59 + invalid-host-resolution | 19 + blocked-cookie | 6 + (20 rows) + +Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth +dumping the full kafka result topic, filter to OAI requests, and extracting the +missing URLs. 
+ +Top counts by OAI prefix: + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 25; + + oai_prefix | success | total + --------------------------+---------+--------- + kb.dk | 0 | 7989412 (excluded) + repec | 1118591 | 2783448 + bnf.fr | 0 | 2187277 + hispana.mcu.es | 19404 | 1492639 + bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?) + hal | 564700 | 1049607 + ukm.si | 0 | 982468 (excluded) + hsp.org | 0 | 810281 + www.irgrid.ac.cn | 17578 | 748828 + cds.cern.ch | 72811 | 688091 + americanae.aecid.es | 69678 | 572792 + biodiversitylibrary.org | 2121 | 566154 + juser.fz-juelich.de | 22777 | 518551 + espace.library.uq.edu.au | 6494 | 508960 + igi.indrastra.com | 58689 | 478577 + archive.ugent.be | 63654 | 424014 + hrcak.srce.hr | 395031 | 414897 + zir.nsk.hr | 153889 | 397200 + renati.sunedu.gob.pe | 78399 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7963 | 354529 + generic.eprints.org | 261221 | 340470 + invenio.nusl.cz | 6184 | 325867 + evastar-karlsruhe.de | 62044 | 317952 + quod.lib.umich.edu | 5 | 309135 + (25 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND 
ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 30; + + + oai_prefix | status | count + --------------------------+---------------+--------- + kb.dk | no-capture | 7955231 (excluded) + bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?) + repec | success | 1118591 + hispana.mcu.es | no-pdf-link | 1118092 + bnf.fr | no-capture | 1100591 + ukm.si | no-capture | 976004 (excluded) + hsp.org | no-pdf-link | 773496 + repec | no-pdf-link | 625629 + bnf.fr | no-pdf-link | 607813 + hal | success | 564700 + biodiversitylibrary.org | no-pdf-link | 531409 + cds.cern.ch | no-capture | 529842 + repec | redirect-loop | 504393 + juser.fz-juelich.de | no-pdf-link | 468813 + bnf.fr | redirect-loop | 436087 + americanae.aecid.es | no-pdf-link | 409954 + hrcak.srce.hr | success | 395031 + www.irgrid.ac.cn | no-pdf-link | 362087 + hal | no-pdf-link | 352111 + www.irgrid.ac.cn | no-capture | 346963 + espace.library.uq.edu.au | no-pdf-link | 315302 + igi.indrastra.com | no-pdf-link | 312087 + repec | no-capture | 309882 + invenio.nusl.cz | no-pdf-link | 302657 + hypotheses.org | no-pdf-link | 298750 + rour.neicon.ru | redirect-loop | 291922 + renati.sunedu.gob.pe | no-capture | 276388 + t2r2.star.titech.ac.jp | no-pdf-link | 264109 + generic.eprints.org | success | 261221 + quod.lib.umich.edu | no-pdf-link | 253937 + (30 rows) + +If we remove excluded prefixes, and some large/generic prefixes (bnf.fr, +hispana.mcu.es, hsp.org), then the aggregate counts are: + + no-capture | 16,804,277 -> 5,502,242 + no-pdf-link | 14,895,249 -> 12,395,848 + +Top status by terminal domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN 
ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + WHERE t1.domain != '' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ----------------------------------+---------------+-------- + hispana.mcu.es | no-pdf-link | 709701 (national scope) + gallica.bnf.fr | no-pdf-link | 601193 (national scope) + discover.hsp.org | no-pdf-link | 524212 (historical) + www.biodiversitylibrary.org | no-pdf-link | 479288 + gallica.bnf.fr | redirect-loop | 435981 (national scope) + hrcak.srce.hr | success | 389673 + hemerotecadigital.bne.es | no-pdf-link | 359243 + juser.fz-juelich.de | no-pdf-link | 345112 + espace.library.uq.edu.au | no-pdf-link | 304299 + invenio.nusl.cz | no-pdf-link | 302586 + igi.indrastra.com | no-pdf-link | 292006 + openrepository.ru | redirect-loop | 291555 + hal.archives-ouvertes.fr | success | 278134 + t2r2.star.titech.ac.jp | no-pdf-link | 263971 + bib-pubdb1.desy.de | no-pdf-link | 254879 + quod.lib.umich.edu | no-pdf-link | 250382 + encounters.hsp.org | no-pdf-link | 248132 + americanae.aecid.es | no-pdf-link | 245295 + www.irgrid.ac.cn | no-pdf-link | 242496 + publikationen.bibliothek.kit.edu | no-pdf-link | 222041 + www.sciencedirect.com | no-pdf-link | 211756 + dialnet.unirioja.es | redirect-loop | 203615 + edoc.mpg.de | no-pdf-link | 195526 + bibliotecadigital.jcyl.es | no-pdf-link | 184671 + hal.archives-ouvertes.fr | no-pdf-link | 183809 + www.sciencedirect.com | redirect-loop | 173439 + lup.lub.lu.se | no-pdf-link | 165788 + orbi.uliege.be | no-pdf-link | 158313 + www.erudit.org | success | 155986 + lib.dr.iastate.edu | success | 153384 + (30 rows) + +Follow-ups are TBD but could include: +- crawling the ~5m no-capture links directly (eg, not `base_url`) from the + ingest result JSON, while retaining the ingest request for 
later re-ingest +- investigating and iterating on PDF link extraction, both for large platforms + and randomly sampled from long tail +- classifying OAI prefixes by type (subject repository, institutional + repository, journal, national-library, historical docs, greylit, law, etc) +- running pdftrio over some/all of this corpus diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md new file mode 100644 index 0000000..36d00a1 --- /dev/null +++ b/notes/ingest/2020-05_pubmed.md @@ -0,0 +1,10 @@ + +From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1. + +Test small batch: + + zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Run the whole batch: + + zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md new file mode 100644 index 0000000..1d33162 --- /dev/null +++ b/notes/ingest/2020-07_mag.md @@ -0,0 +1,353 @@ + +Using 2020-06-25 upstream MAG corpus snapshot. + +Ran munging from `scratch:ingest/mag` notes first. + +Expecting a couple million new ingest request URLs; this is the first "patch" +MAG ingest on top of existing already-run requests. + +Planning to skip the initial bulk ingest step, on the assumption that new URLs +have either been ingested already (eg, via continuous ingest pipeline) or need +crawling. 
+ +## Generate Requests + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json + => 28.7M 2:36:48 [3.06k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json + => 5.66M 0:29:28 [ 3.2k/s] + +## Persist Ingest Requests + + # small sample + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0}) + + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0}) + +## Crawl/Dupe Status + +Overall status for old and new seeds, filtering out large (blocking) +publishers: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND 
ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 19477651 + | 8238898 + redirect-loop | 2036494 + link-loop | 1330036 + no-pdf-link | 1304820 + terminal-bad-status | 648150 + no-capture | 545785 + gateway-timeout | 200143 + cdx-error | 149995 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 57052 + wayback-error | 41032 + invalid-host-resolution | 37203 + petabox-error | 11167 + null-body | 6662 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 77 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + | 8238851 + success | 787174 + no-capture | 42864 + redirect-loop | 31718 + terminal-bad-status | 31493 + no-pdf-link | 13025 + cdx-error | 11275 + wrong-mimetype | 6238 + link-loop | 3365 + wayback-error | 748 + gateway-timeout | 506 + null-body | 
191 + spn2-cdx-lookup-failure | 99 + petabox-error | 89 + invalid-host-resolution | 70 + spn2-error | 7 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + bad-gzip-encoding | 1 + (19 rows) + +Where are no-capture results terminating? May need to add or update heritrix +crawl config so that we get better yield without needing to do SPNv2 crawling. + + SELECT initial_domain, terminal_domain, COUNT(*) + FROM ( + SELECT + ingest_file_result.status as status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + ) t1 + GROUP BY initial_domain, terminal_domain + ORDER BY COUNT DESC + LIMIT 25; + + initial_domain | terminal_domain | count + ---------------------------------+---------------------+-------- + www.researchgate.net | | 334145 + academic.oup.com | | 205820 + www.tandfonline.com | | 148638 + journals.sagepub.com | | 144196 + muse.jhu.edu | | 55957 + hrcak.srce.hr | | 25317 + www.omicsonline.org | | 22426 + link.springer.com | | 21044 + iopscience.iop.org | | 12385 + bioone.org | | 9097 + tandfonline.com | | 8512 + or.nsfc.gov.cn | | 4823 + ieeexplore.ieee.org | ieeexplore.ieee.org | 4398 + pubs.acs.org | | 3708 + archive-ouverte.unige.ch | | 2743 + dergipark.ulakbim.gov.tr | | 2677 + hal.archives-ouvertes.fr | | 1258 + dergipark.org.tr | | 1207 + apo.org.au | | 1186 + spire.sciencespo.fr | | 989 + cyberleninka.ru | | 895 + lirias.kuleuven.be | | 855 + tel.archives-ouvertes.fr | | 786 + pub.uni-bielefeld.de | | 728 + www.research-collection.ethz.ch | | 670 + (25 rows) + +## Heritrix Seedlist Generation + +Dump ingest 
requests (filtered for some domains that we don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status IS NULL) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json'; + => 8784683 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json + +Seedlist transform from here on covered in MAG crawl notes. + +## Bulk Ingest + +Run ingest requests on everything we crawled: + + cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Small sample: + + head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full run: + + cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Updated Overall Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 24574294 + redirect-loop | 2633731 + no-capture | 2458694 + no-pdf-link | 1896871 + link-loop | 1510899 + terminal-bad-status | 878821 + cdx-error | 387574 + gateway-timeout | 200246 + | 170304 + wayback-error | 97572 + spn2-cdx-lookup-failure | 80284 + wrong-mimetype | 65097 + invalid-host-resolution | 37204 + petabox-error | 12097 + null-body | 8549 + spn2-error | 1706 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 90 + (20 rows) + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 24557382 + redirect-loop | 2630582 + 
no-capture | 1947066 + no-pdf-link | 1778206 + link-loop | 1510790 + terminal-bad-status | 857173 + cdx-error | 384525 + gateway-timeout | 200143 + wayback-error | 96390 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 64908 + invalid-host-resolution | 37203 + petabox-error | 12087 + null-body | 8548 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 90 + | 69 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+--------- + success | 5860601 + no-capture | 1489959 + redirect-loop | 619121 + no-pdf-link | 473703 + terminal-bad-status | 234753 + cdx-error | 231575 + link-loop | 184093 + wayback-error | 56068 + wrong-mimetype | 14046 + null-body | 2068 + petabox-error | 1006 + gateway-timeout | 506 + spn2-cdx-lookup-failure | 99 + invalid-host-resolution | 70 + | 22 + bad-redirect | 13 + spn2-error | 7 + timeout | 3 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + (20 rows) + diff --git a/notes/ingest/2020-08_daily_improvements.md b/notes/ingest/2020-08_daily_improvements.md new file mode 
100644 index 0000000..da57065 --- /dev/null +++ b/notes/ingest/2020-08_daily_improvements.md @@ -0,0 +1,202 @@ + +Goal is to increase rate of successful daily changelog crawling, but reduce +wasted attempts. + +Status by domain, past 30 days: + + domain | status | count + --------------------------------------+-----------------+------- + arxiv.org | success | 21792 + zenodo.org | success | 10646 + res.mdpi.com | success | 10449 + springernature.figshare.com | no-pdf-link | 10430 + s3-eu-west-1.amazonaws.com | success | 8966 + zenodo.org | no-pdf-link | 8137 + hkvalidate.perfdrive.com | no-pdf-link | 5943 + www.ams.org:80 | no-pdf-link | 5799 + assets.researchsquare.com | success | 4651 + pdf.sciencedirectassets.com | success | 4145 + fjfsdata01prod.blob.core.windows.net | success | 3500 + sage.figshare.com | no-pdf-link | 3174 + onlinelibrary.wiley.com | no-pdf-link | 2869 + www.e-periodica.ch | no-pdf-link | 2709 + revistas.uned.es | success | 2631 + figshare.com | no-pdf-link | 2500 + www.sciencedirect.com | link-loop | 2477 + linkinghub.elsevier.com | gateway-timeout | 1878 + downloads.hindawi.com | success | 1819 + www.scielo.br | success | 1691 + jps.library.utoronto.ca | success | 1590 + www.ams.org | no-pdf-link | 1568 + digi.ub.uni-heidelberg.de | no-pdf-link | 1496 + research-repository.griffith.edu.au | success | 1412 + journals.plos.org | success | 1330 + (25 rows) + +Status by DOI prefix, past 30 days: + + doi_prefix | status | count + ------------+-------------------------+------- + 10.6084 | no-pdf-link | 14410 <- figshare; small fraction success + 10.6084 | success | 4007 + 10.6084 | cdx-error | 1746 + + 10.13140 | gateway-timeout | 9689 <- researchgate + 10.13140 | cdx-error | 4154 + + 10.5281 | success | 9408 <- zenodo + 10.5281 | no-pdf-link | 6079 + 10.5281 | cdx-error | 3200 + 10.5281 | wayback-error | 2098 + + 10.1090 | no-pdf-link | 7420 <- AMS (ams.org) + + 10.3390 | success | 6599 <- MDPI + 10.3390 | cdx-error | 3032 + 10.3390 | 
wayback-error | 1636 + + 10.1088 | no-pdf-link | 3227 <- IOP science + + 10.1101 | gateway-timeout | 3168 <- coldspring harbor: press, biorxiv, medrxiv, etc + 10.1101 | cdx-error | 1147 + + 10.21203 | success | 3124 <- researchsquare + 10.21203 | cdx-error | 1181 + + 10.1016 | success | 3083 <- elsevier + 10.1016 | cdx-error | 2465 + 10.1016 | gateway-timeout | 1682 + 10.1016 | wayback-error | 1567 + + 10.25384 | no-pdf-link | 3058 <- sage figshare + 10.25384 | success | 2456 + + 10.1007 | gateway-timeout | 2913 <- springer + 10.1007 | cdx-error | 1164 + + 10.5944 | success | 2831 + 10.1186 | success | 2650 + 10.5169 | no-pdf-link | 2644 <- www.e-periodica.ch + 10.3389 | success | 2279 + 10.24411 | gateway-timeout | 2184 <- cyberleninka.ru + 10.1038 | gateway-timeout | 2143 <- nature group + 10.1177 | gateway-timeout | 2038 <- SAGE + 10.11588 | no-pdf-link | 1574 <- journals.ub.uni-heidelberg.de (OJS?) + 10.25904 | success | 1416 + 10.1155 | success | 1304 + 10.21994 | no-pdf-link | 1268 <- loar.kb.dk + 10.18720 | spn2-cdx-lookup-failure | 1232 <- elib.spbstu.ru + 10.24411 | cdx-error | 1202 + 10.1055 | no-pdf-link | 1170 <- thieme-connect.de + (40 rows) + +code changes for ingest: +x hkvalidate.perfdrive.com: just bail when we see this +x skip large publishers which gateway-timeout (for now) + - springerlink (10.1007) + - nature group (10.1038) + - SAGE (10.1177) + - IOP (10.1088) + +fatcat: +x figshare (by `doi_prefix`): if not versioned (suffix), skip crawl +x zenodo: also try to not crawl if unversioned (group) +x figshare import metadata + +sandcrawler: +x ends with `cookieAbsent` or `cookieSet=1` -> status as cookie-blocked +x https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist +x verify that we do quick-get for arxiv.org + europmc.org (+ figshare/zenodo?) + => we were not! +x shorten post-SPNv2 CDX pause? 
for throughput, given that we are re-trying anyways +x ensure that we store uncrawled URL somewhere on no-capture status + => in HTML or last of hops + => not in DB, but that is a bigger change + +- try to get un-blocked: + - coldspring harbor has been blocking since 2020-06-22? yikes! + - cyberleninka.ru + - arxiv.org + +- no-pdf-link + x www.ams.org (10.1090) + => these seem to be stale captures, eg from 2008. newer captures have citation_pdf_url + => should consider recrawling all of ams.org? + => not sure why these crawl requests are happening only now + => on the order of 15k OA articles not in ia; 43k total not preserved + => force recrawl OA subset (DONE) + x www.e-periodica.ch (10.5169) + => TODO: dump un-preserved URLs, transform to PDF urls, heritrix crawl, re-ingest + x digi.ub.uni-heidelberg.de (10.11588) + => TODO: bulk re-enqueue? then heritrix crawl? + - https://loar.kb.dk/handle/1902/6988 (10.21994) + => TODO: bulk re-enqueue + => site was updated recently (august 2020); now it crawls fine. need to re-ingest all? + => 7433 hits + - thieme-connect.de (10.1055) + => 600k+ missing + => TODO: bulk re-enqueue? then heritrix crawl? + => https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist + => generally just need to re-crawl all? + +Unresolved: +- why so many spn2-errors on https://elib.spbstu.ru/ (10.18720)? + +## figshare + +10.6084 regular figshare +10.25384 SAGE figshare + +For sage, "collections" are bogus? can we detect these in datacite metadata? + +If figshare types like: + + ris: "GEN", + bibtex: "misc", + citeproc: "article", + schemaOrg: "Collection", + resourceType: "Collection", + resourceTypeGeneral: "Collection" + +then mark as 'stub'. + +"Additional file" items don't seem like "stub"; -> "component". + +title:"Figure {} from " -> component + +current types are mostly: article, stub, dataset, graphic, article-journal + +If DOI starts with "sage.", then publisher is "Sage" (not figshare). Container +name should be... 
sage.figshare.com? + +set version to the version from DOI + +## zenodo + +doi_prefix: 10.5281 + +if on zenodo, and has an "Identical to" relation, then this is a pre-print. in +that case, drop container_id and set container_name to zenodo.org. *But*, there +are some journals now publishing exclusively to zenodo.org, so retain that +metadata. examples: + + "Detection of keyboard vibrations and effects on perceived piano quality" + https://fatcat.wiki/release/mufzkdgt2nbzfha44o7p7gkrpy + + "Editing LAF: Educate, don't defend!" + https://zenodo.org/record/2583025 + +version number not available in zenodo metadata + +## Gitlab MR Notes + +The main goal of this group of changes is to do a better job at daily ingest. + +Currently we have on the order of 20k new releases added to the index every day, and about half of them are marked as OA (either CC license or via container being in DOAJ or ROAD), and pass some filters (eg, release_type), and are selected for ingest. Of those, about half fail to crawl to fulltext, due to blocking (gateway-timeout, cookie tests, anti-bot detection, loginwall, etc). On the other hand, we don't attempt to crawl lots of "bronze" OA, which is content that is available from the publisher website, but isn't marked explicitly OA. + +Based on investigating daily crawling from the past month (will commit these notes to sandcrawler soon), I have identified some DOI prefixes that almost always fail ingest via SPNv2. I also have some patches to sandcrawler ingest to improve ability to crawl some large repositories etc. + +Some of the biggest "OA but failed to crawl" are from figshare and zenodo, which register a relatively large fraction of daily OA DOIs. We want to crawl most of that content, but both of these platforms register at least two DOIs for each piece of content (a "group" DOI and a "versioned" DOI), and we only need to crawl one. 
There were also some changes needed to release-type filtering and assignment specific to these platforms, or based on the title of entities. + +This MR mixes changes to the datacite metadata import routine (including some refactors out of the main parse_record method) and behavior changes to the entity updater (which is where the code to decide about whether to send an ingest request on release creation lives). I will have a separate MR for importer metadata changes that don't impact ingest behavior. + diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md new file mode 100644 index 0000000..f5c853d --- /dev/null +++ b/notes/ingest/2020-09_oa_doi.md @@ -0,0 +1,352 @@ + +It seems that many gold OA DOIs were not ingesting simply because the HTML +url extraction was not working for a particular version of OJS. + +Let's re-try all ~2.5 million of these in bulk mode and see how many are +'no-capture' vs. other errors, then possibly re-crawl a large number. + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json + Expecting 2569876 release objects in search queries + Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034}) + +Enqueue + + cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started at about: + + Thu Sep 17 00:15:00 UTC 2020 + 2020-09-17T00:15:00Z + +## Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND ingest_file_result.updated >= '2020-09-16' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 513462 + success | 206042 + no-pdf-link | 186779 + terminal-bad-status | 40372 + redirect-loop | 33103 + cdx-error | 24078 + link-loop | 13494 + spn2-cdx-lookup-failure | 10247 + gateway-timeout | 4407 + wrong-mimetype | 3213 + petabox-error | 866 + null-body | 449 + spn2-error | 217 + wayback-error | 129 + spn2-error:job-failed | 64 + bad-redirect | 6 + spn2-error:soft-time-limit-exceeded | 1 + (17 rows) + +This was only about half the requests. Try... broader? 
+ + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 579952 + success | 387325 + no-pdf-link | 380406 + terminal-bad-status | 63743 + redirect-loop | 53893 + cdx-error | 46024 + spn2-cdx-lookup-failure | 28347 + link-loop | 22573 + gateway-timeout | 11686 + wrong-mimetype | 6294 + null-body | 3509 + petabox-error | 2388 + spn2-error | 1023 + spn2-error:job-failed | 462 + wayback-error | 347 + spn2-error:soft-time-limit-exceeded | 20 + bad-redirect | 11 + (17 rows) + +What top domains for those `no-pdf-link` (or similar)? 
+ + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 56488 + figshare.com | no-pdf-link | 55337 + www.egms.de | redirect-loop | 22686 + zenodo.org | terminal-bad-status | 22128 + tandf.figshare.com | no-pdf-link | 20027 + springernature.figshare.com | no-pdf-link | 17181 + cairn.info | terminal-bad-status | 13836 + www.persee.fr | terminal-bad-status | 7565 + projecteuclid.org | link-loop | 7449 + www.cairn.info | no-pdf-link | 6992 + scialert.net | no-pdf-link | 6621 + www.cairn.info | link-loop | 5870 + utpjournals.press | no-pdf-link | 5772 + journals.openedition.org | redirect-loop | 5464 + www.egms.de | no-pdf-link | 5223 + archaeologydataservice.ac.uk | no-pdf-link | 4881 + rs.figshare.com | no-pdf-link | 4773 + www.degruyter.com | spn2-cdx-lookup-failure | 4763 + koreascience.or.kr | no-pdf-link | 4487 + cancerres.aacrjournals.org | no-pdf-link | 4124 + cms.math.ca | no-pdf-link | 3441 + volcano.si.edu | no-pdf-link | 3424 + www.mathnet.ru | no-pdf-link | 3229 + tidsskriftet.no | no-pdf-link | 3012 + journals.plos.org | no-pdf-link | 3005 + 
tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796 + www.cairn.info:80 | link-loop | 2647 + hammer.figshare.com | no-pdf-link | 2627 + www.psychosocial.com | no-pdf-link | 2457 + osf.io | terminal-bad-status | 2388 + (30 rows) + +Should look at link extraction for: + +- scialert.net +- utpjournals.press +- koreascience.or.kr +- cancerres.aacrjournals.org +- cms.math.ca +- volcano.si.edu +- www.mathnet.ru +- www.psychosocial.com + +## Re-Ingest + +Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + AND ingest_file_result.status = 'no-capture' + -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json'; + => COPY 579952 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json + => 579k 0:00:22 [25.9k/s] + + cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Resuming progress on this in early December 2020. 
+ +Filtered requests to re-crawl: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20') + OR (ingest_file_result.updated >= '2020-10-11')) + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO 
'/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json'; + => COPY 2352614 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json + +And actually dump seedlist(s): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + + wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt + 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + +Top DOI prefixes (same old usual suspects): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20 + 353695 10.5281 zenodo.org + 121888 10.6084 figshare.org + 115093 10.3917 cairn.info + 113252 10.3406 persee.fr + 95414 10.1515 degruyter.com + 90448 10.4324 taylorfrancis.com + 83927 10.1016 elsevier + 60303 10.1109 IEEE + 48490 10.4000 openedition.org + 28498 10.3205 egms.de + 23433 10.1163 brill.com + 23276 10.17615 cdr.lib.unc.edu + 21386 10.1093 oup.com + 20783 10.3138 utpjournals.press + 19987 10.1201 tandfonline.com + 17916 10.34847 cocoon.huma-num.fr + 16970 10.1002 wiley.com + 15958 10.1097 lww.com (and others?) 
+ 15835 10.1017 cambridge.org + 15466 10.24355 publikationsserver.tu-braunschweig.de (IR) + +Top domains (not doi.org): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 104148 zenodo.org + 85245 www.persee.fr + 52931 www.cairn.info + 4791 www.jstage.jst.go.jp + 4411 archive.monthlyreview.org + 4129 osf.io + 2841 www.indianjournals.com + 2746 www.impan.pl + 2620 platform.almanhal.com + 2019 www.nomos-elibrary.de + 1209 dergipark.org.tr + 1027 pubs.geoscienceworld.org + 973 www.pdcnet.org + 923 www.hanspub.org + 914 www.repository.cam.ac.uk + 863 mediarep.org + 812 www.cartographicperspectives.org + 687 www.degruyter.com + 578 192.168.7.24 + 566 journals.eco-vector.com + +TODO: infer `publisher_type` and platform from DOI prefix in more cases + +## Re-Ingest + +Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3 +million requests. Note these are all `pdf` requests, but crawl was done in an +HTML-friendly way, so should be able to do domain/journal-specific HTML ingests +in the future. + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Stats, for this ingest period (fuzzy; will have some daily ingest stuff): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-12-28' + AND ingest_request.created <= '2020-12-09' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -----------------------+-------- + no-pdf-link | 962714 + success | 539305 + no-capture | 306590 + redirect-loop | 192149 + link-loop | 184797 + terminal-bad-status | 141721 + wrong-mimetype | 10362 + null-body | 10277 + skip-url-blocklist | 1985 + wayback-content-error | 1300 + cdx-error | 869 + petabox-error | 160 + bad-redirect | 72 + wayback-error | 46 + bad-gzip-encoding | 7 + timeout | 1 + max-hops-exceeded | 1 + (17 rows) + diff --git a/notes/ingest/2020-09_reingest.md b/notes/ingest/2020-09_reingest.md new file mode 100644 index 0000000..ec4e536 --- /dev/null +++ b/notes/ingest/2020-09_reingest.md @@ -0,0 +1,197 @@ + +Goal: re-bulk-ingest some older existing crawls which hung on errors like +`cdx-error` or `wayback-error`, indicating that ingest might actually succeed +on retry. 
+ +Sources: +- unpaywall (again) +- doi (ingest, changelog, etc) +- mag +- oai + +## DOI + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 25; + + status | count + -------------------------------------+--------- + no-pdf-link | 8304582 + success | 3461708 + no-capture | 1881269 + redirect-loop | 1851541 + gateway-timeout | 355820 + cdx-error | 341848 + terminal-bad-status | 328650 + skip-url-blocklist | 220474 + spn2-cdx-lookup-failure | 125521 + link-loop | 109352 + wayback-error | 101525 + null-body | 73539 + wrong-mimetype | 53151 + spn-error | 13579 + spn2-error | 6848 + spn2-error:job-failed | 4381 + spn-remote-error | 4180 + other-mimetype | 2305 + petabox-error | 904 + timeout | 710 + spn2-error:soft-time-limit-exceeded | 557 + spn2-error:proxy-error | 437 + spn2-error:browser-running-error | 273 + invalid-host-resolution | 233 + pending | 116 + (25 rows) + +Bulk: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_doi_errors_2020-09-03.rows.json'; + => 443421 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +Additional 27,779 success status? Hard to tell because lots of other ingest +running in parallel. + +Live: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_file_result.status = 'spn-error' OR + ingest_file_result.status = 'spn2-cdx-lookup-failure' OR + ingest_file_result.status = 'spn2-error:job-failed' OR + ingest_file_result.status = 'spn2-error:proxy-error' + ) + ) TO '/grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json'; + => 143984 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Unpaywall (again) + +Bulk: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json'; + => 43912 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +## MAG + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_mag_errors_2020-09-03.rows.json'; + => 188175 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_mag_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +## OAI-PMH + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_oai_errors_2020-09-03.rows.json'; + => 851056 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_oai_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +--------- + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + diff --git a/notes/ingest/2020-09_scielo.md b/notes/ingest/2020-09_scielo.md new 
file mode 100644 index 0000000..4ec6fbd --- /dev/null +++ b/notes/ingest/2020-09_scielo.md @@ -0,0 +1,21 @@ + +As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing +fatcat releases with no IA copy and with `publisher_type:scielo`. There are +200k+ such releases. + +It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008 + +Could try XML ingest of these! + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json + Expecting 212529 release objects in search queries + +Enqueue + + cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done 2020-09-14 + diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md new file mode 100644 index 0000000..d2bb50b --- /dev/null +++ b/notes/ingest/2020-10_daily.md @@ -0,0 +1,193 @@ + +Quick notes on how daily ingest is going, circa September/October 2020. 
+ + + SELECT ingest_request.ingest_type, + date(ingest_request.created), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created) + ORDER BY date(ingest_request.created) DESC; + + ingest_type | date | total | success + -------------+------------+-------+--------- + pdf | 2020-10-10 | 6145 | 1368 + pdf | 2020-10-09 | 28453 | 6461 + pdf | 2020-10-08 | 15105 | 3803 + pdf | 2020-10-07 | 34213 | 10813 + pdf | 2020-10-06 | 22263 | 8565 + pdf | 2020-10-05 | 7910 | 3200 + pdf | 2020-10-04 | 10865 | 4579 + pdf | 2020-10-03 | 27745 | 10818 + pdf | 2020-10-02 | 34320 | 13523 + pdf | 2020-10-01 | 32548 | 13252 + pdf | 2020-09-30 | 34798 | 14113 + pdf | 2020-09-29 | 22463 | 8328 + pdf | 2020-09-28 | 4117 | 1278 + pdf | 2020-09-27 | 5894 | 1732 + pdf | 2020-09-26 | 34949 | 13901 + pdf | 2020-09-25 | 33680 | 10605 + pdf | 2020-09-24 | 15125 | 5785 + pdf | 2020-09-23 | 20866 | 6584 + pdf | 2020-09-22 | 20949 | 7167 + pdf | 2020-09-21 | 22483 | 7308 + pdf | 2020-09-20 | 45644 | 16981 + pdf | 2020-09-19 | 95571 | 31991 + pdf | 2020-09-18 | 50849 | 15875 + pdf | 2020-09-17 | 20121 | 3158 + pdf | 2020-09-16 | 39184 | 12150 + pdf | 2020-09-15 | 16986 | 7705 + (26 rows) + + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND 
ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + + ingest_type | status | count + -------------+-------------------------------------+-------- + pdf | success | 241047 + pdf | no-pdf-link | 143084 + pdf | spn2-cdx-lookup-failure | 108311 + pdf | gateway-timeout | 97250 + pdf | cdx-error | 61820 + pdf | link-loop | 31350 + pdf | wayback-error | 9139 + pdf | spn2-error:job-failed | 4240 + pdf | spn2-error | 3893 + pdf | wrong-mimetype | 1010 + pdf | no-capture | 851 + pdf | null-body | 605 + pdf | redirect-loop | 261 + pdf | spn2-error:soft-time-limit-exceeded | 126 + pdf | terminal-bad-status | 120 + pdf | petabox-error | 105 + pdf | timeout | 29 + pdf | spn2-error:no-status | 2 + pdf | spn2-error:invalid-server-response | 2 + pdf | bad-gzip-encoding | 1 + (20 rows) + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 52767 + www.degruyter.com | link-loop | 17666 + www.degruyter.com | spn2-cdx-lookup-failure | 17597 + ieeexplore.ieee.org | gateway-timeout | 15290 + www.sciencedirect.com | no-pdf-link | 14043 + 
apps.crossref.org | no-pdf-link | 11531 + figshare.com | no-pdf-link | 8966 + tandf.figshare.com | no-pdf-link | 7276 + zenodo.org | no-capture | 7191 + springernature.figshare.com | no-pdf-link | 6485 + www.taylorfrancis.com | link-loop | 6266 + www.persee.fr | terminal-bad-status | 6031 + journals.openedition.org | gateway-timeout | 5639 + www.cairn.info | link-loop | 5618 + archaeologydataservice.ac.uk | no-pdf-link | 5359 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748 + www.e-periodica.ch | no-pdf-link | 4722 + osf.io | no-capture | 4247 + cancerres.aacrjournals.org | no-pdf-link | 4136 + dlc.library.columbia.edu | no-pdf-link | 4085 + www.egms.de | no-pdf-link | 3304 + journals.lww.com | no-pdf-link | 3218 + journals.plos.org | no-pdf-link | 3005 + linkinghub.elsevier.com | gateway-timeout | 2833 + www.egms.de | redirect-loop | 2606 + (25 rows) + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + --------------------------------------+---------+------- + zenodo.org | success | 55549 + arxiv.org | success | 24450 + s3-eu-west-1.amazonaws.com | success | 18156 + res.mdpi.com | success | 13493 + www.degruyter.com | success | 12009 + journals.openedition.org | success | 11235 + www.jstage.jst.go.jp | success | 9460 + peer.asee.org | success | 9416 + 
www.e-periodica.ch | success | 8105 + ir.canterbury.ac.nz | success | 6381 + europepmc.org | success | 5670 + www.repository.cam.ac.uk | success | 4858 + assets.researchsquare.com | success | 4765 + fjfsdata01prod.blob.core.windows.net | success | 4130 + tidsskrift.dk | success | 3964 + research-journal.org | success | 3127 + ieeexplore.ieee.org | success | 2947 + dergipark.org.tr | success | 2892 + watermark.silverchair.com | success | 2315 + journals.plos.org | success | 2304 + journal.fi | success | 1996 + publications.rwth-aachen.de | success | 1954 + www.brazilianjournals.com | success | 1637 + article.sciencepublishinggroup.com | success | 1589 + revistas.upr.edu | success | 1467 + (25 rows) + +Casual take-aways: +- wonder what `apps.crossref.org` is +- sciencedirect crawling broken? +- figshare might be broken? or just very little success +- seems like a lot of journals.plos.org failures diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md new file mode 100644 index 0000000..a991025 --- /dev/null +++ b/notes/ingest/2020-10_unpaywall.md @@ -0,0 +1,286 @@ + +New snapshot released 2020-10-09. Want to do a mostly straight-forward +load/ingest/crawl. 
+ +Proposed changes this time around: + +- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture` + status, and to include those URLs in heritrix3 crawl +- tweak heritrix3 config for additional PDF URL extraction patterns, + particularly to improve OJS yield + + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json + => 28.3M 3:19:03 [2.37k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 28.3M 1:11:29 [ 6.6k/s] + => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2020-10-09' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json'; + => COPY 4216339 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json + => 4.22M 0:02:48 [ 25k/s] + +Start small, to test no-capture behavior: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +`no-capture` change looks good. 
Enqueue the whole batch: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 23661282 + no-capture | 3015447 + no-pdf-link | 2302102 + redirect-loop | 1542566 + terminal-bad-status | 1044676 + wrong-mimetype | 114315 + link-loop | 36358 + cdx-error | 20150 + null-body | 14513 + wayback-error | 13644 + gateway-timeout | 3776 + spn2-cdx-lookup-failure | 1260 + petabox-error | 1171 + redirects-exceeded | 752 + invalid-host-resolution | 464 + spn2-error | 147 + bad-redirect | 131 + spn2-error:job-failed | 91 + wayback-content-error | 45 + timeout | 19 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url 
NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + ) t1 + ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json'; + => 2,936,404 + + # TODO: in the future also exclude "www.archive.org" + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json + +And actually dump seedlist(s): + + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt + + wc -l unpaywall_seedlist_2020-11-02.*.txt + 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt + 2713866 unpaywall_seedlist_2020-11-02.url.txt + +With things like jsessionid, suspect that crawling just the terminal URLs is +going to work better than both full and terminal. 
+ +Finding a fraction of `no-capture` which have partial/stub URLs as terminal. + +TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP). + + +## Bulk Ingest and Status + +Note, removing archive.org links: + + cat /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json | rg -v www.archive.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Overall status (checked 2020-12-08): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 25004559 + no-pdf-link | 2531841 + redirect-loop | 1671375 + terminal-bad-status | 1389463 + no-capture | 893880 + wrong-mimetype | 119332 + link-loop | 66508 + wayback-content-error | 30339 + cdx-error | 21790 + null-body | 20710 + wayback-error | 13976 + gateway-timeout | 3775 + petabox-error | 2420 + spn2-cdx-lookup-failure | 1218 + redirects-exceeded | 889 + invalid-host-resolution | 464 + bad-redirect | 147 + spn2-error | 112 + spn2-error:job-failed | 91 + timeout | 21 + (20 rows) + +Ingest stats broken down by publication stage: + + SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + + release_stage | status | count + ---------------+-------------------------------------+---------- + 
accepted | success | 1101090 + accepted | no-pdf-link | 28590 + accepted | redirect-loop | 10923 + accepted | no-capture | 9540 + accepted | terminal-bad-status | 6339 + accepted | cdx-error | 952 + accepted | wrong-mimetype | 447 + accepted | link-loop | 275 + accepted | wayback-error | 202 + accepted | petabox-error | 177 + accepted | redirects-exceeded | 122 + accepted | null-body | 27 + accepted | wayback-content-error | 14 + accepted | spn2-cdx-lookup-failure | 5 + accepted | gateway-timeout | 4 + accepted | bad-redirect | 1 + published | success | 18595278 + published | no-pdf-link | 2434935 + published | redirect-loop | 1364110 + published | terminal-bad-status | 1185328 + published | no-capture | 718792 + published | wrong-mimetype | 112923 + published | link-loop | 63874 + published | wayback-content-error | 30268 + published | cdx-error | 17302 + published | null-body | 15209 + published | wayback-error | 10782 + published | gateway-timeout | 1966 + published | petabox-error | 1611 + published | spn2-cdx-lookup-failure | 879 + published | redirects-exceeded | 760 + published | invalid-host-resolution | 453 + published | bad-redirect | 115 + published | spn2-error:job-failed | 77 + published | spn2-error | 75 + published | timeout | 21 + published | bad-gzip-encoding | 5 + published | spn2-error:soft-time-limit-exceeded | 4 + published | spn2-error:pending | 1 + published | blocked-cookie | 1 + published | | 1 + published | pending | 1 + submitted | success | 5308166 + submitted | redirect-loop | 296322 + submitted | terminal-bad-status | 197785 + submitted | no-capture | 165545 + submitted | no-pdf-link | 68274 + submitted | wrong-mimetype | 5962 + submitted | null-body | 5474 + submitted | cdx-error | 3536 + submitted | wayback-error | 2992 + submitted | link-loop | 2359 + submitted | gateway-timeout | 1805 + submitted | petabox-error | 632 + submitted | spn2-cdx-lookup-failure | 334 + submitted | wayback-content-error | 57 + submitted | spn2-error | 37 
+ submitted | bad-redirect | 31 + submitted | spn2-error:job-failed | 14 + submitted | | 12 + submitted | invalid-host-resolution | 11 + submitted | redirects-exceeded | 7 + submitted | spn2-error:soft-time-limit-exceeded | 5 + submitted | bad-gzip-encoding | 1 + submitted | skip-url-blocklist | 1 + | no-pdf-link | 42 + | success | 25 + | redirect-loop | 20 + | terminal-bad-status | 11 + | no-capture | 3 + (70 rows) diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md new file mode 100644 index 0000000..f9abe09 --- /dev/null +++ b/notes/ingest/2020-11-04_arxiv.md @@ -0,0 +1,12 @@ + +Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run +a crawl. + +Crawl is now done, so going to ingest, hoping to get the majority of the +millions of remaining arxiv.org PDFs. + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l + => 1,288,559 + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md new file mode 100644 index 0000000..473dd0d --- /dev/null +++ b/notes/ingest/2020-11_doaj.md @@ -0,0 +1,295 @@ + +This is the first ingest (and crawl) of URLs from DOAJ article-level metadata. +It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in +the past. + +Working off a 2020-11-13 snapshot. 
+ +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json + => 6.7M 0:24:28 [4.57k/s] + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => ran into an error with blank `base_url` + +Second try after patches: + + zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json + => 6.7M 0:24:29 [4.56k/s] + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036}) + +## Check Pre-Crawl Status + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-------------------------+--------- + pdf | | 3711532 + html | | 2429003 + pdf | success | 454403 + pdf | redirect-loop | 48587 + pdf | no-pdf-link | 24901 + pdf | no-capture | 11569 + xml | | 9442 + pdf | link-loop | 8466 + pdf | terminal-bad-status | 2015 + pdf | wrong-mimetype | 1441 + pdf | null-body | 1057 + pdf | petabox-error | 299 + pdf | cdx-error | 124 + pdf | gateway-timeout | 114 + pdf | wayback-error | 77 + pdf | spn2-cdx-lookup-failure | 20 + pdf | invalid-host-resolution | 4 + pdf | spn2-error | 1 + (18 rows) + +## Dump new 
URLs, Transform, Bulk Ingest (PDF and XML only) + +Dump: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_request.link_source = 'doaj' + -- AND date(ingest_request.created) > '2020-12-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json'; + => COPY 3732543 + +Transform: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json + => 3.73M 0:02:18 [26.9k/s] + +Definitely some non-URL strings in there; should try to filter those out +earlier in the transform process. And/or have a constraint on the URL column in +the database. + +Enqueue the whole batch: + + cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started this batch off at 2020-11-19 18:10 (Pacific time) + +Stats after run: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 30; + +## Dump Seedlist + +After preliminary bulk ingest attempts, dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + AND (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + 
AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json'; + => 1,899,555 + +TODO: filter for valid URLs + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json + +And actually dump seedlist(s): + + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt + + wc -l doaj_seedlist_2020-11-19.*.txt + +## Post-Crawl Ingest + +Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ +identifiers are all in fatcat: + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + # started 2020-12-23 15:05 (Pacific) + # finished around 2020-12-31, after one long/slow partition + +Stats again after everything: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 50; + + ingest_type | status | count + -------------+--------------------------+--------- + html | wrong-scope | 1089423 + html | no-capture | 423917 + html | redirect-loop | 212910 + html | unknown-scope | 204069 + html | html-resource-no-capture | 165587 + html | success | 122937 + html | null-body | 100296 + html | wayback-content-error | 53918 + html | wrong-mimetype | 18908 + html | terminal-bad-status | 14059 + html | petabox-error | 13520 + html | cdx-error | 6823 + html | wayback-error | 890 + html | | 620 + html | blocked-cookie | 543 + html | blocked-captcha | 250 + html | redirects-exceeded | 135 + html | too-many-resources | 111 + html | max-hops-exceeded | 84 + html | bad-redirect | 3 + pdf | success | 2851324 + pdf | no-pdf-link | 529914 + pdf | redirect-loop | 349494 + pdf | no-capture | 272202 + pdf | null-body | 129027 + pdf | terminal-bad-status | 91796 + pdf | link-loop | 25267 + pdf | wrong-mimetype | 6504 + pdf | wayback-error | 2968 + pdf | | 2068 + pdf | wayback-content-error | 1548 + pdf | cdx-error | 1095 + pdf | petabox-error | 1024 + pdf | bad-redirect | 203 + pdf | redirects-exceeded | 135 + pdf | timeout | 20 + pdf | max-hops-exceeded | 19 + pdf | bad-gzip-encoding | 2 + xml | success | 6897 + xml | null-body | 2353 + xml | wrong-mimetype | 184 + xml | no-capture | 5 + xml | cdx-error | 3 + (43 rows) + + +And 
on filtered subset that we actually crawled: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + AND (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 50; + + ingest_type | status | count + -------------+-----------------------+--------- + pdf | success | 2851286 + pdf | no-pdf-link | 527495 + pdf | redirect-loop | 345138 + 
pdf | no-capture | 268140 + pdf | null-body | 129027 + pdf | terminal-bad-status | 91125 + pdf | link-loop | 25267 + pdf | wrong-mimetype | 6504 + pdf | wayback-error | 2907 + pdf | petabox-error | 363 + pdf | wayback-content-error | 242 + pdf | bad-redirect | 203 + pdf | redirects-exceeded | 135 + pdf | max-hops-exceeded | 19 + pdf | cdx-error | 15 + pdf | bad-gzip-encoding | 2 + xml | success | 6897 + xml | null-body | 2353 + xml | wrong-mimetype | 184 + xml | no-capture | 5 + (20 rows) + diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md new file mode 100644 index 0000000..5979753 --- /dev/null +++ b/notes/ingest/2020-12-08_patch_crawl_notes.md @@ -0,0 +1,111 @@ + +Notes here about re-ingesting or re-crawling large batches. Goal around end of +2020 is to generate a broad patch crawl of terminal no-capture attempts for all +major sources crawled thus far. Have already tried running this process for unpaywall. + +For each, want filtered ingest request JSON objects (filtering out platforms +that don't crawl well, and possibly things like figshare+zenodo), and a broader +seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a +heritrix crawl with new config, then re-ingest all the requests individually. + +Summary of what to do here: + + OA DOI: expecting some 2.4 million seeds + OAI-PMH: expecting some 5 million no-capture URLs, plus more from missing PDF URL not found + Unpaywall: another ~900k no-capture URLs (maybe filtered?) + +For all, re-attempt for these status codes: + + no-capture + cdx-error + wayback-error + petabox-error + gateway-timeout (?) + +And at least do bulk re-ingest for these, if updated before 2020-11-20 or so: + + no-pdf-link + +## OAI-PMH + +Need to re-ingest all of the (many!) no-capture and no-pdf-link + +TODO: repec-specific URL extraction? 
+ +Skip these OAI prefixes: + + kb.dk + bnf.fr + hispana.mcu.es + bdr.oai.bsb-muenchen.de + ukm.si + hsp.org + +Skip these domains: + + www.kb.dk (kb.dk) + kb-images.kb.dk (kb.dk) + mdz-nbn-resolving.de (TODO: what prefix?) + aggr.ukm.um.si (ukm.si) + +Check PDF link extraction for these prefixes, or skip them (TODO): + + repec (mixed success) + biodiversitylibrary.org + juser.fz-juelich.de + americanae.aecid.es + www.irgrid.ac.cn + hal + espace.library.uq.edu.au + igi.indrastra.com + invenio.nusl.cz + hypotheses.org + t2r2.star.titech.ac.jp + quod.lib.umich.edu + + domain: hemerotecadigital.bne.es + domain: bib-pubdb1.desy.de + domain: publikationen.bibliothek.kit.edu + domain: edoc.mpg.de + domain: bibliotecadigital.jcyl.es + domain: lup.lub.lu.se + domain: orbi.uliege.be + +TODO: +- consider deleting ingest requests from skipped prefixes (large database use) + + +## Unpaywall + +About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`. + +Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) < '2020-11-20' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE 
'%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json'; + => COPY 1309990 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json + => 1.31M 0:00:51 [25.6k/s] + + cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md new file mode 100644 index 0000000..d7643f4 --- /dev/null +++ b/notes/ingest/2021-04_unpaywall.md @@ -0,0 +1,368 @@ + +New snapshot released 2021-02-18, finally getting around to a crawl two months +later. + +Intend to do same style of crawl as in the past. One change is that +sandcrawler-db has moved to a focal VM. + + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json + => 30.0M 3:14:59 [2.57k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2021-01-01' + 
AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json'; + => COPY 3277484 + + # previous, 2020-10 run: COPY 4216339 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json + => 3.28M 0:01:42 [32.1k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 26385866 + no-pdf-link | 2132565 + no-capture | 2092111 + redirect-loop | 1732543 + terminal-bad-status | 1504555 + wayback-content-error | 357345 + wrong-mimetype | 126070 + link-loop | 76808 + cdx-error | 22756 + null-body | 22066 + wayback-error | 13768 + gateway-timeout | 3804 + petabox-error | 3608 + spn2-cdx-lookup-failure | 1225 + redirects-exceeded | 892 + invalid-host-resolution | 505 + bad-redirect | 151 + spn2-error | 108 + spn2-error:job-failed | 91 + bad-gzip-encoding | 27 + (20 rows) + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + 
AND date(ingest_request.created) > '2021-01-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 1348623 + no-capture | 1231582 + redirect-loop | 45622 + no-pdf-link | 37312 + terminal-bad-status | 24162 + wrong-mimetype | 6684 + link-loop | 5757 + null-body | 1288 + wayback-content-error | 1123 + cdx-error | 831 + petabox-error | 697 + wayback-error | 185 + invalid-host-resolution | 41 + gateway-timeout | 29 + blocked-cookie | 22 + bad-gzip-encoding | 20 + spn2-cdx-lookup-failure | 7 + bad-redirect | 4 + timeout | 3 + redirects-exceeded | 3 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json'; + => 2020-10: 2,936,404 + => 2021-04: 1,805,192 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json + => 1.81M 0:01:27 [20.6k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt + 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt + 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt + 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt + +## Post-Crawl Bulk Ingest + + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1,804,211 consumer group lag + +## Post-Ingest Stats + +Overall status (unpaywall, all time): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 27242251 + no-pdf-link | 2746237 + redirect-loop | 1821132 + terminal-bad-status | 1553441 + no-capture | 478559 + wayback-content-error | 357390 + wrong-mimetype | 127365 + link-loop | 79389 + cdx-error | 23170 + null-body | 23169 + wayback-error | 13704 + gateway-timeout | 3803 + petabox-error | 3642 + redirects-exceeded | 1427 + spn2-cdx-lookup-failure | 1214 + invalid-host-resolution | 505 + bad-redirect | 153 + spn2-error | 107 + spn2-error:job-failed | 91 + body-too-large | 84 + (20 rows) + +Ingest stats broken down by publication stage: + + SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + release_stage | status | count + ---------------+-------------------------------------+---------- + accepted | success | 1213335 + accepted | no-pdf-link | 29292 + accepted | redirect-loop | 12769 + accepted | terminal-bad-status | 11264 + accepted | no-capture | 10187 + accepted | cdx-error | 1015 + accepted | wayback-content-error | 757 + accepted | wrong-mimetype | 501 + 
accepted | link-loop | 407 + accepted | wayback-error | 207 + accepted | petabox-error | 189 + accepted | redirects-exceeded | 125 + accepted | null-body | 34 + accepted | spn2-cdx-lookup-failure | 5 + accepted | gateway-timeout | 4 + accepted | blocked-cookie | 2 + accepted | bad-redirect | 1 + accepted | body-too-large | 1 + published | success | 20196774 + published | no-pdf-link | 2647969 + published | redirect-loop | 1477558 + published | terminal-bad-status | 1320013 + published | wayback-content-error | 351931 + published | no-capture | 297603 + published | wrong-mimetype | 115440 + published | link-loop | 76431 + published | cdx-error | 18125 + published | null-body | 17559 + published | wayback-error | 10466 + published | petabox-error | 2684 + published | gateway-timeout | 1979 + published | redirects-exceeded | 947 + published | spn2-cdx-lookup-failure | 877 + published | invalid-host-resolution | 457 + published | bad-redirect | 120 + published | spn2-error:job-failed | 77 + published | spn2-error | 70 + published | body-too-large | 39 + published | bad-gzip-encoding | 24 + published | timeout | 24 + published | blocked-cookie | 23 + published | spn2-error:soft-time-limit-exceeded | 4 + published | | 2 + published | pending | 1 + published | spn2-error:pending | 1 + published | too-many-redirects | 1 + submitted | success | 5832117 + submitted | redirect-loop | 330785 + submitted | terminal-bad-status | 222152 + submitted | no-capture | 170766 + submitted | no-pdf-link | 68934 + submitted | wrong-mimetype | 11424 + submitted | null-body | 5576 + submitted | wayback-content-error | 4702 + submitted | cdx-error | 4030 + submitted | wayback-error | 3031 + submitted | link-loop | 2551 + submitted | gateway-timeout | 1820 + submitted | petabox-error | 769 + submitted | redirects-exceeded | 355 + submitted | spn2-cdx-lookup-failure | 332 + submitted | invalid-host-resolution | 48 + submitted | body-too-large | 44 + submitted | spn2-error | 37 + submitted | 
bad-redirect | 32 + submitted | spn2-error:job-failed | 14 + submitted | | 13 + submitted | spn2-error:soft-time-limit-exceeded | 5 + submitted | timeout | 4 + submitted | bad-gzip-encoding | 3 + submitted | skip-url-blocklist | 1 + | no-pdf-link | 42 + | success | 25 + | redirect-loop | 20 + | terminal-bad-status | 12 + | no-capture | 3 + (76 rows) + + +Only the recent updates: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 2192376 + no-capture | 152183 + no-pdf-link | 144174 + redirect-loop | 125988 + terminal-bad-status | 67307 + link-loop | 8292 + wrong-mimetype | 7942 + null-body | 2270 + cdx-error | 1223 + wayback-content-error | 1147 + petabox-error | 728 + wayback-error | 155 + body-too-large | 82 + invalid-host-resolution | 41 + gateway-timeout | 28 + blocked-cookie | 22 + bad-gzip-encoding | 20 + timeout | 7 + bad-redirect | 6 + redirects-exceeded | 4 + (20 rows) + +In total, this iteration of unpaywall ingest resulted in: + +- 2,703,999 raw ingest requests (new URLs total) +- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet +- 843,753 (31.2%) success from new heritrix crawling +- 2,192,376 (81.1%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md new file mode 100644 index 0000000..e8748fa --- /dev/null +++ b/notes/ingest/2021-05_daily_improvements.md @@ -0,0 +1,480 @@ + +Summary of top large broken domains (2021-04-21 "30 
day" snapshot): + +## acervus.unicamp.br + + domain | status | count +---------------------------------------+-------------------------+-------- + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + +select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5; + +http://acervus.unicamp.br/index.asp?codigo_sophia=963332 + +seems like many of these were captures with a blank page? or a redirect to +the homepage? + +http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html + +messy, going to move on. + + +## apex.ipk-gatersleben.de + +apex.ipk-gatersleben.de | | 1253 +apex.ipk-gatersleben.de | no-pdf-link | 1132 + +select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5; + +https://doi.org/10.25642/ipk/rescoll/4886 +https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331 + +seem to be datasets/species, not articles. + +prefix: 10.25642/ipk + +## crossref.org + + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + +https://doi.org/10.1515/9781501747045-013 +https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013 + +Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML. 
+ +## openedition + + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + +https://doi.org/10.4000/books.pul.34492 +https://books.openedition.org/pul/34492 + +these are not actually OA books (or at least, not all are) + +## chemrxiv.org (figshare) + + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + +https://doi.org/10.26434/chemrxiv.14411081 +https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081 + +these all seem to be *multi-file* entities, thus not good for single file ingest pipeline. + +## direct.mit.edu + + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + +https://doi.org/10.7551/mitpress/14056.003.0004 +https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext + +"not available" + +https://doi.org/10.7551/mitpress/12444.003.0004 + +"not available" + + +## dlc.library.columbia.edu + + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + +https://doi.org/10.7916/d8-506w-kk49 +https://dlc.library.columbia.edu/durst/cul:18931zcrk9 + +document repository. +this one goes to IA! actually many seem to. +added extractor, should re-ingest with: + + publisher:"Columbia University" doi_prefix:10.7916 !journal:* + +actually, that is like 600k+ results and many are not digitized, so perhaps not. + +## doi.ala.org.au + + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + +https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052 +https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052 + +this is a data repository, with filesets, not papers. datacite metadata is +incorrect. 
+ +## fldeploc.dep.state.fl.us + + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + + +https://doi.org/10.35256/ic29 +http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29 + +re-ingest with: + + # only ~800 works + doi_prefix:10.35256 publisher:Florida + +## geoscan.nrcan.gc.ca + + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + +https://doi.org/10.4095/295366 +https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366 + +this is a geographic repository, not papers. + +## kiss.kstudy.com + + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + +https://doi.org/10.22143/hss21.12.1.121 +http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523 + +Korean. seems to not actually be theses? can't download. + +## linkinghub.elsevier.com + + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + +skipping for now, looks like mostly 'forbidden'? + +## osf.io + +These are important! + + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + +https://doi.org/10.31219/osf.io/jux3w +https://accounts.osf.io/login?service=https://osf.io/jux3w/download + +many of these are 404s by browser as well. what does that mean? + +## peerj.com + + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + +https://doi.org/10.7287/peerj.11155v0.1/reviews/2 +https://peerj.com/articles/11155/reviews/ + +these are HTML reviews, not papers + +## preprints.jmir.org + + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + +https://doi.org/10.2196/preprints.22556 +https://preprints.jmir.org/preprint/22556 + +UGH, looks simple, but javascript. + +could try to re-write URL into S3 format? meh. + +## psyarxiv.com (OSF?) 
+ + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + +https://doi.org/10.31234/osf.io/5jaqg +https://psyarxiv.com/5jaqg/ + +Also infuriatingly Javascript, but can do URL hack. + +Should reingest, and potentially force-recrawl: + + # about 67k + publisher:"Center for Open Science" in_ia:false + +## publons.com + + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + +https://doi.org/10.1002/jmor.21338/v2/review1 +https://publons.com/publon/40260824/ + +These are just HTML reviews, not papers. + +## saemobilus.sae.org + + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + +https://doi.org/10.4271/as1426c +https://saemobilus.sae.org/content/as1426c + +These seem to be standards, and are not open access (paywall) + +## scholar.dkyobobook.co.kr + + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + +https://doi.org/10.22471/crisis.2021.6.1.18 +http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536 + +Korean. complex javascript, skipping. + +## unreserved.rba.gov.au + + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + +https://doi.org/10.47688/rba_archives_2006/04129 +https://unreserved.rba.gov.au/users/login + +Don't need to login when I tried in browser? document repo, not papers. 
+ +## wayf.switch.ch + + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + +https://doi.org/10.24451/arbor.11128 +https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022 + +Loginwall + +## + + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + +https://doi.org/10.5040/9781849664264.0008 +https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries + +These are primarily not OA/available. + +## + + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + +https://doi.org/10.22443/rms.emc2020.146 +https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html + +These are just abstracts, not papers. + +## Emerald + + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + +https://doi.org/10.1108/ramj-11-2020-0065 +https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html + +Note that these URLs are already HTML fulltext. but the PDF is also available and easy. 
+ +re-ingest: + + # only ~3k or so missing + doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true + +## + + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + +https://doi.org/10.5040/9781718206625.ch-002 +https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2 + +paywall + +## + + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + +https://doi.org/10.1504/ijdmb.2020.10036342 +https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb + +paywall + +## + + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + +https://doi.org/10.15258/sst.2021.49.1.07 +https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02 + +Annoying javascript, but easy to work around. + +re-ingest: + + # only a couple hundred; also re-ingest + doi_prefix:10.15258 in_ia:false year:>2018 + +## + + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + +https://doi.org/10.5771/9783748907084-439 +https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren + +Javascript obfuscated download button? + +## + + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + +https://doi.org/10.1787/543e84ed-en +https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en + +Paywall. + +## + + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + +https://doi.org/10.1364/boe.422199 +https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199 + +Some of these are "pre-registered" DOIs, not published yet. Many of the +remaining are actually HTML articles, and/or have some stuff in the +`citation_pdf_url`. A core problem is captchas. 
+ +Have started adding support to fatcat for HTML crawl type based on container. + +re-ingest: + + container_twtpsm6ytje3nhuqfu3pa7ca7u (optica) + container_cg4vcsfty5dfvgmat5wm62wgie (optics express) + +## + + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + +https://doi.org/10.1093/oseo/instance.00266789 +https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27 + +loginwall/paywall + +## + + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + +https://doi.org/10.1127/zfg/40/1996/461 +https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref + +paywall + +## + + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + +https://doi.org/10.1016/j.landurbplan.2021.104104 +https://www.sciencedirect.com/science/article/pii/S0169204621000670 + +Bunch of crazy new hacks, but seems to be working! + +re-ingest: + + # to start! about 50k + doi_prefix:10.1016 is_oa:true year:2021 + +## + + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + +https://doi.org/10.2478/awutm-2019-0012 +https://www.sciendo.com/article/10.2478/awutm-2019-0012 + +uses lots of javascript, hard to scrape. 
+ + +## Others (for reference) + + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 + +## Re-ingests + +All the above combined: + + container_twtpsm6ytje3nhuqfu3pa7ca7u (optica) + container_cg4vcsfty5dfvgmat5wm62wgie (optics express) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u + => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie + => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864}) + + # only ~800 works + doi_prefix:10.35256 publisher:Florida + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida" + => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843}) + + # only ~3k or so missing + doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald" + => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812}) + + + # only a couple hundred; also re-ingest + doi_prefix:10.15258 in_ia:false year:>2018 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org 
--allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018" + => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140}) + + # to start! about 50k + doi_prefix:10.1016 is_oa:true year:2020 + doi_prefix:10.1016 is_oa:true year:2021 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020" + => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021" + => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824}) + + pmcid:* year:2018 + pmcid:* year:2019 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018" + => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019" + => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658}) + diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md new file mode 100644 index 0000000..8b6ac09 --- /dev/null +++ b/notes/ingest/2021-07_unpaywall.md @@ -0,0 +1,320 @@ + +New snapshot released 2021-07-02. Should be "boring" ingest and crawl. 
+ + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json + => 32.2M 3:01:52 [2.95k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260}) + + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2021-01-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json'; + => COPY 3556146 + + # previous, 2020-10 run: COPY 4216339 + # previous run (unpaywall_noingest_2021-02-18): COPY 3277484 + +Oops, should have run this instead, with the date filter: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json'; + +But didn't, so processed all instead. 
+ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json + => 3.56M 0:01:59 [29.8k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done, on 2021-07-13 + + +## Check Pre-Crawl Status + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 1831827 + success | 1343604 + redirect-loop | 103999 + terminal-bad-status | 19845 + no-pdf-link | 17448 + link-loop | 5027 + wrong-mimetype | 2270 + cdx-error | 523 + body-too-large | 321 + null-body | 298 + wayback-content-error | 242 + petabox-error | 155 + gateway-timeout | 138 + invalid-host-resolution | 120 + wayback-error | 109 + blocked-cookie | 9 + timeout | 7 + | 3 + bad-redirect | 3 + spn2-cdx-lookup-failure | 3 + (20 rows) + + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND date(ingest_request.created) > '2021-07-01' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR 
ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json'; + => COPY 1743186 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json + => 1.74M 0:01:33 [18.6k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r 
.result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt + 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt + 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt + 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt + 3287992 total + +Then run crawl (see `journal-crawls` git repo). + +## Post-Crawl Bulk Ingest + + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1.74M 0:01:59 [14.6k/s] + +## Post-Ingest Stats + +Only the recent updates: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 2690258 + redirect-loop | 227328 + no-capture | 157368 + terminal-bad-status | 118943 + no-pdf-link | 92698 + blocked-cookie | 19478 + link-loop | 9249 + wrong-mimetype | 4918 + cdx-error | 1786 + wayback-error | 1497 + null-body | 1302 + body-too-large | 433 + wayback-content-error | 245 + petabox-error | 171 + gateway-timeout | 138 + invalid-host-resolution | 120 + timeout | 12 + bad-redirect | 4 + | 3 + spn2-cdx-lookup-failure | 1 + (20 rows) + +Only the recent updates, by publication stage: + + SELECT 
ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + release_stage | status | count + ---------------+-------------------------+--------- + accepted | success | 103144 + accepted | no-pdf-link | 53981 + accepted | terminal-bad-status | 4102 + accepted | link-loop | 2799 + accepted | no-capture | 2315 + accepted | redirect-loop | 2171 + accepted | blocked-cookie | 234 + accepted | cdx-error | 140 + accepted | wayback-error | 101 + accepted | wrong-mimetype | 38 + accepted | null-body | 10 + accepted | petabox-error | 5 + accepted | wayback-content-error | 4 + accepted | gateway-timeout | 2 + accepted | body-too-large | 2 + published | success | 1919100 + published | no-capture | 130104 + published | redirect-loop | 127482 + published | terminal-bad-status | 43118 + published | no-pdf-link | 33505 + published | blocked-cookie | 19034 + published | link-loop | 6241 + published | wrong-mimetype | 4163 + published | null-body | 1195 + published | cdx-error | 1151 + published | wayback-error | 1105 + published | wayback-content-error | 197 + published | body-too-large | 195 + published | petabox-error | 118 + published | gateway-timeout | 35 + published | invalid-host-resolution | 13 + published | timeout | 8 + published | bad-redirect | 2 + published | spn2-cdx-lookup-failure | 1 + published | bad-gzip-encoding | 1 + submitted | success | 668014 + submitted | redirect-loop | 97675 + submitted | terminal-bad-status | 71723 + submitted | no-capture | 24949 + submitted | no-pdf-link | 5212 + submitted | wrong-mimetype | 717 + submitted | cdx-error | 495 + 
submitted | wayback-error | 291 + submitted | body-too-large | 236 + submitted | blocked-cookie | 210 + submitted | link-loop | 209 + submitted | invalid-host-resolution | 107 + submitted | gateway-timeout | 101 + submitted | null-body | 97 + submitted | petabox-error | 48 + submitted | wayback-content-error | 44 + submitted | timeout | 4 + submitted | | 3 + submitted | bad-redirect | 2 + submitted | remote-server-error | 1 + (55 rows) + +In total, this iteration of unpaywall ingest resulted in: + +- 3,325,954 raw ingest requests (new URLs total) +- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and were attempted in this crawl +- 1,346,654 (77% of crawled) success from new heritrix crawling +- 2,690,258 (80%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) + +## Live Ingest Follow-Up + +Will run SPN requests on the ~160k `no-capture` URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json'; + => COPY 157371 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json + => 157k 0:00:04 [31.6k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + => DONE diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md new file mode 100644 index 0000000..5f92196 --- /dev/null +++ b/notes/ingest/2021-08_mag.md @@ -0,0 +1,400 @@ + +Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest. +Also want to re-ingest some old/failed ingests, now that pipeline/code has +improved. + +Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs. + + +## Persist Ingest Requests + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000}) + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request - + => 22.5M 0:46:00 [8.16k/s] + => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585}) + +Roughly 8.6 million new URLs + +## Pre-Crawl Status Counts + +Status of combined old and new requests, with some large domains removed: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT 
LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + success | 26123975 + | 6664846 + no-pdf-link | 1859908 + redirect-loop | 1532405 + no-capture | 1199126 + link-loop | 1157010 + terminal-bad-status | 832362 + gateway-timeout | 202158 + spn2-cdx-lookup-failure | 81406 + wrong-mimetype | 69087 + invalid-host-resolution | 37262 + wayback-error | 21340 + petabox-error | 11237 + null-body | 9414 + wayback-content-error | 2199 + cdx-error | 1893 + spn2-error | 1741 + spn2-error:job-failed | 971 + blocked-cookie | 902 + spn2-error:invalid-url-syntax | 336 + (20 rows) + +And just the new URLs (note that domain filter shouldn't be required, but +keeping for consistency): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE 
'%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + | 6664780 + success | 1957844 + redirect-loop | 23357 + terminal-bad-status | 9385 + no-pdf-link | 8315 + no-capture | 6892 + link-loop | 4517 + wrong-mimetype | 3864 + cdx-error | 1749 + blocked-cookie | 842 + null-body | 747 + wayback-error | 688 + wayback-content-error | 570 + gateway-timeout | 367 + petabox-error | 340 + spn2-cdx-lookup-failure | 150 + read-timeout | 122 + not-found | 119 + invalid-host-resolution | 63 + spn2-error | 23 + (20 rows) + +## Dump Initial Bulk Ingest Requests + +Note that this is all-time, not just recent, and will re-process a lot of +"no-pdf-link": + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-pdf-link' + OR ingest_file_result.status = 'cdx-error' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + 
AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json'; + => COPY 8526647 + +Transform to ingest requests: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json + => 8.53M 0:03:40 + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + +Updated stats after running initial bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 5184994 + no-capture | 3284416 + redirect-loop | 98685 + terminal-bad-status | 28733 + link-loop | 28518 + 
blocked-cookie | 22338 + no-pdf-link | 19073 + wrong-mimetype | 9122 + null-body | 2793 + wayback-error | 2128 + wayback-content-error | 1233 + cdx-error | 1198 + petabox-error | 617 + gateway-timeout | 395 + not-found | 130 + read-timeout | 128 + | 111 + invalid-host-resolution | 63 + spn2-cdx-lookup-failure | 24 + spn2-error | 20 + (20 rows) + +## Generate Seedlist + +For crawling, do a similar (but not identical) dump: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json'; + => COPY 4599519 + +Prep ingest requests (for 
post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json + => 4.60M 0:02:55 [26.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + => DONE + + wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt + 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + +## Post-Crawl Bulk Re-Ingest + +Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by +hash, URL agnostic). + +Enqueue for bulk re-ingest: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => Thu 19 Aug 2021 09:10:59 PM UTC + + +## Post-Ingest Stats + +Just the new stuff (compare against above for delta): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 7748241 89.2% + no-capture | 429688 4.9% + redirect-loop | 172831 2.0% + terminal-bad-status | 94029 1.1% + no-pdf-link | 86437 1.0% + blocked-cookie | 67903 0.8% + link-loop | 50622 + wrong-mimetype | 21064 + null-body | 6650 + cdx-error | 3313 + wayback-error | 2630 + gateway-timeout | 399 + petabox-error | 268 + wayback-content-error | 170 + not-found | 130 + read-timeout | 128 + | 109 + invalid-host-resolution | 63 + bad-redirect | 39 + spn2-error | 20 + (20 rows) + +New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397 + +Overall success of new batch: 
7748241. / 8686315 = 89.2% + +And combined (old and new) status again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 31990062 + redirect-loop | 1704717 + no-capture | 1263462 + link-loop | 1218280 + blocked-cookie | 1213838 + no-pdf-link | 1096664 + terminal-bad-status | 960070 + gateway-timeout | 202190 + wrong-mimetype | 86557 + invalid-host-resolution | 37262 + null-body | 15443 + wayback-error | 12839 + cdx-error | 4047 + spn2-error | 1731 + spn2-error:job-failed | 962 + petabox-error | 463 + wayback-content-error | 379 + spn2-error:invalid-url-syntax | 336 + spn2-error:soft-time-limit-exceeded | 203 + | 175 + (20 rows) + +New success total: 31990062 - 26123975 = 5,866,087 + +A full 1,263,462 no-capture that could be attempted... though many of those may +be excluded for a specific reason. 
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md new file mode 100644 index 0000000..fded7b3 --- /dev/null +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -0,0 +1,1578 @@ + +Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially +re-crawling content which failed to ingest the first time. + +May fold this in with more general patch crawling. + +## Basic Counts + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 14145387 + no-pdf-link | 12063022 + no-capture | 5485640 + redirect-loop | 2092705 + terminal-bad-status | 747372 + wrong-mimetype | 597219 + link-loop | 542144 + null-body | 93566 + cdx-error | 19798 + petabox-error | 17943 + | 15283 + wayback-error | 13897 + gateway-timeout | 511 + skip-url-blocklist | 184 + wayback-content-error | 146 + bad-redirect | 137 + redirects-exceeded | 120 + bad-gzip-encoding | 116 + timeout | 80 + 
blocked-cookie | 64 + (20 rows) + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 40; + + + oai_prefix | success | total + ---------------------------+---------+--------- + repec | 1133175 | 2783448 + hal | 573218 | 1049607 + www.irgrid.ac.cn | 18007 | 748828 + cds.cern.ch | 74078 | 688091 + americanae.aecid.es | 71310 | 572792 + juser.fz-juelich.de | 23026 | 518551 + espace.library.uq.edu.au | 6649 | 508960 + igi.indrastra.com | 59629 | 478577 + archive.ugent.be | 65306 | 424014 + hrcak.srce.hr | 404085 | 414897 + zir.nsk.hr | 156753 | 397200 + renati.sunedu.gob.pe | 79362 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7997 | 354529 + generic.eprints.org | 263566 | 340470 + invenio.nusl.cz | 6340 | 325867 + evastar-karlsruhe.de | 62282 | 
317952 + quod.lib.umich.edu | 5 | 309135 + diva.org | 67917 | 298348 + t2r2.star.titech.ac.jp | 1085 | 289388 + edpsciences.org | 139495 | 284972 + repository.ust.hk | 10245 | 283417 + revues.org | 151156 | 277497 + pure.atira.dk | 13492 | 260754 + bibliotecadigital.jcyl.es | 50606 | 254134 + escholarship.org/ark | 140835 | 245203 + ojs.pkp.sfu.ca | 168029 | 229387 + lup.lub.lu.se | 49358 | 226602 + library.wur.nl | 15051 | 216738 + digitalrepository.unm.edu | 111704 | 211749 + infoscience.tind.io | 60166 | 207299 + edoc.mpg.de | 0 | 205252 + erudit.org | 168490 | 197803 + delibra.bg.polsl.pl | 38666 | 196652 + n/a | 0 | 193814 + aleph.bib-bvb.de | 4349 | 186666 + serval.unil.ch | 41643 | 186372 + orbi.ulg.ac.be | 2400 | 184551 + digitalcommons.unl.edu | 144025 | 184372 + bib-pubdb1.desy.de | 33525 | 182717 + (40 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' 
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 50; + + oai_prefix | status | count + ---------------------------+---------------+--------- + repec | success | 1133175 + repec | no-pdf-link | 638105 + hal | success | 573218 + cds.cern.ch | no-capture | 540380 + repec | redirect-loop | 516451 + juser.fz-juelich.de | no-pdf-link | 477881 + americanae.aecid.es | no-pdf-link | 417766 + hrcak.srce.hr | success | 404085 + www.irgrid.ac.cn | no-pdf-link | 370908 + hal | no-pdf-link | 359252 + www.irgrid.ac.cn | no-capture | 355532 + espace.library.uq.edu.au | no-pdf-link | 320479 + igi.indrastra.com | no-pdf-link | 318242 + repec | no-capture | 316981 + invenio.nusl.cz | no-pdf-link | 309802 + rour.neicon.ru | redirect-loop | 300911 + hypotheses.org | no-pdf-link | 300251 + renati.sunedu.gob.pe | no-capture | 282800 + t2r2.star.titech.ac.jp | no-pdf-link | 272045 + generic.eprints.org | success | 263566 + quod.lib.umich.edu | no-pdf-link | 259661 + archive.ugent.be | no-capture | 256127 + evastar-karlsruhe.de | no-pdf-link | 248939 + zir.nsk.hr | link-loop | 226919 + repository.ust.hk | no-pdf-link | 208569 + edoc.mpg.de | no-pdf-link | 199758 + bibliotecadigital.jcyl.es | no-pdf-link | 188433 + orbi.ulg.ac.be | no-pdf-link | 172373 + diva.org | no-capture | 171115 + lup.lub.lu.se | no-pdf-link | 168652 + erudit.org | success | 168490 + ojs.pkp.sfu.ca | success | 168029 + lib.dr.iastate.edu | success | 158494 + zir.nsk.hr | success | 156753 + digital.kenyon.edu | success | 154900 + revues.org | success | 151156 + books.openedition.org | no-pdf-link | 149607 + freidok.uni-freiburg.de | no-pdf-link | 146837 + digitalcommons.unl.edu | success | 144025 + escholarship.org/ark | success | 140835 + culeuclid | link-loop | 140291 + edpsciences.org | success | 139495 + serval.unil.ch | no-pdf-link | 138644 + bib-pubdb1.desy.de | no-pdf-link | 133815 + krm.or.kr | no-pdf-link | 132461 + pure.atira.dk | 
no-pdf-link | 132179 + oai-gms.dimdi.de | redirect-loop | 131409 + aleph.bib-bvb.de | no-capture | 128261 + library.wur.nl | no-pdf-link | 124718 + lirias2repo.kuleuven.be | no-capture | 123106 + (50 rows) + +Note: could just delete the "excluded" rows? and not harvest them in the +future, and filter them at ingest time (in transform script). + + + +## Investigate no-pdf-link sandcrawler improvements + +Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works: + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%' + ORDER BY random() + LIMIT 10; + +Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works): + + \x auto + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + 
LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + ORDER BY random() + LIMIT 30; + +### repec (SKIP-PREFIX) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | 
oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35 +base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html +terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647 +base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf +terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published +-[ RECORD 6 
]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75 +base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec +terminal_url | https://www.jstor.org/stable/1884373 + +Huh! This is just a catalog of other domains. Should probably skip + +DONE: skip/filter repec + +### juser.fz-juelich.de (SCOPE) + +-[ RECORD 1 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:132217 +base_url | http://juser.fz-juelich.de/record/132217 +terminal_url | http://juser.fz-juelich.de/record/132217 + +Poster; no files. + +-[ RECORD 2 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:268598 +base_url | http://juser.fz-juelich.de/record/268598 +terminal_url | http://juser.fz-juelich.de/record/268598 + +Journal. + +-[ RECORD 3 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:126613 +base_url | http://juser.fz-juelich.de/record/126613 +terminal_url | http://juser.fz-juelich.de/record/126613 + +-[ RECORD 4 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:67362 +base_url | http://juser.fz-juelich.de/record/67362 +terminal_url | http://juser.fz-juelich.de/record/67362 +-[ RECORD 5 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:869189 +base_url | http://juser.fz-juelich.de/record/869189 +terminal_url | http://juser.fz-juelich.de/record/869189 +-[ RECORD 6 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:810746 +base_url | http://juser.fz-juelich.de/record/810746 +terminal_url | http://juser.fz-juelich.de/record/810746 +-[ RECORD 7 ]+------------------------------------------------------------ +oai_id | 
oai:juser.fz-juelich.de:52897 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +-[ RECORD 8 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:114755 +base_url | http://juser.fz-juelich.de/record/114755 +terminal_url | http://juser.fz-juelich.de/record/114755 +-[ RECORD 9 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:58025 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 + +The search URLs seem redundant? Not going to try to handle those. + +"Powered by Invenio v1.1.7" + +All of these examples seem to be not papers. Maybe we can filter these better +at the harvest or transform stage? + +### americanae.aecid.es (MIXED) + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:502896 +base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai +terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai + +just a metadata record? 
links to redalyc + +METADATA-ONLY + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:534600 +base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:524567 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 + +NOT-FOUND (404) + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:378914 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 + +Some single-page image archival thing? bespoke, skipping. 
+ +SKIP-BESPOKE + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:526142 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 + +NOT-FOUND (404) + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:373408 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 + +NOT-FOUND (404) + +### www.irgrid.ac.cn (SKIP-PREFIX) + +Chinese Academy of Sciences Institutional Repositories Grid + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1749980 +base_url | http://www.irgrid.ac.cn/handle/1471x/1749980 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980 + +Can't access + +FORBIDDEN + +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/857397 +base_url | http://www.irgrid.ac.cn/handle/1471x/857397 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397 + +Just linking to another IR; skip it. 
+ +http://ir.ipe.ac.cn/handle/122111/10608 + +requires login + +DONE: '/password-login;jsessionid' as a loginwall URL pattern + http://ir.ipe.ac.cn/handle/122111/10608 + http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf + +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1060447 +base_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1671377 +base_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1178430 +base_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2488017 +base_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/977147 +base_url | http://www.irgrid.ac.cn/handle/1471x/977147 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2454503 +base_url | http://ir.nwipb.ac.cn/handle/363003/9957 +terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957 + +this domain is a disappointment :( + +should continue crawling, as the metadata is open and good. but won't get fulltext?
+ +### hal (FIXED-PARTIAL) + +-[ RECORD 1 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00744951v1 +base_url | https://hal.archives-ouvertes.fr/hal-00744951 +terminal_url | https://hal.archives-ouvertes.fr/hal-00744951 + +Off-site OA link. + +FIXED-HAL + +-[ RECORD 2 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-01065398v1 +base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf +terminal_url | https://hal.archives-ouvertes.fr/index/index +-[ RECORD 3 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:lirmm-00371599v1 +base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 +terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 + +To elsevier :( + +-[ RECORD 4 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00284780v1 +base_url | https://hal.archives-ouvertes.fr/hal-00284780 +terminal_url | https://hal.archives-ouvertes.fr/hal-00284780 + +METADATA-ONLY + +-[ RECORD 5 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00186151v1 +base_url | https://hal.archives-ouvertes.fr/hal-00186151 +terminal_url | https://hal.archives-ouvertes.fr/hal-00186151 + +METADATA-ONLY + +-[ RECORD 6 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00399754v1 +base_url | https://hal.archives-ouvertes.fr/hal-00399754 +terminal_url | https://hal.archives-ouvertes.fr/hal-00399754 + +METADATA-ONLY + + +### espace.library.uq.edu.au (SKIP) + +-[ RECORD 1 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:136497 +base_url | https://espace.library.uq.edu.au/view/UQ:136497 +terminal_url | https://espace.library.uq.edu.au/view/UQ:136497 +-[ RECORD 2 ]+------------------------------------------------ 
+oai_id | oai:espace.library.uq.edu.au:uq:411389 +base_url | https://espace.library.uq.edu.au/view/UQ:411389 +terminal_url | https://espace.library.uq.edu.au/view/UQ:411389 +-[ RECORD 3 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:401773 +base_url | https://espace.library.uq.edu.au/view/UQ:401773 +terminal_url | https://espace.library.uq.edu.au/view/UQ:401773 +-[ RECORD 4 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:675334 +base_url | https://espace.library.uq.edu.au/view/UQ:675334 +terminal_url | https://espace.library.uq.edu.au/view/UQ:675334 +-[ RECORD 5 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:312311 +base_url | https://espace.library.uq.edu.au/view/UQ:312311 +terminal_url | https://espace.library.uq.edu.au/view/UQ:312311 +-[ RECORD 6 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:209401 +base_url | https://espace.library.uq.edu.au/view/UQ:209401 +terminal_url | https://espace.library.uq.edu.au/view/UQ:209401 +-[ RECORD 7 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:327188 +base_url | https://espace.library.uq.edu.au/view/UQ:327188 +terminal_url | https://espace.library.uq.edu.au/view/UQ:327188 + +Very javascript heavy (skeletal HTML). And just links to fulltext on publisher +sites. 
+ +### igi.indrastra.com (METADATA-ONLY) + +-[ RECORD 1 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:267221 +base_url | http://igi.indrastra.com/items/show/267221 +terminal_url | http://igi.indrastra.com/items/show/267221 +-[ RECORD 2 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:181799 +base_url | http://igi.indrastra.com/items/show/181799 +terminal_url | http://igi.indrastra.com/items/show/181799 +-[ RECORD 3 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:125382 +base_url | http://igi.indrastra.com/items/show/125382 +terminal_url | http://igi.indrastra.com/items/show/125382 +-[ RECORD 4 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:47266 +base_url | http://igi.indrastra.com/items/show/47266 +terminal_url | http://igi.indrastra.com/items/show/47266 +-[ RECORD 5 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:12872 +base_url | http://igi.indrastra.com/items/show/12872 +terminal_url | http://igi.indrastra.com/items/show/12872 +-[ RECORD 6 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:231620 +base_url | http://igi.indrastra.com/items/show/231620 +terminal_url | http://igi.indrastra.com/items/show/231620 + +"Proudly powered by Omeka" + +### invenio.nusl.cz (METADATA-ONLY) + + oai_id | base_url | terminal_url +----------------------------+------------------------------------+-------------------------------------- + oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409 + oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783 + oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961 + oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | 
http://invenio.nusl.cz/record/318800 + oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695 + oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393 + oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987 + oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396 + oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512 + oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631 + +Metadata only (at least this set) + +### hypotheses.org + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:mittelalter/9529 +base_url | http://mittelalter.hypotheses.org/9529 +terminal_url | https://mittelalter.hypotheses.org/9529 +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/18638 +base_url | http://archivalia.hypotheses.org/18638 +terminal_url | https://archivalia.hypotheses.org/18638 +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/13614 +base_url | http://archivalia.hypotheses.org/13614 +terminal_url | https://archivalia.hypotheses.org/13614 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:teteschercheuses/2785 +base_url | http://teteschercheuses.hypotheses.org/2785 +terminal_url | https://teteschercheuses.hypotheses.org/2785 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:altervsego/608 +base_url | http://altervsego.hypotheses.org/608 +terminal_url | http://altervsego.hypotheses.org/608 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivewk1/21905 +base_url | http://archivewk1.hypotheses.org/21905 +terminal_url | 
https://archivewk1.hypotheses.org/21905 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:slkdiaspo/3321 +base_url | http://slkdiaspo.hypotheses.org/3321 +terminal_url | https://slkdiaspo.hypotheses.org/3321 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:diga/280 +base_url | http://diga.hypotheses.org/280 +terminal_url | https://diga.hypotheses.org/280 + +These are all a big mix... basically blogs. Should continue crawling, but expect no yield. + +### t2r2.star.titech.ac.jp (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00105099 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00101346 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50161100 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00232407 +base_url | 
http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50120040 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50321440 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50235666 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 + + +### quod.lib.umich.edu + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2 +base_url | http://name.umdl.umich.edu/acf2679.0015.003 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003 +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:b14970.0001.001 +base_url | http://name.umdl.umich.edu/B14970.0001.001 +terminal_url | 
https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001 +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3 +base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3 +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43 +base_url | http://name.umdl.umich.edu/acg2248.1-16.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9 +base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9 +base_url | http://name.umdl.umich.edu/acg1336.1-24.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006 +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a +base_url | http://name.umdl.umich.edu/africanamer.0002.32a +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a + +These are... issues of journals? Should continue to crawl, but not expect much. 
+ +### evastar-karlsruhe.de (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:270011444 +base_url | https://publikationen.bibliothek.kit.edu/270011444 +terminal_url | https://publikationen.bibliothek.kit.edu/270011444 +-[ RECORD 2 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000050117 +base_url | https://publikationen.bibliothek.kit.edu/1000050117 +terminal_url | https://publikationen.bibliothek.kit.edu/1000050117 +-[ RECORD 3 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:362296 +base_url | https://publikationen.bibliothek.kit.edu/362296 +terminal_url | https://publikationen.bibliothek.kit.edu/362296 +-[ RECORD 4 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:23042000 +base_url | https://publikationen.bibliothek.kit.edu/23042000 +terminal_url | https://publikationen.bibliothek.kit.edu/23042000 +-[ RECORD 5 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000069945 +base_url | https://publikationen.bibliothek.kit.edu/1000069945 +terminal_url | https://publikationen.bibliothek.kit.edu/1000069945 + + +### repository.ust.hk + +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-67233 +base_url | http://repository.ust.hk/ir/Record/1783.1-67233 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233 +-[ RECORD 2 
]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-63232 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017 +terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-2891 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103 +terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com 
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-56231 +base_url | http://repository.ust.hk/ir/Record/1783.1-56231 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231 + +[...] + +-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-24872 +base_url | http://repository.ust.hk/ir/Record/1783.1-24872 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872 +-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-3457 +base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +terminal_url | 
http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-73215 +base_url | http://repository.ust.hk/ir/Record/1783.1-73215 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215 + +DONE: gateway.isiknowledge.com is bogus/blocking? + + +### edoc.mpg.de (SKIP-DEPRECATED) + + oai_id | base_url | terminal_url +------------------------+---------------------------+--------------------------- + oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650 + oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195 + oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655 + oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179 + oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141 + oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412 + oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531 + oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047 + oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650 + oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852 + +This whole instance seems to have been 
replaced + +### bibliotecadigital.jcyl.es (SKIP-DIGITIZED) + +-[ RECORD 1 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000039962 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +-[ RECORD 2 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14075 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +-[ RECORD 3 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:4842 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +-[ RECORD 4 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14799 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +-[ RECORD 5 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:821 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 + +Digitized images as pages; too much to deal with for now. 
+ +### orbi.ulg.ac.be + +-[ RECORD 1 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/128079 +base_url | https://orbi.uliege.be/handle/2268/128079 +terminal_url | https://orbi.uliege.be/handle/2268/128079 +-[ RECORD 2 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/67659 +base_url | https://orbi.uliege.be/handle/2268/67659 +terminal_url | https://orbi.uliege.be/handle/2268/67659 +-[ RECORD 3 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/35521 +base_url | https://orbi.uliege.be/handle/2268/35521 +terminal_url | https://orbi.uliege.be/handle/2268/35521 +-[ RECORD 4 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/107922 +base_url | https://orbi.uliege.be/handle/2268/107922 +terminal_url | https://orbi.uliege.be/handle/2268/107922 +-[ RECORD 5 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/215694 +base_url | https://orbi.uliege.be/handle/2268/215694 +terminal_url | https://orbi.uliege.be/handle/2268/215694 + +Described below. 
+ +### library.wur.nl (FIXED-BESPOKE) + + oai_id | base_url | terminal_url + -----------------------------------+------------------------------------------------+------------------------------------------------ + oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 + oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 + oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 + oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 + oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 + oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 + oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 + oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 + oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 + oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 + (10 rows) + +Seems like a one-off site? But added a pattern. 
+ +### pure.atira.dk + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | 
oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38 +base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694 +terminal_url | https://www.tandfonline.com/action/cookieAbsent +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html + +Metadata only + +DONE: /cookieAbsent is cookie block + https://www.tandfonline.com/action/cookieAbsent + +### bib-pubdb1.desy.de (FIXED-INVENIO) + +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:96756 +base_url | http://bib-pubdb1.desy.de/record/96756 +terminal_url | http://bib-pubdb1.desy.de/record/96756 + +Metadata only. + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:416556 +base_url | http://bib-pubdb1.desy.de/record/416556 +terminal_url | http://bib-pubdb1.desy.de/record/416556 + +Fixed! 
+ +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:414545 +base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:170169 +base_url | http://bib-pubdb1.desy.de/record/170169 +terminal_url | http://bib-pubdb1.desy.de/record/170169 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:191154 +base_url | http://bib-pubdb1.desy.de/record/191154 +terminal_url | http://bib-pubdb1.desy.de/record/191154 + +Metadata only + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:155092 +base_url | http://bib-pubdb1.desy.de/record/155092 +terminal_url | http://bib-pubdb1.desy.de/record/155092 + +Fixed! 
+ +-[ RECORD 8 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:97158 +base_url | http://bib-pubdb1.desy.de/record/97158 +terminal_url | http://bib-pubdb1.desy.de/record/97158 + +Metadata only + +"Powered by Invenio v1.1.7" + +Can/should skip the "search" URLs + +### serval.unil.ch + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_60346fc75171 +base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_4db47fc4b593 +base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_57aac24fe115 +base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +-[ RECORD 4 
]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_deabae6baf6c +base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_a5ec0df1370f +base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_080300c2e23c +base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679 +-[ RECORD 7 
]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_de777dd2b07f +base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F +-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_5e824e244c27 +base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27 + +Metadata only? See elsewhere. + +### Random Links + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dbc.wroc.pl:41031 +base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 +terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 + +This is some platform/package thing. PDF is in an iframe. Platform is "DLibra". +FIXED-DLIBRA + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/174291 +base_url | https://orbi.uliege.be/handle/2268/174291 +terminal_url | https://orbi.uliege.be/handle/2268/174291 + +DSpace platform. There are multiple files, and little to "select" on. 
+ +https://orbi.uliege.be/handle/2268/174200 has only a single PDF and is easier to work with + +PARTIAL-DSPACE + +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.tue.nl:664163 +base_url | http://repository.tue.nl/664163 +terminal_url | http://repository.tue.nl/664163 + +Ah, this is the Pure platform from Elsevier. +Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance + +FIXED-PURE + + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:juser.fz-juelich.de:49579 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 + +(handled above) + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/97937 +base_url | https://orcid.org/0000-0002-2066-2082 +terminal_url | https://orcid.org/0000-0002-2066-2082 + +ORCID! Skip it. + +DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time. + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:edoc.mpg.de:360269 +base_url | http://edoc.mpg.de/360269 +terminal_url | http://edoc.mpg.de/360269 + +Seems like this whole repo has disappeared, or been replaced by... pure? maybe a different pure? 
+ +DONE: edoc.mpg.de -> pure.mpg.de + +-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:books.openedition.org:msha/17716 +base_url | http://books.openedition.org/msha/17716 +terminal_url | https://books.openedition.org/msha/17716 + +Open edition is free to read HTML, but not PDF (or epub, etc). + +TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest) + +HTML-WORKED + +-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epub.oeaw.ac.at:0x003aba48 +base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf +terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf + +requires login + +FORBIDDEN + +-[ RECORD 9 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/88986 +base_url | https://orcid.org/0000-0002-4147-2560 +terminal_url | https://orcid.org/0000-0002-4147-2560 + +DONE: skip orcids + +-[ RECORD 10 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-28786 +base_url | http://repository.ust.hk/ir/Record/1783.1-28786 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786 + +Generator: VuFind 5.1.1 +just a metadata record + +METADATA-ONLY + +-[ RECORD 11 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:rcin.org.pl:50797 +base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 +terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 + +Seems like a software platform? not sure. 
+ +METADATA-ONLY + +-[ RECORD 12 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dea.lib.unideb.hu:2437/69641 +base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 +terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 + +-[ RECORD 13 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871 +base_url | http://handle.unsw.edu.au/1959.4/64871 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L + +-[ RECORD 14 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:www.wbc.poznan.pl:225930 +base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 +terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 + +SOFT-404 + +-[ RECORD 15 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.erciyes.edu.tr:105 +base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105 +terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105 + +GONE (domain not registered) + +-[ RECORD 16 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:37500 +base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 +terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 + +Seems like a bespoke site + +SKIP-BESPOKE + +-[ RECORD 17 
]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50401364 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 + +METADATA-ONLY + +-[ RECORD 18 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epubs.cclrc.ac.uk:work/4714 +base_url | http://purl.org/net/epubs/work/4714 +terminal_url | https://epubs.stfc.ac.uk/work/4714 + +It's got a purl! haha. + +METADATA-ONLY + +------ + +Another batch! With some repeat domains removed. + +-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc +base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc +terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov + +SKIP + +-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:etd.adm.unipi.it:etd-05302014-183910 +base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/ + +Some software platform? 
Pretty basic/bespoke + +FIXED-PARTIAL + +-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000098246 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 + +SKIP (see elsewhere) + +-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:elektra.cdaea.es:documento.29259 +base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 +terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 + +Photo. + +SKIP-SCOPE + +-[ RECORD 9 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829 +base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L + +METADATA-ONLY + +-[ RECORD 12 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a +base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html +terminal_url | 
http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html + +unsure + +-[ RECORD 16 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.wur.nl:wurpubs/369344 +base_url | https://library.wur.nl/WebQuery/wurpubs/369344 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344 + +this specific record not OA (but site is fine/fixed) + +-[ RECORD 17 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:escholarship.umassmed.edu:oapubs-2146 +base_url | https://escholarship.umassmed.edu/oapubs/1147 +terminal_url | http://escholarship.umassmed.edu/oapubs/1147/ + +just links to publisher (no content in repo) + +-[ RECORD 18 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010 +base_url | https://digitalcommons.usu.edu/wild_facpub/11 +terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/ + +also just links to publisher (no content in repo) + +-[ RECORD 25 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:igi.indrastra.com:306768 +base_url | http://igi.indrastra.com/items/show/306768 +terminal_url | http://igi.indrastra.com/items/show/306768 + +(see elsewhere) + +-[ RECORD 26 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:fau.digital.flvc.org:fau_9804 +base_url | http://purl.flvc.org/fcla/dt/12932 
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804 + +Islandora. + +-[ RECORD 27 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.lu.lv:7/16019 +base_url | https://dspace.lu.lv/dspace/handle/7/16019 +terminal_url | https://dspace.lu.lv/dspace/handle/7/16019 + +LOGINWALL + +-[ RECORD 28 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:zir.nsk.hr:umas_218 +base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 +terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 + +REMOVED + + +-[ RECORD 29 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:36390 +base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 +terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 + +Book, with chapters, not an individual work. 
+ +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:krm.or.kr:10056135m201r +base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y +terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135 + +research results repository; keep crawling + +SKIP-SCOPE + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:www.db-thueringen.de:dbt_mods_00005191 +base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 +terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 + +powered by "MyCoRe" + +FIXED-MYCORE + +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405 +base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 +terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 + +seems to be a general purpose regional library? 
not research-specific + +SKIP-UNSURE + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:etd.adm.unipi.it:etd-02272019-123644 +base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/ + +This specific URL is not available (FORBIDDEN) + +others have multiple files, not just a single PDF: +https://etd.adm.unipi.it/t/etd-09102013-124430/ + +SKIP-UNSURE + +-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:commons.ln.edu.hk:sw_master-5408 +base_url | https://commons.ln.edu.hk/sw_master/4408 +terminal_url | https://commons.ln.edu.hk/sw_master/4408/ + +worth crawling I guess + +METADATA-ONLY + +-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:mouseion.jax.org:ssbb1976-1224 +base_url | https://mouseion.jax.org/ssbb1976/225 +terminal_url | https://mouseion.jax.org/ssbb1976/225/ + +METADATA-ONLY + +-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aleph.bib-bvb.de:bvb01-016604343 +base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer +terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true + +SOFT-404 / FORBIDDEN (cookie timeout) + +-[ RECORD 14 
]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bivaldi.gva.es:11740 +base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 +terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 + + +-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:library.wur.nl:wurpubs/443282 +base_url | https://library.wur.nl/WebQuery/wurpubs/443282 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282 + +DIGIBIS platform (like some others) + +FIXED-PARTIAL + +-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:hal:in2p3-00414135v1 +base_url | http://hal.in2p3.fr/in2p3-00414135 +terminal_url | http://hal.in2p3.fr:80/in2p3-00414135 + +METADATA-ONLY + +-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aaltodoc.aalto.fi:123456789/13201 +base_url | https://aaltodoc.aalto.fi/handle/123456789/13201 +terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201 + +This specific record is not accessible. 
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002 + +DSpace 5.4 + +Worked (from recent changes) + + +-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:sedici.unlp.edu.ar:10915/40144 +base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view +terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view + +This is a journal! Cool. Plone software platform. + +FIXED + +## Top no-capture Domains + +Top terminal no-capture domains: + + SELECT domain, COUNT(domain) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_file_result.status = 'no-capture' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 30; + + domain | count + -----------------------------------+------- + digitalrepository.unm.edu | 94087 + escholarship.org | 80632 + ir.opt.ac.cn | 70504 + idus.us.es | 67908 + www.cambridge.org | 56376 + www.ssoar.info | 52534 + rep.bntu.by | 52127 + scholarworks.umt.edu | 48546 + publikationen.ub.uni-frankfurt.de | 46987 + dk.um.si | 45753 + repositorio.uladech.edu.pe | 37028 + uu.diva-portal.org | 34929 + digitalcommons.law.byu.edu | 31732 + sedici.unlp.edu.ar | 31233 + elib.sfu-kras.ru | 29131 + jyx.jyu.fi | 28144 + www.repository.cam.ac.uk | 27728 + nagoya.repo.nii.ac.jp | 26673 + www.duo.uio.no | 25258 + www.persee.fr | 24968 + www2.senado.leg.br | 24426 + tesis.ucsm.edu.pe | 24049 + digitalcommons.unl.edu | 
21974 + www.degruyter.com | 21940 + www.igi-global.com | 20736 + thekeep.eiu.edu | 20712 + docs.lib.purdue.edu | 20538 + repositorio.cepal.org | 20280 + elib.bsu.by | 19620 + minds.wisconsin.edu | 19473 + (30 rows) + +These all seem worth crawling. A couple publishers (cambridge.org), and +persee.fr will probably fail, but not too many URLs. + +## Summary of Filtered Prefixes and Domains (OAI-PMH) + +oai:kb.dk: + too large and generic +oai:bdr.oai.bsb-muenchen.de: + too large and generic +oai:hispana.mcu.es: + too large and generic +oai:bnf.fr: + too large and generic +oai:ukm.si: + too large and generic +oai:biodiversitylibrary.org: + redundant with other ingest and archive.org content +oai:hsp.org: + large; historical content only +oai:repec: + large; mostly (entirely?) links to publisher sites +oai:n/a: + meta? +oai:quod.lib.umich.edu: + entire issues? hard to crawl so skip for now +oai:hypotheses.org: + HTML, not PDF +oai:americanae.aecid.es: + large, complex. skip for now +oai:www.irgrid.ac.cn: + aggregator of other IRs +oai:espace.library.uq.edu.au: + large; metadata only; javascript heavy (poor heritrix crawling) +oai:edoc.mpg.de: + deprecated domain, with no redirects +oai:bibliotecadigital.jcyl.es: + digitized historical docs; hard to crawl, skip for now +oai:repository.erciyes.edu.tr: + gone (domain lapsed) +oai:krm.or.kr: + "research results repository" (metadata only) + +www.kb.dk + large, general purpose, scope +kb-images.kb.dk + deprecated +mdz-nbn-resolving.de + multiple prefixes end up here. 
historical docs, scope +aggr.ukm.um.si + large, out of scope +edoc.mpg.de + deprecated domain +doaj.org + index (metadata only) +orcid.org + out of scope +gateway.isiknowledge.com + clarivate login/paywall (skipping in ingest) + +Needs filtering to a subset of records (by 'set' or other filtering?): + +oai:igi.indrastra.com: +oai:invenio.nusl.cz: +oai:t2r2.star.titech.ac.jp: +oai:evastar-karlsruhe.de: +oai:repository.ust.hk: +oai:serval.unil.ch: +oai:pure.atira.dk: + +Filters in SQL syntax: + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND 
ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + +and in some contexts (PDFs; switch to HTML): + + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + +## Overall Summary of OAI-PMH Stuff + +Big picture is that the majority of `no-pdf-link` crawl status are because of +repository scope, record scope, or content format issues. That being said, +there was a sizable fraction of sites which were platforms (like DSpace) which +were not ingesting well. + +A significant fraction of records are "metadata only" (of papers), or non-paper +entity types (like persons, grants, or journal titles), and a growing fraction +(?) are metadata plus link to OA publisher fulltext (offsite). Might be +possible to detect these at ingest time, or earlier at OAI-PMH +harvest/transform time and filter them out. + +It may be worthwhile to attempt ingest of multiple existing captures +(timestamps) in the ingest pipeline. Eg, instead of choosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each +(or at least a couple). This is because repository software gets upgraded, so +old "no-capture" or "not found" or "link loop" type captures may work when +recrawled. 
+ +New summary with additional filters: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 12872279 + no-pdf-link | 9329602 + no-capture | 4696362 + redirect-loop | 1541458 + terminal-bad-status | 660418 + link-loop | 452831 + wrong-mimetype | 434868 + null-body | 71065 + cdx-error | 17005 + | 15275 + petabox-error | 12743 + wayback-error | 11759 + skip-url-blocklist | 182 + gateway-timeout | 122 + redirects-exceeded | 120 + bad-redirect | 117 + bad-gzip-encoding | 111 + wayback-content-error | 102 + timeout | 72 + blocked-cookie | 62 + (20 rows) + diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md new file mode 100644 index 0000000..a0bb0c5 --- /dev/null +++ b/notes/ingest/2021-09-03_daily_improvements.md @@ -0,0 +1,1021 @@ + +Periodic check-in of daily crawling/ingest. + +Overall ingest status, past 30 days: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + + ingest_type | status | count + -------------+-------------------------+-------- + pdf | no-pdf-link | 158474 + pdf | spn2-cdx-lookup-failure | 135344 + pdf | success | 127938 + pdf | spn2-error | 65411 + pdf | gateway-timeout | 63112 + pdf | blocked-cookie | 26338 + pdf | terminal-bad-status | 24853 + pdf | link-loop | 15699 + pdf | spn2-error:job-failed | 13862 + pdf | redirect-loop | 11432 + pdf | cdx-error | 2376 + pdf | too-many-redirects | 2186 + pdf | wrong-mimetype | 2142 + pdf | 
forbidden | 1758 + pdf | spn2-error:no-status | 972 + pdf | not-found | 820 + pdf | bad-redirect | 536 + pdf | read-timeout | 392 + pdf | wayback-error | 251 + pdf | remote-server-error | 220 + (20 rows) + +Hrm, that is a healthy fraction of `no-pdf-link`. + +Broken domains, past 30 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 39678 + osf.io | gateway-timeout | 29809 + acervus.unicamp.br | no-pdf-link | 21978 + osf.io | terminal-bad-status | 18727 + zenodo.org | spn2-cdx-lookup-failure | 17008 + doi.org | spn2-cdx-lookup-failure | 15503 + www.degruyter.com | no-pdf-link | 15122 + ieeexplore.ieee.org | spn2-error:job-failed | 12921 + osf.io | spn2-cdx-lookup-failure | 11123 + www.tandfonline.com | blocked-cookie | 8096 + www.morressier.com | no-pdf-link | 4655 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580 + pubs.acs.org | blocked-cookie | 4415 + www.frontiersin.org | no-pdf-link | 4163 + www.degruyter.com | spn2-cdx-lookup-failure | 3788 + www.taylorfrancis.com | no-pdf-link | 3568 + www.sciencedirect.com | no-pdf-link | 3128 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116 + acervus.unicamp.br | spn2-cdx-lookup-failure | 2797 + www.mdpi.com | 
spn2-cdx-lookup-failure | 2719 + brill.com | link-loop | 2681 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657 + www.sciencedirect.com | spn2-cdx-lookup-failure | 2546 + apps.crossref.org | no-pdf-link | 2537 + onlinelibrary.wiley.com | blocked-cookie | 2528 + (25 rows) + +Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure: + + SELECT domain, status, count + FROM ( + SELECT domain, status, COUNT((domain, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_file_result.status != 'spn2-cdx-lookup-failure' + ) t1 + WHERE t1.domain != '' + GROUP BY CUBE (domain, status) + ) t2 + WHERE count > 200 + ORDER BY domain ASC , count DESC; + + + domain | status | count + -----------------------------------------------------------------+-----------------------+-------- + academic.oup.com | | 2405 + academic.oup.com | no-pdf-link | 1240 + academic.oup.com | link-loop | 1010 + acervus.unicamp.br | | 21980 + acervus.unicamp.br | no-pdf-link | 21978 ** + aclanthology.org | | 208 + acp.copernicus.org | | 365 + acp.copernicus.org | success | 356 + aip.scitation.org | | 1071 + aip.scitation.org | blocked-cookie | 843 + aip.scitation.org | redirect-loop | 227 + apps.crossref.org | | 2537 + apps.crossref.org | no-pdf-link | 2537 + arxiv.org | | 17817 + arxiv.org | success | 17370 + arxiv.org | terminal-bad-status | 320 + asmedigitalcollection.asme.org | | 401 + asmedigitalcollection.asme.org | link-loop | 364 + assets.researchsquare.com | | 3706 + 
assets.researchsquare.com | success | 3706 + avmj.journals.ekb.eg | | 605 + avmj.journals.ekb.eg | success | 595 + bfa.journals.ekb.eg | | 224 + bfa.journals.ekb.eg | success | 214 + biorxiv.org | redirect-loop | 895 + biorxiv.org | | 895 + birdsoftheworld.org | | 286 + birdsoftheworld.org | no-pdf-link | 285 + bmjopen.bmj.com | success | 232 + bmjopen.bmj.com | | 232 + books.openedition.org | | 396 + books.openedition.org | no-pdf-link | 396 + brill.com | | 4272 + brill.com | link-loop | 2681 + brill.com | no-pdf-link | 1410 + cas.columbia.edu | | 1038 + cas.columbia.edu | no-pdf-link | 1038 ** + cdr.lib.unc.edu | | 513 + cdr.lib.unc.edu | success | 469 + chemrxiv.org | | 278 + chemrxiv.org | success | 275 + classiques-garnier.com | | 531 + classiques-garnier.com | no-pdf-link | 487 * + content.iospress.com | | 275 + content.iospress.com | link-loop | 230 + cris.maastrichtuniversity.nl | | 318 + cris.maastrichtuniversity.nl | success | 284 + cyberleninka.ru | | 1165 + cyberleninka.ru | success | 1134 + deepblue.lib.umich.edu | | 289 + dergipark.org.tr | | 1185 + dergipark.org.tr | success | 774 + dergipark.org.tr | no-pdf-link | 320 + didaktorika.gr | | 688 + didaktorika.gr | redirect-loop | 688 + digi.ub.uni-heidelberg.de | | 292 + digi.ub.uni-heidelberg.de | no-pdf-link | 292 + direct.mit.edu | | 236 + direct.mit.edu | no-pdf-link | 207 * + dl.acm.org | | 2319 + dl.acm.org | blocked-cookie | 2230 + dmtcs.episciences.org | | 733 + dmtcs.episciences.org | success | 730 + doi.ala.org.au | no-pdf-link | 2373 ** + doi.ala.org.au | | 2373 + doi.org | | 732 + doi.org | terminal-bad-status | 673 + downloads.hindawi.com | success | 1452 + downloads.hindawi.com | | 1452 + drive.google.com | | 216 + drive.google.com | no-pdf-link | 211 + dtb.bmj.com | | 674 + dtb.bmj.com | link-loop | 669 + easy.dans.knaw.nl | no-pdf-link | 261 * + easy.dans.knaw.nl | | 261 + ebooks.marilia.unesp.br | | 688 + ebooks.marilia.unesp.br | no-pdf-link | 688 * + ehp.niehs.nih.gov | | 766 + 
ehp.niehs.nih.gov | blocked-cookie | 765 + ejournal.mandalanursa.org | | 307 + ejournal.mandalanursa.org | success | 305 + elib.spbstu.ru | | 264 + elib.spbstu.ru | redirect-loop | 257 + elibrary.ru | | 1367 + elibrary.ru | redirect-loop | 1169 + elibrary.vdi-verlag.de | | 1251 + elibrary.vdi-verlag.de | no-pdf-link | 646 + elibrary.vdi-verlag.de | link-loop | 537 + elifesciences.org | | 328 + elifesciences.org | success | 323 + figshare.com | | 803 + figshare.com | no-pdf-link | 714 * + files.osf.io | | 745 + files.osf.io | success | 614 + hammer.purdue.edu | | 244 + hammer.purdue.edu | no-pdf-link | 243 + heiup.uni-heidelberg.de | | 277 + heiup.uni-heidelberg.de | no-pdf-link | 268 + hkvalidate.perfdrive.com | no-pdf-link | 370 * + hkvalidate.perfdrive.com | | 370 + ieeexplore.ieee.org | | 16675 + ieeexplore.ieee.org | spn2-error:job-failed | 12927 + ieeexplore.ieee.org | success | 1952 + ieeexplore.ieee.org | too-many-redirects | 1193 + ieeexplore.ieee.org | no-pdf-link | 419 + jamanetwork.com | | 339 + jamanetwork.com | success | 216 + jmstt.ntou.edu.tw | | 244 + jmstt.ntou.edu.tw | success | 241 + journal.ipb.ac.id | | 229 + journal.ipb.ac.id | success | 206 + journal.nafe.org | | 221 + journals.aps.org | | 614 + journals.aps.org | gateway-timeout | 495 + journals.asm.org | | 463 + journals.asm.org | blocked-cookie | 435 + journals.flvc.org | | 230 + journals.lww.com | | 1300 + journals.lww.com | link-loop | 1284 + journals.openedition.org | | 543 + journals.openedition.org | success | 311 + journals.ub.uni-heidelberg.de | | 357 + journals.ub.uni-heidelberg.de | success | 311 + jov.arvojournals.org | | 431 + jov.arvojournals.org | no-pdf-link | 422 * + kiss.kstudy.com | | 303 + kiss.kstudy.com | no-pdf-link | 303 * + library.iated.org | | 364 + library.iated.org | redirect-loop | 264 + library.seg.org | blocked-cookie | 301 + library.seg.org | | 301 + link.aps.org | redirect-loop | 442 + link.aps.org | | 442 + linkinghub.elsevier.com | | 515 + 
linkinghub.elsevier.com | gateway-timeout | 392 + mc.sbm.org.br | | 224 + mc.sbm.org.br | success | 224 + mdpi-res.com | | 742 + mdpi-res.com | success | 742 + mdsoar.org | | 220 + mediarep.org | | 269 + mediarep.org | success | 264 + medrxiv.org | redirect-loop | 290 + medrxiv.org | | 290 + muse.jhu.edu | | 429 + muse.jhu.edu | terminal-bad-status | 391 + mvmj.journals.ekb.eg | | 306 + oapub.org | | 292 + oapub.org | success | 289 + onepetro.org | | 426 + onepetro.org | link-loop | 406 + onlinelibrary.wiley.com | | 2835 + onlinelibrary.wiley.com | blocked-cookie | 2531 + onlinelibrary.wiley.com | redirect-loop | 264 + open.library.ubc.ca | | 569 + open.library.ubc.ca | no-pdf-link | 425 * + opendata.uni-halle.de | | 407 + opendata.uni-halle.de | success | 263 + osf.io | | 49022 + osf.io | gateway-timeout | 29810 + osf.io | terminal-bad-status | 18731 + osf.io | spn2-error | 247 + osf.io | not-found | 205 + oxford.universitypressscholarship.com | | 392 + oxford.universitypressscholarship.com | link-loop | 233 + panor.ru | no-pdf-link | 433 * + panor.ru | | 433 + papers.ssrn.com | | 1630 + papers.ssrn.com | link-loop | 1598 + pdf.sciencedirectassets.com | | 3063 + pdf.sciencedirectassets.com | success | 3063 + peerj.com | | 464 + peerj.com | no-pdf-link | 303 * + periodicos.ufpe.br | | 245 + periodicos.ufpe.br | success | 232 + periodicos.unb.br | | 230 + periodicos.unb.br | success | 221 + preprints.jmir.org | | 548 + preprints.jmir.org | cdx-error | 499 + publications.rwth-aachen.de | | 213 + publikationen.bibliothek.kit.edu | | 346 + publikationen.bibliothek.kit.edu | success | 314 + publikationen.uni-tuebingen.de | | 623 + publikationen.uni-tuebingen.de | no-pdf-link | 522 * + publons.com | no-pdf-link | 934 * + publons.com | | 934 + pubs.acs.org | | 4507 + pubs.acs.org | blocked-cookie | 4406 + pubs.rsc.org | | 1638 + pubs.rsc.org | link-loop | 1054 + pubs.rsc.org | redirect-loop | 343 + pubs.rsc.org | success | 201 + repositorio.ufu.br | | 637 + 
repositorio.ufu.br | success | 607 + repository.dri.ie | | 1852 + repository.dri.ie | no-pdf-link | 1852 ** + repository.library.brown.edu | | 293 + repository.library.brown.edu | no-pdf-link | 291 * + res.mdpi.com | | 10367 + res.mdpi.com | success | 10360 + retrovirology.biomedcentral.com | | 230 + revistas.ufrj.br | | 284 + revistas.ufrj.br | success | 283 + revistas.uptc.edu.co | | 385 + revistas.uptc.edu.co | success | 344 + royalsocietypublishing.org | | 231 + rsdjournal.org | | 347 + rsdjournal.org | success | 343 + s3-ap-southeast-2.amazonaws.com | | 400 + s3-ap-southeast-2.amazonaws.com | success | 392 + s3-eu-west-1.amazonaws.com | | 2096 + s3-eu-west-1.amazonaws.com | success | 2091 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286 + s3.ca-central-1.amazonaws.com | | 202 + sage.figshare.com | | 242 + sage.figshare.com | no-pdf-link | 241 + sajeb.org | | 246 + sajeb.org | no-pdf-link | 243 + scholar.dkyobobook.co.kr | | 332 + scholar.dkyobobook.co.kr | no-pdf-link | 328 * + search.mandumah.com | | 735 + search.mandumah.com | redirect-loop | 726 + secure.jbs.elsevierhealth.com | | 1112 + secure.jbs.elsevierhealth.com | blocked-cookie | 1108 + stm.bookpi.org | no-pdf-link | 468 * + stm.bookpi.org | | 468 + storage.googleapis.com | | 1012 + storage.googleapis.com | success | 1012 + tandf.figshare.com | | 469 + tandf.figshare.com | no-pdf-link | 466 + teses.usp.br | | 739 + teses.usp.br | success | 730 + tidsskrift.dk | | 360 + tidsskrift.dk | success | 346 + tiedejaedistys.journal.fi | | 224 + tind-customer-agecon.s3.amazonaws.com | success | 332 + tind-customer-agecon.s3.amazonaws.com | | 332 + valep.vc.univie.ac.at | no-pdf-link | 280 + valep.vc.univie.ac.at | | 280 + watermark.silverchair.com | | 1729 + watermark.silverchair.com | success | 1719 + www.academia.edu | | 387 + www.academia.edu | no-pdf-link | 386 + www.ahajournals.org | | 430 + 
www.ahajournals.org | blocked-cookie | 413 + www.atenaeditora.com.br | | 572 + www.atenaeditora.com.br | terminal-bad-status | 513 + www.atlantis-press.com | success | 722 + www.atlantis-press.com | | 722 + www.aup-online.com | | 419 + www.aup-online.com | no-pdf-link | 419 * + www.beck-elibrary.de | | 269 + www.beck-elibrary.de | no-pdf-link | 268 * + www.biodiversitylibrary.org | no-pdf-link | 528 * + www.biodiversitylibrary.org | | 528 + www.bloomsburycollections.com | | 623 + www.bloomsburycollections.com | no-pdf-link | 605 * + www.cabi.org | | 2191 + www.cabi.org | no-pdf-link | 2186 * + www.cairn.info | | 1283 + www.cairn.info | no-pdf-link | 713 + www.cairn.info | link-loop | 345 + www.cambridge.org | | 4128 + www.cambridge.org | no-pdf-link | 1531 + www.cambridge.org | success | 1441 + www.cambridge.org | link-loop | 971 + www.cureus.com | no-pdf-link | 526 * + www.cureus.com | | 526 + www.dbpia.co.kr | | 637 + www.dbpia.co.kr | redirect-loop | 631 + www.deboni.he.com.br | | 382 + www.deboni.he.com.br | success | 381 + www.degruyter.com | | 17783 + www.degruyter.com | no-pdf-link | 15102 + www.degruyter.com | success | 2584 + www.dovepress.com | | 480 + www.dovepress.com | success | 472 + www.e-manuscripta.ch | | 1350 + www.e-manuscripta.ch | no-pdf-link | 1350 * + www.e-periodica.ch | | 1276 + www.e-periodica.ch | no-pdf-link | 1275 + www.e-rara.ch | | 202 + www.e-rara.ch | no-pdf-link | 202 + www.elgaronline.com | | 495 + www.elgaronline.com | link-loop | 290 + www.elibrary.ru | | 922 + www.elibrary.ru | no-pdf-link | 904 + www.emerald.com | | 2155 + www.emerald.com | no-pdf-link | 1936 * + www.emerald.com | success | 219 + www.eurekaselect.com | | 518 + www.eurekaselect.com | no-pdf-link | 516 * + www.frontiersin.org | | 4163 + www.frontiersin.org | no-pdf-link | 4162 ** + www.hanser-elibrary.com | | 444 + www.hanser-elibrary.com | blocked-cookie | 444 + www.hanspub.org | | 334 + www.hanspub.org | no-pdf-link | 314 + www.idunn.no | | 1736 + www.idunn.no 
| link-loop | 596 + www.idunn.no | success | 577 + www.idunn.no | no-pdf-link | 539 + www.igi-global.com | terminal-bad-status | 458 + www.igi-global.com | | 458 + www.ijcai.org | | 533 + www.ijcai.org | success | 532 + www.ijraset.com | success | 385 + www.ijraset.com | | 385 + www.inderscience.com | | 712 + www.inderscience.com | no-pdf-link | 605 * + www.ingentaconnect.com | | 456 + www.ingentaconnect.com | no-pdf-link | 413 * + www.internationaljournalssrg.org | | 305 + www.internationaljournalssrg.org | no-pdf-link | 305 * + www.isca-speech.org | | 2392 + www.isca-speech.org | no-pdf-link | 2391 ** + www.journals.uchicago.edu | | 228 + www.journals.uchicago.edu | blocked-cookie | 227 + www.jstage.jst.go.jp | | 1492 + www.jstage.jst.go.jp | success | 1185 + www.jstage.jst.go.jp | no-pdf-link | 289 + www.jstor.org | | 301 + www.jurology.com | | 887 + www.jurology.com | redirect-loop | 887 + www.karger.com | | 318 + www.liebertpub.com | | 507 + www.liebertpub.com | blocked-cookie | 496 + www.morressier.com | | 4781 + www.morressier.com | no-pdf-link | 4655 ** + www.ncl.ecu.edu | | 413 + www.ncl.ecu.edu | success | 413 + www.nomos-elibrary.de | | 526 + www.nomos-elibrary.de | no-pdf-link | 391 + www.oecd-ilibrary.org | no-pdf-link | 1170 ** + www.oecd-ilibrary.org | | 1170 + www.openagrar.de | no-pdf-link | 221 + www.openagrar.de | | 221 + www.osapublishing.org | | 900 + www.osapublishing.org | link-loop | 615 + www.osapublishing.org | no-pdf-link | 269 + www.osti.gov | | 630 + www.osti.gov | link-loop | 573 + www.oxfordlawtrove.com | no-pdf-link | 476 * + www.oxfordlawtrove.com | | 476 + www.pdcnet.org | | 298 + www.pdcnet.org | terminal-bad-status | 262 + www.pedocs.de | | 203 + www.pnas.org | | 222 + www.preprints.org | | 372 + www.preprints.org | success | 366 + www.repository.cam.ac.uk | | 801 + www.repository.cam.ac.uk | success | 359 + www.repository.cam.ac.uk | no-pdf-link | 239 + www.research-collection.ethz.ch | | 276 + www.research-collection.ethz.ch | 
terminal-bad-status | 274 + www.revistas.usp.br | | 207 + www.revistas.usp.br | success | 204 + www.rina.org.uk | no-pdf-link | 1009 ** + www.rina.org.uk | | 1009 + www.schweizerbart.de | no-pdf-link | 202 + www.schweizerbart.de | | 202 + www.scielo.br | | 544 + www.scielo.br | redirect-loop | 526 + www.sciencedirect.com | | 3901 + www.sciencedirect.com | no-pdf-link | 3127 ** + www.sciencedirect.com | link-loop | 701 + www.sciendo.com | | 384 + www.sciendo.com | success | 363 + www.sciengine.com | | 225 + www.scirp.org | | 209 + www.spandidos-publications.com | | 205 + www.tandfonline.com | | 8925 + www.tandfonline.com | blocked-cookie | 8099 + www.tandfonline.com | terminal-bad-status | 477 + www.tandfonline.com | redirect-loop | 322 + www.taylorfrancis.com | | 6119 + www.taylorfrancis.com | no-pdf-link | 3567 + www.taylorfrancis.com | link-loop | 2169 + www.taylorfrancis.com | terminal-bad-status | 353 + www.thieme-connect.de | | 1047 + www.thieme-connect.de | redirect-loop | 472 + www.thieme-connect.de | spn2-error:job-failed | 343 + www.tib.eu | | 206 + www.trp.org.in | | 311 + www.trp.org.in | success | 311 + www.un-ilibrary.org | no-pdf-link | 597 * + www.un-ilibrary.org | | 597 + www.vr-elibrary.de | | 775 + www.vr-elibrary.de | blocked-cookie | 774 + www.wjgnet.com | | 204 + www.wjgnet.com | no-pdf-link | 204 + www.worldscientific.com | | 974 + www.worldscientific.com | blocked-cookie | 971 + www.worldwidejournals.com | | 242 + www.worldwidejournals.com | no-pdf-link | 203 + www.wto-ilibrary.org | no-pdf-link | 295 + www.wto-ilibrary.org | | 295 + www.zora.uzh.ch | | 222 + zenodo.org | | 49460 + zenodo.org | no-pdf-link | 39721 + zenodo.org | success | 8954 + zenodo.org | wrong-mimetype | 562 + | | 445919 + | no-pdf-link | 168035 + | success | 140875 + | gateway-timeout | 31809 + | blocked-cookie | 26431 + | terminal-bad-status | 25625 + | link-loop | 19006 + | spn2-error:job-failed | 13962 + | redirect-loop | 12512 + | wrong-mimetype | 2302 + | spn2-error 
| 1689 + | too-many-redirects | 1203 + | bad-redirect | 732 + | cdx-error | 539 + | not-found | 420 + | spn2-error:no-status | 256 + (419 rows) + +Get random subsets by terminal domain: + + \x auto + SELECT + ingest_request.link_source_id AS link_source_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%' + ORDER BY random() + LIMIT 5; + +## acervus.unicamp.br + +Previously flagged as messy (2021-05_daily_improvements.md) + +## cas.columbia.edu + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-2ety-qm51 +base_url | https://doi.org/10.7916/d8-2ety-qm51 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-0zf6-d167 +base_url | https://doi.org/10.7916/d8-0zf6-d167 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-k6ha-sn43 +base_url | https://doi.org/10.7916/d8-k6ha-sn43 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ 
RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-bj6t-eb07 +base_url | https://doi.org/10.7916/d8-bj6t-eb07 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-xjac-j502 +base_url | https://doi.org/10.7916/d8-xjac-j502 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback + +these are not public (loginwalls) + +DONE: '/login?TARGET=' as a login wall pattern + +## doi.ala.org.au + +Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md) + +NOTE: look at ingesting datasets + +## www.isca-speech.org + +-[ RECORD 1 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2014-84 +base_url | https://doi.org/10.21437/interspeech.2014-84 +terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html +-[ RECORD 2 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2004-319 +base_url | https://doi.org/10.21437/interspeech.2004-319 +terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html +-[ RECORD 3 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2006-372 +base_url | https://doi.org/10.21437/interspeech.2006-372 +terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html +-[ RECORD 4 ]--+---------------------------------------------------------------------------------- +link_source_id | 
10.21437/interspeech.2015-588 +base_url | https://doi.org/10.21437/interspeech.2015-588 +terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html +-[ RECORD 5 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2006-468 +base_url | https://doi.org/10.21437/interspeech.2006-468 +terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html + +Bespoke site. Added rule to sandcrawler. + +NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?) + +## www.morressier.com + + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0002858v +base_url | https://doi.org/10.1115/1.0002858v +terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5 +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0003896v +base_url | https://doi.org/10.1115/1.0003896v +terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038 +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0004476v +base_url | https://doi.org/10.1115/1.0004476v +terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5 +-[ 
RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0001286v +base_url | https://doi.org/10.1115/1.0001286v +terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0000315v +base_url | https://doi.org/10.1115/1.0000315v +terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874 + +Many of these seem to be presentations, as both video and slides. PDFs seem broken though. + +NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data) + +## www.oecd-ilibrary.org + +Paywall (2021-05_daily_improvements.md) + +## www.rina.org.uk + +-[ RECORD 1 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.ws.2002.10 +base_url | https://doi.org/10.3940/rina.ws.2002.10 +terminal_url | https://www.rina.org.uk/showproducts.html?product=4116 +-[ RECORD 2 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.pass.2003.16 +base_url | https://doi.org/10.3940/rina.pass.2003.16 +terminal_url | https://www.rina.org.uk/showproducts.html?product=3566 +-[ RECORD 3 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.icsotin.2013.15 +base_url | https://doi.org/10.3940/rina.icsotin.2013.15 +terminal_url | https://www.rina.org.uk/showproducts.html?product=8017 +-[ RECORD 4 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.wfa.2010.23 +base_url | 
https://doi.org/10.3940/rina.wfa.2010.23 +terminal_url | https://www.rina.org.uk/showproducts.html?product=8177 +-[ RECORD 5 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.icsotin15.2015.01 +base_url | https://doi.org/10.3940/rina.icsotin15.2015.01 +terminal_url | https://www.rina.org.uk/showproducts.html?product=7883 + +Site is broken in some way + +## www.sciencedirect.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.jhlste.2021.100332 +base_url | https://doi.org/10.1016/j.jhlste.2021.100332 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332 +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.hazadv.2021.100006 +base_url | https://doi.org/10.1016/j.hazadv.2021.100006 +terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf +-[ RECORD 3 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/b978-0-12-822844-9.00009-8 +base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8 +terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098 +-[ RECORD 4 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.colcom.2021.100490 +base_url | https://doi.org/10.1016/j.colcom.2021.100490 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308 +-[ RECORD 5 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/b978-0-323-85245-6.00012-6 +base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6 +terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126 + +These no-pdf-url ones seem to just be not OA, which is expected for much of the +domain. + +## repository.dri.ie + + link_source_id | base_url | terminal_url +-----------------------+---------------------------------------+--------------------------------------------- + 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941 + 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f + 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102 + 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t + 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726 + +"Digital repository of Ireland" + +Historical scanned content. Bespoke site. Fixed. 
+ +NOTE: recrawl/retry this domain + +## www.frontiersin.org + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/978-2-88971-147-5 +base_url | https://doi.org/10.3389/978-2-88971-147-5 +terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fnins.2021.722592 +base_url | https://doi.org/10.3389/fnins.2021.722592 +terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fcell.2021.683209 +base_url | https://doi.org/10.3389/fcell.2021.683209 +terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full +-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fmicb.2021.692474 +base_url | https://doi.org/10.3389/fmicb.2021.692474 +terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fneur.2021.676527 +base_url | https://doi.org/10.3389/fneur.2021.676527 +terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full + +All the `/research-topics/` URLs are out of scope. + +NOTE: recrawl missing frontiersin.org articles for PDFs +NOTE: recrawl missing frontiersin.org articles for XML (?) 
+ +------- + +## direct.mit.edu + +Previously "not available" (2021-05_daily_improvements.md) + +## figshare.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15052236.v6 +base_url | https://doi.org/10.6084/m9.figshare.15052236.v6 +terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6 +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.14907846.v5 +base_url | https://doi.org/10.6084/m9.figshare.14907846.v5 +terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5 +-[ RECORD 3 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15157614.v1 +base_url | https://doi.org/10.6084/m9.figshare.15157614.v1 +terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1 +-[ RECORD 4 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15172926.v1 +base_url | https://doi.org/10.6084/m9.figshare.15172926.v1 +terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1 +-[ RECORD 5 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.16532574.v1 +base_url | https://doi.org/10.6084/m9.figshare.16532574.v1 +terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1 + +NOTE: can determine from the redirect URL, I guess. This is helpful for ingest! +Could also potentially correct fatcat release_type using this info. + +We seem to be getting the ones we can (eg, papers) just fine + +## hkvalidate.perfdrive.com + +Should be skipping/bailing on this domain, but not for some reason. 
+ +-[ RECORD 1 ]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac05cc +base_url | https://doi.org/10.3847/1538-4357/ac05cc +terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 2 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac0429 +base_url | https://doi.org/10.3847/1538-4357/ac0429 +terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 3 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1149/1945-7111/ac1a85 +base_url | https://doi.org/10.1149/1945-7111/ac1a85 +terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 4 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.35848/1882-0786/ac1b0d +base_url | https://doi.org/10.35848/1882-0786/ac1b0d +terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 5 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac05ba +base_url | https://doi.org/10.3847/1538-4357/ac05ba +terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= + +Was failing to check against blocklist again at the end of attempts. + +Could retry all these to update status, but probably not worth it. 
+ +## jov.arvojournals.org + + link_source_id | base_url | terminal_url +-----------------------+---------------------------------------+------------------------------------------------------------- + 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021 + 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561 + 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057 + 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793 + 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441 + +These seem to just not be published/available yet. + +But they also use watermark.silverchair.com + +NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest +NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix + +## kiss.kstudy.com + +Previously unable to download (2021-05_daily_improvements.md) + +## open.library.ubc.ca + + link_source_id | base_url | terminal_url +--------------------+------------------------------------+---------------------------------------------------------------------------------- + 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664 + 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189 + 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487 + 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994 + 10.14288/1.0401312 | 
https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312 + +Historical newspapers, out of scope? + +Video content: +https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487 + +Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 + +NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 + + +## panor.ru + + link_source_id | base_url | terminal_url +-------------------------+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html + 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html + 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html + 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html + 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html + +"The full version of the article is available only to subscribers of the journal" + +Paywall + +## 
peerj.com + +Previously: this is HTML of reviews (2021-05_daily_improvements.md) + +NOTE: Should be HTML ingest, possibly special case scope + +## publons.com + +Previously: this is HTML (2021-05_daily_improvements.md) + +NOTE: Should be HTML ingest, possibly special case scope (length of works) + +## stm.bookpi.org + + link_source_id | base_url | terminal_url +-----------------------------+---------------------------------------------+---------------------------------------------------- + 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231 + 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096 + 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330 + 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810 + 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274 + +These are... just abstracts of articles within a book? Weird. Maybe sketchy? 
DOIs via Crossref + +## www.cabi.org + + link_source_id | base_url | terminal_url +--------------------------+------------------------------------------+---------------------------------------------------- + 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742 + 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471 + 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544 + 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117 + 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337 + +Reviews? but just abstracts? + +## www.cureus.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17547 +base_url | https://doi.org/10.7759/cureus.17547 +terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.16867 +base_url | https://doi.org/10.7759/cureus.16867 +terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer +-[ RECORD 3 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17425 +base_url | https://doi.org/10.7759/cureus.17425 +terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community +-[ RECORD 4 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17313 +base_url | https://doi.org/10.7759/cureus.17313 +terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic +-[ RECORD 5 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.16943 +base_url | https://doi.org/10.7759/cureus.16943 +terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed + +Ugh, stupid "email to get PDF". but ingest seems to work anyways? 
+ +NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar) + +## www.e-manuscripta.ch + + link_source_id | base_url | terminal_url +------------------------------+----------------------------------------------+------------------------------------------------------------------- + 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031 + 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064 + 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176 + 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200 + 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008 + +Historical docs, single pages, but do have full PDF downloads. 
+ +NOTE: re-ingest + +## www.inderscience.com + +Previously: paywall (2021-05_daily_improvements.md) + +## www.un-ilibrary.org + + link_source_id | base_url | terminal_url +----------------------------+--------------------------------------------+------------------------------------------------------------- + 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307 + 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011 + 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014 + 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020 + 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005 + +Books and chapters. Doesn't seem to have actual download ability? + +# Re-Ingest / Re-Crawl + +Using fatcat-ingest helper tool. 
+ +- www.isca-speech.org doi_prefix:10.21437 + doi:* doi_prefix:10.21437 in_ia:false + 9,233 + ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json + => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221}) +- repository.dri.ie doi_prefix:10.7486 + doi:* in_ia:false doi_prefix:10.7486 + 56,532 + ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json + => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532}) +- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link) + 25,598 + many are meeting abstracts + ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json + => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598}) +- www.cureus.com doi_prefix:10.7759 + 1,537 + ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json + => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535}) +- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta + 110,945 + TODO: all are marked 'unpublished', but that is actually probably right? +- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!) + doi:* in_ia:false doi_prefix:10.3389 + 212,370 + doi:10.3389/conf.* => most seem to be just abstracts? how many like this? + container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k) + fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz + => 191k + but many might be components? 
this is actually kind of a mess + fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz + => 19.2k + ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json + +# Remaining Tasks / Domains (TODO) + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/ingest/NEXT.md b/notes/ingest/NEXT.md new file mode 100644 index 0000000..8cdd6df --- /dev/null +++ b/notes/ingest/NEXT.md @@ -0,0 +1,52 @@ + +biorxiv +medrxiv + doi:10.1101\/20* + +persee.fr 147k + publisher:persee in_ia:false is_oa:true + https://www.persee.fr/doc/pumus_1164-5385_1992_num_2_1_1013 + +cairn.info: 161k + doi_prefix:10.3917 in_ia:false is_oa:true + https://www.cairn.info/revue-afrique-contemporaine-2011-3-page-161.htm + https://www.cairn.info/revue-cahiers-de-psychologie-clinique-2014-1-page-209.htm + +IOP OA: 169k + doi_prefix:10.1088 is_oa:true in_ia:false + +indian journals platform? 
124k + doi_prefix:10.4103 in_ia:false is_oa:true + http://www.urologyannals.com/article.asp?issn=0974-7796;year=2011;volume=3;issue=3;spage=138;epage=140;aulast=Ahmad + http://www.neurologyindia.com/article.asp?issn=0028-3886;year=2011;volume=59;issue=4;spage=612;epage=615;aulast=Utsuki + +openedition? 48k + doi_prefix:10.4000 is_oa:true in_ia:false + +german medical science (GMS) 28k + doi_prefix:10.3205 in_ia:false is_oa:true + https://www.egms.de/static/en/journals/zma/2015-32/zma000965.shtml + +serbian chemistry 28k + doi_prefix:10.2298 in_ia:false is_oa:true + http://www.doiserbia.nb.rs/Article.aspx?ID=0352-51391000105H + +jalc oa doi: 82k + doi_registrar:jalc in_ia:false is_oa:true + +sage OA papers + https://journals.sagepub.com/doi/10.1177/034003529802400510 + +Scientific Reports: 25k + in_ia:false container_id:"tnqhc2x2aneavcd3gx5h7mswhm" + +U Toronto press: 23k + publisher:"Toronto Press" in_ia:false is_oa:true + has an annoying bounce page + +ASHA (speech-language-hearing association): 7k + publisher:Speech-Language-Hearing in_ia:false is_oa:true + +MIT press journals + + diff --git a/notes/ingest/es_csv_to_json.py b/notes/ingest/es_csv_to_json.py new file mode 100755 index 0000000..4cd1811 --- /dev/null +++ b/notes/ingest/es_csv_to_json.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +""" + input like: + + doi,ident,"release_stage" + "10.7554/elife.38904",mxj534diw5gatc26rkif3io5xm,published + "10.7554/elife.41855",kag74qc6dfex7ftpfkf7iaus44,published + "10.7554/elife.41156",ienee5vxcbbbfhs2q54h4455hu,published + "10.7554/elife.43230",52rpllol2rcndjqs3xfwcldeka,published + "10.7554/elife.42591",fpz642gihrc3jd2vibg6gnjrxm,published + + output like: + + { + "base_url": "https://doi.org/10.7554/elife.38904", + "ext_ids": { + "doi": "10.7554/elife.38904" + }, + "fatcat_release": "mxj534diw5gatc26rkif3io5xm", + "release_stage": "published" + } +""" + +import csv, sys, json + +reader = csv.DictReader(sys.stdin) +for row in reader: + d = { + "base_url": 
"https://doi.org/{}".format(row['doi']), + "ext_ids": { + "doi": row['doi'], + }, + "fatcat_release": row['ident'], + "release_stage": row['release_stage'], + } + print(json.dumps(d)) |