diff options
Diffstat (limited to 'notes/ingest')
32 files changed, 10776 insertions, 0 deletions
diff --git a/notes/ingest/2020-04_unpaywall.md b/notes/ingest/2020-04_unpaywall.md index c900970..a5e3bb1 100644 --- a/notes/ingest/2020-04_unpaywall.md +++ b/notes/ingest/2020-04_unpaywall.md @@ -163,3 +163,150 @@ Start small: Looks good (whew), run the full thing: cat /grande/snapshots/unpaywall_all_recent_requests_2020-05-26.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Post-ingest stats (2020-08-28) + +Overall status: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 22063013 + no-pdf-link | 2192606 + redirect-loop | 1471135 + terminal-bad-status | 995106 + no-capture | 359440 + cdx-error | 358909 + wrong-mimetype | 111685 + wayback-error | 50705 + link-loop | 29359 + null-body | 13667 + gateway-timeout | 3689 + spn2-cdx-lookup-failure | 1229 + petabox-error | 1007 + redirects-exceeded | 747 + invalid-host-resolution | 464 + spn2-error | 107 + spn2-error:job-failed | 91 + bad-redirect | 26 + spn2-error:soft-time-limit-exceeded | 9 + bad-gzip-encoding | 5 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + ) t1 + 
WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + -----------------------------------+---------------------+-------- + academic.oup.com | no-pdf-link | 415441 + watermark.silverchair.com | terminal-bad-status | 345937 + www.tandfonline.com | no-pdf-link | 262488 + journals.sagepub.com | no-pdf-link | 235707 + onlinelibrary.wiley.com | no-pdf-link | 225876 + iopscience.iop.org | terminal-bad-status | 170783 + www.nature.com | redirect-loop | 145522 + www.degruyter.com | redirect-loop | 131898 + files-journal-api.frontiersin.org | terminal-bad-status | 126091 + pubs.acs.org | no-pdf-link | 119223 + society.kisti.re.kr | no-pdf-link | 112401 + www.ahajournals.org | no-pdf-link | 105953 + dialnet.unirioja.es | terminal-bad-status | 96505 + www.cell.com | redirect-loop | 87560 + www.ncbi.nlm.nih.gov | redirect-loop | 49890 + ageconsearch.umn.edu | redirect-loop | 45989 + ashpublications.org | no-pdf-link | 45833 + pure.mpg.de | redirect-loop | 45278 + www.degruyter.com | terminal-bad-status | 43642 + babel.hathitrust.org | terminal-bad-status | 42057 + osf.io | redirect-loop | 41119 + scialert.net | no-pdf-link | 39009 + dialnet.unirioja.es | redirect-loop | 38839 + www.jci.org | redirect-loop | 34209 + www.spandidos-publications.com | redirect-loop | 33167 + www.journal.csj.jp | no-pdf-link | 30915 + journals.openedition.org | redirect-loop | 30409 + www.valueinhealthjournal.com | redirect-loop | 30090 + dergipark.org.tr | no-pdf-link | 29146 + journals.ametsoc.org | no-pdf-link | 29133 + (30 rows) + +Enqueue internal failures for re-ingest: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 
'unpaywall' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/unpaywall_errors_2020-08-28.rows.json'; + => 409606 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_errors_2020-08-28.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_errors_2020-08-28.requests.json + + cat /grande/snapshots/unpaywall_errors_2020-08-28.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +And after *that* (which ran quickly): + + status | count + -------------------------------------+---------- + success | 22281874 + no-pdf-link | 2258352 + redirect-loop | 1499251 + terminal-bad-status | 1004781 + no-capture | 401333 + wrong-mimetype | 112068 + cdx-error | 32259 + link-loop | 30137 + null-body | 13886 + wayback-error | 11653 + gateway-timeout | 3689 + spn2-cdx-lookup-failure | 1229 + petabox-error | 1036 + redirects-exceeded | 749 + invalid-host-resolution | 464 + spn2-error | 107 + spn2-error:job-failed | 91 + bad-redirect | 26 + spn2-error:soft-time-limit-exceeded | 9 + bad-gzip-encoding | 5 + (20 rows) + +22063013 -> 22281874 = + 218,861 success, not bad! diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md index 2f20415..fe22c75 100644 --- a/notes/ingest/2020-05_oai_pmh.md +++ b/notes/ingest/2020-05_oai_pmh.md @@ -181,3 +181,248 @@ Dump again for crawling: AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error') ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json'; +Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains: + + 4876135 www.kb.dk REMOVE: too large and generic + 3110009 kb-images.kb.dk REMOVE: dead? 
+ 1274638 mdz-nbn-resolving.de REMOVE: maybe broken + 982312 aggr.ukm.um.si REMOVE: maybe broken + +And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so +expecting at least 11,052,439 `no-capture` ingest results (and should probably +filter for these or even delete from the ingest request table). + +Ingest progress: + + 2020-08-05 14:02: 32,571,018 + 2020-08-06 13:49: 31,195,169 + 2020-08-07 10:11: 29,986,169 + 2020-08-10 10:43: 26,497,196 + 2020-08-12 11:02: 23,811,845 + 2020-08-17 13:34: 19,460,502 + 2020-08-20 09:49: 15,069,507 + 2020-08-25 09:56: 9,397,035 + 2020-09-02 15:02: 305,889 (72k longest queue) + 2020-09-03 14:30: done + +## Post-ingest stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + no-capture | 16804277 + no-pdf-link | 14895249 + success | 13898603 + redirect-loop | 2709730 + cdx-error | 827024 + terminal-bad-status | 740037 + wrong-mimetype | 604242 + link-loop | 532553 + null-body | 95721 + wayback-error | 41864 + petabox-error | 19204 + | 15287 + gateway-timeout | 510 + bad-redirect | 318 + skip-url-blocklist | 184 + bad-gzip-encoding | 114 + timeout | 78 + spn2-cdx-lookup-failure | 59 + invalid-host-resolution | 19 + blocked-cookie | 6 + (20 rows) + +Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth +dumping the full kafka result topic, filter to OAI requests, and extracting the +missing URLs. 
+ +Top counts by OAI prefix: + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 25; + + oai_prefix | success | total + --------------------------+---------+--------- + kb.dk | 0 | 7989412 (excluded) + repec | 1118591 | 2783448 + bnf.fr | 0 | 2187277 + hispana.mcu.es | 19404 | 1492639 + bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?) + hal | 564700 | 1049607 + ukm.si | 0 | 982468 (excluded) + hsp.org | 0 | 810281 + www.irgrid.ac.cn | 17578 | 748828 + cds.cern.ch | 72811 | 688091 + americanae.aecid.es | 69678 | 572792 + biodiversitylibrary.org | 2121 | 566154 + juser.fz-juelich.de | 22777 | 518551 + espace.library.uq.edu.au | 6494 | 508960 + igi.indrastra.com | 58689 | 478577 + archive.ugent.be | 63654 | 424014 + hrcak.srce.hr | 395031 | 414897 + zir.nsk.hr | 153889 | 397200 + renati.sunedu.gob.pe | 78399 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7963 | 354529 + generic.eprints.org | 261221 | 340470 + invenio.nusl.cz | 6184 | 325867 + evastar-karlsruhe.de | 62044 | 317952 + quod.lib.umich.edu | 5 | 309135 + (25 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND 
ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 30; + + + oai_prefix | status | count + --------------------------+---------------+--------- + kb.dk | no-capture | 7955231 (excluded) + bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?) + repec | success | 1118591 + hispana.mcu.es | no-pdf-link | 1118092 + bnf.fr | no-capture | 1100591 + ukm.si | no-capture | 976004 (excluded) + hsp.org | no-pdf-link | 773496 + repec | no-pdf-link | 625629 + bnf.fr | no-pdf-link | 607813 + hal | success | 564700 + biodiversitylibrary.org | no-pdf-link | 531409 + cds.cern.ch | no-capture | 529842 + repec | redirect-loop | 504393 + juser.fz-juelich.de | no-pdf-link | 468813 + bnf.fr | redirect-loop | 436087 + americanae.aecid.es | no-pdf-link | 409954 + hrcak.srce.hr | success | 395031 + www.irgrid.ac.cn | no-pdf-link | 362087 + hal | no-pdf-link | 352111 + www.irgrid.ac.cn | no-capture | 346963 + espace.library.uq.edu.au | no-pdf-link | 315302 + igi.indrastra.com | no-pdf-link | 312087 + repec | no-capture | 309882 + invenio.nusl.cz | no-pdf-link | 302657 + hypotheses.org | no-pdf-link | 298750 + rour.neicon.ru | redirect-loop | 291922 + renati.sunedu.gob.pe | no-capture | 276388 + t2r2.star.titech.ac.jp | no-pdf-link | 264109 + generic.eprints.org | success | 261221 + quod.lib.umich.edu | no-pdf-link | 253937 + (30 rows) + +If we remove excluded prefixes, and some large/generic prefixes (bnf.fr, +hispana.mcu.es, hsp.org), then the aggregate counts are: + + no-capture | 16,804,277 -> 5,502,242 + no-pdf-link | 14,895,249 -> 12,395,848 + +Top status by terminal domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN 
ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + WHERE t1.domain != '' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ----------------------------------+---------------+-------- + hispana.mcu.es | no-pdf-link | 709701 (national scope) + gallica.bnf.fr | no-pdf-link | 601193 (national scope) + discover.hsp.org | no-pdf-link | 524212 (historical) + www.biodiversitylibrary.org | no-pdf-link | 479288 + gallica.bnf.fr | redirect-loop | 435981 (national scope) + hrcak.srce.hr | success | 389673 + hemerotecadigital.bne.es | no-pdf-link | 359243 + juser.fz-juelich.de | no-pdf-link | 345112 + espace.library.uq.edu.au | no-pdf-link | 304299 + invenio.nusl.cz | no-pdf-link | 302586 + igi.indrastra.com | no-pdf-link | 292006 + openrepository.ru | redirect-loop | 291555 + hal.archives-ouvertes.fr | success | 278134 + t2r2.star.titech.ac.jp | no-pdf-link | 263971 + bib-pubdb1.desy.de | no-pdf-link | 254879 + quod.lib.umich.edu | no-pdf-link | 250382 + encounters.hsp.org | no-pdf-link | 248132 + americanae.aecid.es | no-pdf-link | 245295 + www.irgrid.ac.cn | no-pdf-link | 242496 + publikationen.bibliothek.kit.edu | no-pdf-link | 222041 + www.sciencedirect.com | no-pdf-link | 211756 + dialnet.unirioja.es | redirect-loop | 203615 + edoc.mpg.de | no-pdf-link | 195526 + bibliotecadigital.jcyl.es | no-pdf-link | 184671 + hal.archives-ouvertes.fr | no-pdf-link | 183809 + www.sciencedirect.com | redirect-loop | 173439 + lup.lub.lu.se | no-pdf-link | 165788 + orbi.uliege.be | no-pdf-link | 158313 + www.erudit.org | success | 155986 + lib.dr.iastate.edu | success | 153384 + (30 rows) + +Follow-ups are TBD but could include: +- crawling the ~5m no-capture links directly (eg, not `base_url`) from the + ingest result JSON, while retaining the ingest request for 
later re-ingest +- investigating and iterating on PDF link extraction, both for large platforms + and randomly sampled from long tail +- classifying OAI prefixes by type (subject repository, institutional + repository, journal, national-library, historical docs, greylit, law, etc) +- running pdftrio over some/all of this corpus diff --git a/notes/ingest/2020-05_pubmed.md b/notes/ingest/2020-05_pubmed.md new file mode 100644 index 0000000..36d00a1 --- /dev/null +++ b/notes/ingest/2020-05_pubmed.md @@ -0,0 +1,10 @@ + +From ARXIV-PUBMEDCENTRAL-CRAWL-2020-04, on fatcat-prod1. + +Test small batch: + + zcat ingest_file_pmcid_20200424.json.gz | head -n200 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Run the whole batch: + + zcat ingest_file_pmcid_20200424.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2020-07_mag.md b/notes/ingest/2020-07_mag.md new file mode 100644 index 0000000..1d33162 --- /dev/null +++ b/notes/ingest/2020-07_mag.md @@ -0,0 +1,353 @@ + +Using 2020-06-25 upstream MAG corpus snapshot. + +Ran munging from `scratch:ingest/mag` notes first. + +Expecting a couple million new ingest request URLs; this is the first "patch" +MAG ingest on top of existing already-run requests. + +Planning to skip the initial bulk ingest step, on the assumption that new URLs +have either been ingested already (eg, via continuous ingest pipeline) or need +crawling. 
+ +## Generate Requests + + export LC_ALL=C + cat PaperUrls_mag_url_doi.all.txt | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 | pv -l > ingest_requests_mag-2020-06-25.json + => 28.7M 2:36:48 [3.06k/s] + + export LC_ALL=C + zcat PaperUrls_mag_url_pmid.txt.gz | rg -a -v arxiv.org | rg -a "://" | ./mag_ingest_request.py - --created-date 2020-06-25 --pmid | pv -l > ingest_requests_mag-2020-06-25.pmid.json + => 5.66M 0:29:28 [ 3.2k/s] + +## Persist Ingest Requests + + # small sample + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 319, 'update-requests': 0}) + + head -n1000 /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + Worker: Counter({'total': 1000, 'insert-requests': 304, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.pmid.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 5662486, 'insert-requests': 1984605, 'update-requests': 0}) + + cat /schnell/mag/20200625/ingest_requests_mag-2020-06-25.json | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 28743819, 'insert-requests': 7433465, 'update-requests': 0}) + +## Crawl/Dupe Status + +Overall status for old and new seeds, filtering out large (blocking) +publishers: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND 
ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 19477651 + | 8238898 + redirect-loop | 2036494 + link-loop | 1330036 + no-pdf-link | 1304820 + terminal-bad-status | 648150 + no-capture | 545785 + gateway-timeout | 200143 + cdx-error | 149995 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 57052 + wayback-error | 41032 + invalid-host-resolution | 37203 + petabox-error | 11167 + null-body | 6662 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 77 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + | 8238851 + success | 787174 + no-capture | 42864 + redirect-loop | 31718 + terminal-bad-status | 31493 + no-pdf-link | 13025 + cdx-error | 11275 + wrong-mimetype | 6238 + link-loop | 3365 + wayback-error | 748 + gateway-timeout | 506 + null-body | 
191 + spn2-cdx-lookup-failure | 99 + petabox-error | 89 + invalid-host-resolution | 70 + spn2-error | 7 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + bad-gzip-encoding | 1 + (19 rows) + +Where are no-capture results terminating? May need to add or update heritrix +crawl config so that we get better yield without needing to do SPNv2 crawling. + + SELECT initial_domain, terminal_domain, COUNT(*) + FROM ( + SELECT + ingest_file_result.status as status, + substring(ingest_file_result.base_url FROM '[^/]+://([^/]*)') AS initial_domain, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS terminal_domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + ) t1 + GROUP BY initial_domain, terminal_domain + ORDER BY COUNT DESC + LIMIT 25; + + initial_domain | terminal_domain | count + ---------------------------------+---------------------+-------- + www.researchgate.net | | 334145 + academic.oup.com | | 205820 + www.tandfonline.com | | 148638 + journals.sagepub.com | | 144196 + muse.jhu.edu | | 55957 + hrcak.srce.hr | | 25317 + www.omicsonline.org | | 22426 + link.springer.com | | 21044 + iopscience.iop.org | | 12385 + bioone.org | | 9097 + tandfonline.com | | 8512 + or.nsfc.gov.cn | | 4823 + ieeexplore.ieee.org | ieeexplore.ieee.org | 4398 + pubs.acs.org | | 3708 + archive-ouverte.unige.ch | | 2743 + dergipark.ulakbim.gov.tr | | 2677 + hal.archives-ouvertes.fr | | 1258 + dergipark.org.tr | | 1207 + apo.org.au | | 1186 + spire.sciencespo.fr | | 989 + cyberleninka.ru | | 895 + lirias.kuleuven.be | | 855 + tel.archives-ouvertes.fr | | 786 + pub.uni-bielefeld.de | | 728 + www.research-collection.ethz.ch | | 670 + (25 rows) + +## Heritrix Seedlist Generation + +Dump ingest 
requests (filtered for some domains that we don't expect to crawl via +heritrix): + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status IS NULL) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200708.rows.json'; + => 8784683 + + # in sandcrawler pipenv + ./scripts/ingestrequest_row2json.py /grande/snapshots/mag_nocapture_20200708.rows.json > /grande/snapshots/mag_nocapture_20200708.json + +Seedlist transform from here on covered in MAG crawl notes. + +## Bulk Ingest + +Run ingest requests on everything we crawled: + + cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Small sample: + + head -n1000 /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Full run: + + cat /grande/snapshots/mag_nocapture_20200708.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Updated Overall Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 24574294 + redirect-loop | 2633731 + no-capture | 2458694 + no-pdf-link | 1896871 + link-loop | 1510899 + terminal-bad-status | 878821 + cdx-error | 387574 + gateway-timeout | 200246 + | 170304 + wayback-error | 97572 + spn2-cdx-lookup-failure | 80284 + wrong-mimetype | 65097 + invalid-host-resolution | 37204 + petabox-error | 12097 + null-body | 8549 + spn2-error | 1706 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 90 + (20 rows) + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 24557382 + redirect-loop | 2630582 + 
no-capture | 1947066 + no-pdf-link | 1778206 + link-loop | 1510790 + terminal-bad-status | 857173 + cdx-error | 384525 + gateway-timeout | 200143 + wayback-error | 96390 + spn2-cdx-lookup-failure | 80010 + wrong-mimetype | 64908 + invalid-host-resolution | 37203 + petabox-error | 12087 + null-body | 8548 + spn2-error | 1698 + spn2-error:job-failed | 775 + spn2-error:invalid-url-syntax | 335 + spn2-error:soft-time-limit-exceeded | 191 + bad-redirect | 90 + | 69 + (20 rows) + +Just the new seeds: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.created > '2020-06-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+--------- + success | 5860601 + no-capture | 1489959 + redirect-loop | 619121 + no-pdf-link | 473703 + terminal-bad-status | 234753 + cdx-error | 231575 + link-loop | 184093 + wayback-error | 56068 + wrong-mimetype | 14046 + null-body | 2068 + petabox-error | 1006 + gateway-timeout | 506 + spn2-cdx-lookup-failure | 99 + invalid-host-resolution | 70 + | 22 + bad-redirect | 13 + spn2-error | 7 + timeout | 3 + spn2-error:job-failed | 2 + spn2-error:soft-time-limit-exceeded | 1 + (20 rows) + diff --git a/notes/ingest/2020-08_daily_improvements.md b/notes/ingest/2020-08_daily_improvements.md new file mode 
100644 index 0000000..da57065 --- /dev/null +++ b/notes/ingest/2020-08_daily_improvements.md @@ -0,0 +1,202 @@ + +Goal is to increase rate of successful daily changelog crawling, but reduce +wasted attempts. + +Status by domain, past 30 days: + + domain | status | count + --------------------------------------+-----------------+------- + arxiv.org | success | 21792 + zenodo.org | success | 10646 + res.mdpi.com | success | 10449 + springernature.figshare.com | no-pdf-link | 10430 + s3-eu-west-1.amazonaws.com | success | 8966 + zenodo.org | no-pdf-link | 8137 + hkvalidate.perfdrive.com | no-pdf-link | 5943 + www.ams.org:80 | no-pdf-link | 5799 + assets.researchsquare.com | success | 4651 + pdf.sciencedirectassets.com | success | 4145 + fjfsdata01prod.blob.core.windows.net | success | 3500 + sage.figshare.com | no-pdf-link | 3174 + onlinelibrary.wiley.com | no-pdf-link | 2869 + www.e-periodica.ch | no-pdf-link | 2709 + revistas.uned.es | success | 2631 + figshare.com | no-pdf-link | 2500 + www.sciencedirect.com | link-loop | 2477 + linkinghub.elsevier.com | gateway-timeout | 1878 + downloads.hindawi.com | success | 1819 + www.scielo.br | success | 1691 + jps.library.utoronto.ca | success | 1590 + www.ams.org | no-pdf-link | 1568 + digi.ub.uni-heidelberg.de | no-pdf-link | 1496 + research-repository.griffith.edu.au | success | 1412 + journals.plos.org | success | 1330 + (25 rows) + +Status by DOI prefix, past 30 days: + + doi_prefix | status | count + ------------+-------------------------+------- + 10.6084 | no-pdf-link | 14410 <- figshare; small fraction success + 10.6084 | success | 4007 + 10.6084 | cdx-error | 1746 + + 10.13140 | gateway-timeout | 9689 <- researchgate + 10.13140 | cdx-error | 4154 + + 10.5281 | success | 9408 <- zenodo + 10.5281 | no-pdf-link | 6079 + 10.5281 | cdx-error | 3200 + 10.5281 | wayback-error | 2098 + + 10.1090 | no-pdf-link | 7420 <- AMS (ams.org) + + 10.3390 | success | 6599 <- MDPI + 10.3390 | cdx-error | 3032 + 10.3390 | 
wayback-error | 1636 + + 10.1088 | no-pdf-link | 3227 <- IOP science + + 10.1101 | gateway-timeout | 3168 <- coldspring harbor: press, biorxiv, medrxiv, etc + 10.1101 | cdx-error | 1147 + + 10.21203 | success | 3124 <- researchsquare + 10.21203 | cdx-error | 1181 + + 10.1016 | success | 3083 <- elsevier + 10.1016 | cdx-error | 2465 + 10.1016 | gateway-timeout | 1682 + 10.1016 | wayback-error | 1567 + + 10.25384 | no-pdf-link | 3058 <- sage figshare + 10.25384 | success | 2456 + + 10.1007 | gateway-timeout | 2913 <- springer + 10.1007 | cdx-error | 1164 + + 10.5944 | success | 2831 + 10.1186 | success | 2650 + 10.5169 | no-pdf-link | 2644 <- www.e-periodica.ch + 10.3389 | success | 2279 + 10.24411 | gateway-timeout | 2184 <- cyberleninka.ru + 10.1038 | gateway-timeout | 2143 <- nature group + 10.1177 | gateway-timeout | 2038 <- SAGE + 10.11588 | no-pdf-link | 1574 <- journals.ub.uni-heidelberg.de (OJS?) + 10.25904 | success | 1416 + 10.1155 | success | 1304 + 10.21994 | no-pdf-link | 1268 <- loar.kb.dk + 10.18720 | spn2-cdx-lookup-failure | 1232 <- elib.spbstu.ru + 10.24411 | cdx-error | 1202 + 10.1055 | no-pdf-link | 1170 <- thieme-connect.de + (40 rows) + +code changes for ingest: +x hkvalidate.perfdrive.com: just bail when we see this +x skip large publishers which gateway-timeout (for now) + - springerlink (10.1007) + - nature group (10.1038) + - SAGE (10.1177) + - IOP (10.1088) + +fatcat: +x figshare (by `doi_prefix`): if not versioned (suffix), skip crawl +x zenodo: also try to not crawl if unversioned (group) +x figshare import metadata + +sandcrawler: +x ends with `cookieAbsent` or `cookieSet=1` -> status as cookie-blocked +x https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist +x verify that we do quick-get for arxiv.org + europmc.org (+ figshare/zenodo?) + => we were not! +x shorten post-SPNv2 CDX pause? 
for throughput, given that we are re-trying anyways +x ensure that we store uncrawled URL somewhere on no-capture status + => in HTML or last of hops + => not in DB, but that is a bigger change + +- try to get un-blocked: + - coldspring harbor has been blocking since 2020-06-22? yikes! + - cyberleninka.ru + - arxiv.org + +- no-pdf-link + x www.ams.org (10.1090) + => these seem to be stale captures, eg from 2008. newer captures have citation_pdf_url + => should consider recrawling all of ams.org? + => not sure why these crawl requests are happening only now + => on the order of 15k OA articles not in ia; 43k total not preserved + => force recrawl OA subset (DONE) + x www.e-periodica.ch (10.5169) + => TODO: dump un-preserved URLs, transform to PDF urls, heritrix crawl, re-ingest + x digi.ub.uni-heidelberg.de (10.11588) + => TODO: bulk re-enqueue? then heritrix crawl? + - https://loar.kb.dk/handle/1902/6988 (10.21994) + => TODO: bulk re-enqueue + => site was updated recently (august 2020); now it crawls fine. need to re-ingest all? + => 7433 hits + - thieme-connect.de (10.1055) + => 600k+ missing + => TODO: bulk re-enqueue? then heritrix crawl? + => https://profile.thieme.de/HTML/sso/ejournals/login.htm[...] => blocklist + => generally just need to re-crawl all? + +Unresolved: +- why so many spn2-errors on https://elib.spbstu.ru/ (10.18720)? + +## figshare + +10.6084 regular figshare +10.25384 SAGE figshare + +For sage, "collections" are bogus? can we detect these in datacite metadata? + +If figshare types like: + + ris: "GEN", + bibtex: "misc", + citeproc: "article", + schemaOrg: "Collection", + resourceType: "Collection", + resourceTypeGeneral: "Collection" + +then mark as 'stub'. + +"Additional file" items don't seem like "stub"; -> "component". + +title:"Figure {} from " -> component + +current types are mostly: article, stub, dataset, graphic, article-journal + +If DOI starts with "sage.", then publisher is "Sage" (not figshare). Container +name should be... 
sage.figshare.com? + +set version to the version from DOI + +## zenodo + +doi_prefix: 10.5281 + +if on zenodo, and has an "Identical to" relation, then this is a pre-print. in +that case, drop container_id and set container_name to zenodo.org. *But*, there +are some journals now publishing exclusively to zenodo.org, so retain that +metadata. examples: + + "Detection of keyboard vibrations and effects on perceived piano quality" + https://fatcat.wiki/release/mufzkdgt2nbzfha44o7p7gkrpy + + "Editing LAF: Educate, don't defend!" + https://zenodo.org/record/2583025 + +version number not available in zenodo metadata + +## Gitlab MR Notes + +The main goal of this group of changes is to do a better job at daily ingest. + +Currently we have on the order of 20k new releases added to the index every day, and about half of them are marked as OA (either CC license or via container being in DOAJ or ROAD), and pass some filters (eg, release_type), and are selected for ingest. Of those, about half fail to crawl to fulltext, due to blocking (gateway-timeout, cookie tests, anti-bot detection, loginwall, etc). On the other hand, we don't attempt to crawl lots of "bronze" OA, which is content that is available from the publisher website, but isn't marked explicitly OA. + +Based on investigating daily crawling from the past month (will commit these notes to sandcrawler soon), I have identified some DOI prefixes that almost always fail ingest via SPNv2. I also have some patches to sandcrawler ingest to improve ability to crawl some large repositories etc. + +Some of the biggest "OA but failed to crawl" are from figshare and zenodo, which register a relatively large fraction of daily OA DOIs. We want to crawl most of that content, but both of these platforms register at least two DOIs for each piece of content (a "group" DOI and a "versioned" DOI), and we only need to crawl one. 
There were also some changes needed to release-type filtering and assignment specific to these platforms, or based on the title of entities. + +This MR mixes changes to the datacite metadata import routing (including some refactors out of the main parse_record method) and behavior changes to the entity updater (which is where the code to decide about whether to send an ingest request on release creation lives). I will have a separate MR for importer metadata changes that don't impact ingest behavior. + diff --git a/notes/ingest/2020-09_oa_doi.md b/notes/ingest/2020-09_oa_doi.md new file mode 100644 index 0000000..f5c853d --- /dev/null +++ b/notes/ingest/2020-09_oa_doi.md @@ -0,0 +1,352 @@ + +It seems that many gold OA DOIs were not ingesting simply because the HTML +url extraction was not working for a particular version of OJS. + +Let's re-try all ~2.5 million of these in bulk mode and see how many are +'no-capture' vs. other errors, then possibly re-crawl a large number. + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py query 'is_oa:true preservation:none !arxiv_id:* !pmcid:* !publisher_type:big5 type:article-journal' | pv -l > /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json + Expecting 2569876 release objects in search queries + Counter({'elasticsearch_release': 2569880, 'estimate': 2569880, 'ingest_request': 2063034}) + +Enqueue + + cat /srv/fatcat/snapshots/oa_doi_20200915.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started at about: + + Thu Sep 17 00:15:00 UTC 2020 + 2020-09-17T00:15:00Z + +## Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND ingest_file_result.updated >= '2020-09-16' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 513462 + success | 206042 + no-pdf-link | 186779 + terminal-bad-status | 40372 + redirect-loop | 33103 + cdx-error | 24078 + link-loop | 13494 + spn2-cdx-lookup-failure | 10247 + gateway-timeout | 4407 + wrong-mimetype | 3213 + petabox-error | 866 + null-body | 449 + spn2-error | 217 + wayback-error | 129 + spn2-error:job-failed | 64 + bad-redirect | 6 + spn2-error:soft-time-limit-exceeded | 1 + (17 rows) + +This was only about half the requests. Try... broader? 
+ + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -------------------------------------+-------- + no-capture | 579952 + success | 387325 + no-pdf-link | 380406 + terminal-bad-status | 63743 + redirect-loop | 53893 + cdx-error | 46024 + spn2-cdx-lookup-failure | 28347 + link-loop | 22573 + gateway-timeout | 11686 + wrong-mimetype | 6294 + null-body | 3509 + petabox-error | 2388 + spn2-error | 1023 + spn2-error:job-failed | 462 + wayback-error | 347 + spn2-error:soft-time-limit-exceeded | 20 + bad-redirect | 11 + (17 rows) + +What top domains for those `no-pdf-link` (or similar)? 
+ + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.status != 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 56488 + figshare.com | no-pdf-link | 55337 + www.egms.de | redirect-loop | 22686 + zenodo.org | terminal-bad-status | 22128 + tandf.figshare.com | no-pdf-link | 20027 + springernature.figshare.com | no-pdf-link | 17181 + cairn.info | terminal-bad-status | 13836 + www.persee.fr | terminal-bad-status | 7565 + projecteuclid.org | link-loop | 7449 + www.cairn.info | no-pdf-link | 6992 + scialert.net | no-pdf-link | 6621 + www.cairn.info | link-loop | 5870 + utpjournals.press | no-pdf-link | 5772 + journals.openedition.org | redirect-loop | 5464 + www.egms.de | no-pdf-link | 5223 + archaeologydataservice.ac.uk | no-pdf-link | 4881 + rs.figshare.com | no-pdf-link | 4773 + www.degruyter.com | spn2-cdx-lookup-failure | 4763 + koreascience.or.kr | no-pdf-link | 4487 + cancerres.aacrjournals.org | no-pdf-link | 4124 + cms.math.ca | no-pdf-link | 3441 + volcano.si.edu | no-pdf-link | 3424 + www.mathnet.ru | no-pdf-link | 3229 + tidsskriftet.no | no-pdf-link | 3012 + journals.plos.org | no-pdf-link | 3005 + 
tudigit.ulb.tu-darmstadt.de | no-pdf-link | 2796 + www.cairn.info:80 | link-loop | 2647 + hammer.figshare.com | no-pdf-link | 2627 + www.psychosocial.com | no-pdf-link | 2457 + osf.io | terminal-bad-status | 2388 + (30 rows) + +Should look at link extraction for: + +- scialert.net +- utpjournals.press +- koreascience.or.kr +- cancerres.aacrjournals.org +- cms.math.ca +- volcano.si.edu +- www.mathnet.ru +- www.psychosocial.com + +## Re-Ingest + +Re-run ingest to handle `no-capture` cases, to extract the missing terminal URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-09-15' + AND ingest_file_result.updated <= '2020-09-20' + AND ingest_file_result.status = 'no-capture' + -- AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + ) TO '/grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json'; + => COPY 579952 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_reingest_nocapture_20201012.rows.json | pv -l | shuf > /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json + => 579k 0:00:22 [25.9k/s] + + cat /grande/snapshots/oa_doi_reingest_nocapture_20201012.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Resuming progress on this in early December 2020. 
+ +Filtered requests to re-crawl: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ((ingest_file_result.updated >= '2020-09-15' AND ingest_file_result.updated <= '2020-09-20') + OR (ingest_file_result.updated >= '2020-10-11')) + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO 
'/grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json'; + => COPY 2352614 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | pv -l > /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json + +And actually dump seedlist(s): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | jq -r .base_url | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | rg '://' | sort -u -S 4G > /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + + wc -l /grande/snapshots/oa_doi_seedlist_2020-12-08.*.txt + 2352614 /grande/snapshots/oa_doi_seedlist_2020-12-08.base_url.txt + 481910 /grande/snapshots/oa_doi_seedlist_2020-12-08.no_capture_terminal_url.txt + +Top DOI prefixes (same old usual suspects): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg "://doi.org/" | cut -f4 -d/ | sort | uniq -c | sort -nr | head -n20 + 353695 10.5281 zenodo.org + 121888 10.6084 figshare.org + 115093 10.3917 cairn.info + 113252 10.3406 persee.fr + 95414 10.1515 degruyter.com + 90448 10.4324 taylorfrancis.com + 83927 10.1016 elsevier + 60303 10.1109 IEEE + 48490 10.4000 openedition.org + 28498 10.3205 egms.de + 23433 10.1163 brill.com + 23276 10.17615 cdr.lib.unc.edu + 21386 10.1093 oup.com + 20783 10.3138 utpjournals.press + 19987 10.1201 tandfonline.com + 17916 10.34847 cocoon.huma-num.fr + 16970 10.1002 wiley.com + 15958 10.1097 lww.com (and others?) 
+ 15835 10.1017 cambridge.org + 15466 10.24355 publikationsserver.tu-braunschweig.de (IR) + +Top domains (not doi.org): + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.*url.txt | rg ^http | rg -v "://doi.org/" | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 104148 zenodo.org + 85245 www.persee.fr + 52931 www.cairn.info + 4791 www.jstage.jst.go.jp + 4411 archive.monthlyreview.org + 4129 osf.io + 2841 www.indianjournals.com + 2746 www.impan.pl + 2620 platform.almanhal.com + 2019 www.nomos-elibrary.de + 1209 dergipark.org.tr + 1027 pubs.geoscienceworld.org + 973 www.pdcnet.org + 923 www.hanspub.org + 914 www.repository.cam.ac.uk + 863 mediarep.org + 812 www.cartographicperspectives.org + 687 www.degruyter.com + 578 192.168.7.24 + 566 journals.eco-vector.com + +TODO: infer `publisher_type` and platform from DOI prefix in more cases + +## Re-Ingest + +Crawl has completed. Starting this bulk ingest on 2020-12-31; roughly 2.3 +million requests. Note these are all `pdf` requests, but crawl was done in an +HTML-friendly way, so should be able to do domain/journal-specific HTML ingests +in the future. + + cat /grande/snapshots/oa_doi_seedlist_2020-12-08.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Stats, for this ingest period (fuzzy; will have some daily ingest stuff): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND (ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog') + AND ingest_file_result.updated >= '2020-12-28' + AND ingest_request.created <= '2020-12-09' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 30; + + status | count + -----------------------+-------- + no-pdf-link | 962714 + success | 539305 + no-capture | 306590 + redirect-loop | 192149 + link-loop | 184797 + terminal-bad-status | 141721 + wrong-mimetype | 10362 + null-body | 10277 + skip-url-blocklist | 1985 + wayback-content-error | 1300 + cdx-error | 869 + petabox-error | 160 + bad-redirect | 72 + wayback-error | 46 + bad-gzip-encoding | 7 + timeout | 1 + max-hops-exceeded | 1 + (17 rows) + diff --git a/notes/ingest/2020-09_reingest.md b/notes/ingest/2020-09_reingest.md new file mode 100644 index 0000000..ec4e536 --- /dev/null +++ b/notes/ingest/2020-09_reingest.md @@ -0,0 +1,197 @@ + +Goal: re-bulk-ingest some older existing crawls which hung on errors like +`cdx-error` or `wayback-error`, indicating that ingest might actually succeed +on retry. 
+ +Sources: +- unpaywall (again) +- doi (ingest, changelog, etc) +- mag +- oai + +## DOI + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 25; + + status | count + -------------------------------------+--------- + no-pdf-link | 8304582 + success | 3461708 + no-capture | 1881269 + redirect-loop | 1851541 + gateway-timeout | 355820 + cdx-error | 341848 + terminal-bad-status | 328650 + skip-url-blocklist | 220474 + spn2-cdx-lookup-failure | 125521 + link-loop | 109352 + wayback-error | 101525 + null-body | 73539 + wrong-mimetype | 53151 + spn-error | 13579 + spn2-error | 6848 + spn2-error:job-failed | 4381 + spn-remote-error | 4180 + other-mimetype | 2305 + petabox-error | 904 + timeout | 710 + spn2-error:soft-time-limit-exceeded | 557 + spn2-error:proxy-error | 437 + spn2-error:browser-running-error | 273 + invalid-host-resolution | 233 + pending | 116 + (25 rows) + +Bulk: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_doi_errors_2020-09-03.rows.json'; + => 443421 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +Additional 27,779 success status? Hard to tell because lots of other ingest +running in parallel. + +Live: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_file_result.status = 'spn-error' OR + ingest_file_result.status = 'spn2-cdx-lookup-failure' OR + ingest_file_result.status = 'spn2-error:job-failed' OR + ingest_file_result.status = 'spn2-error:proxy-error' + ) + ) TO '/grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json'; + => 143984 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Unpaywall (again) + +Bulk: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json'; + => 43912 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +## MAG + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_mag_errors_2020-09-03.rows.json'; + => 188175 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_mag_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +## OAI-PMH + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_oai_errors_2020-09-03.rows.json'; + => 851056 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_oai_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +--------- + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + diff --git a/notes/ingest/2020-09_scielo.md b/notes/ingest/2020-09_scielo.md new 
file mode 100644 index 0000000..4ec6fbd --- /dev/null +++ b/notes/ingest/2020-09_scielo.md @@ -0,0 +1,21 @@ + +As a follow-up to `SCIELO-CRAWL-2020-07`, going to bulk ingest all existing +fatcat releases with no IA copy and with `publisher_type:scielo`. There are +200k+ such releases. + +It seems like some of these are HTML or XML, eg: https://doi.org/10.4321/s1132-12962011000300008 + +Could try XML ingest of these! + +## Bulk Ingest + +Dump ingest requests + + ./fatcat_ingest.py --allow-non-oa query "publisher_type:scielo" | pv -l > /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json + Expecting 212529 release objects in search queries + +Enqueue + + cat /srv/fatcat/snapshots/scielo_papers_20200914.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done 2020-09-14 + diff --git a/notes/ingest/2020-10_daily.md b/notes/ingest/2020-10_daily.md new file mode 100644 index 0000000..d2bb50b --- /dev/null +++ b/notes/ingest/2020-10_daily.md @@ -0,0 +1,193 @@ + +Quick notes on how daily ingest is going, circa September/October 2020. 
+ + + SELECT ingest_request.ingest_type, + date(ingest_request.created), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created) + ORDER BY date(ingest_request.created) DESC; + + ingest_type | date | total | success + -------------+------------+-------+--------- + pdf | 2020-10-10 | 6145 | 1368 + pdf | 2020-10-09 | 28453 | 6461 + pdf | 2020-10-08 | 15105 | 3803 + pdf | 2020-10-07 | 34213 | 10813 + pdf | 2020-10-06 | 22263 | 8565 + pdf | 2020-10-05 | 7910 | 3200 + pdf | 2020-10-04 | 10865 | 4579 + pdf | 2020-10-03 | 27745 | 10818 + pdf | 2020-10-02 | 34320 | 13523 + pdf | 2020-10-01 | 32548 | 13252 + pdf | 2020-09-30 | 34798 | 14113 + pdf | 2020-09-29 | 22463 | 8328 + pdf | 2020-09-28 | 4117 | 1278 + pdf | 2020-09-27 | 5894 | 1732 + pdf | 2020-09-26 | 34949 | 13901 + pdf | 2020-09-25 | 33680 | 10605 + pdf | 2020-09-24 | 15125 | 5785 + pdf | 2020-09-23 | 20866 | 6584 + pdf | 2020-09-22 | 20949 | 7167 + pdf | 2020-09-21 | 22483 | 7308 + pdf | 2020-09-20 | 45644 | 16981 + pdf | 2020-09-19 | 95571 | 31991 + pdf | 2020-09-18 | 50849 | 15875 + pdf | 2020-09-17 | 20121 | 3158 + pdf | 2020-09-16 | 39184 | 12150 + pdf | 2020-09-15 | 16986 | 7705 + (26 rows) + + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND 
ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + + ingest_type | status | count + -------------+-------------------------------------+-------- + pdf | success | 241047 + pdf | no-pdf-link | 143084 + pdf | spn2-cdx-lookup-failure | 108311 + pdf | gateway-timeout | 97250 + pdf | cdx-error | 61820 + pdf | link-loop | 31350 + pdf | wayback-error | 9139 + pdf | spn2-error:job-failed | 4240 + pdf | spn2-error | 3893 + pdf | wrong-mimetype | 1010 + pdf | no-capture | 851 + pdf | null-body | 605 + pdf | redirect-loop | 261 + pdf | spn2-error:soft-time-limit-exceeded | 126 + pdf | terminal-bad-status | 120 + pdf | petabox-error | 105 + pdf | timeout | 29 + pdf | spn2-error:no-status | 2 + pdf | spn2-error:invalid-server-response | 2 + pdf | bad-gzip-encoding | 1 + (20 rows) + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + + domain | status | count + ------------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 52767 + www.degruyter.com | link-loop | 17666 + www.degruyter.com | spn2-cdx-lookup-failure | 17597 + ieeexplore.ieee.org | gateway-timeout | 15290 + www.sciencedirect.com | no-pdf-link | 14043 + 
apps.crossref.org | no-pdf-link | 11531 + figshare.com | no-pdf-link | 8966 + tandf.figshare.com | no-pdf-link | 7276 + zenodo.org | no-capture | 7191 + springernature.figshare.com | no-pdf-link | 6485 + www.taylorfrancis.com | link-loop | 6266 + www.persee.fr | terminal-bad-status | 6031 + journals.openedition.org | gateway-timeout | 5639 + www.cairn.info | link-loop | 5618 + archaeologydataservice.ac.uk | no-pdf-link | 5359 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 4748 + www.e-periodica.ch | no-pdf-link | 4722 + osf.io | no-capture | 4247 + cancerres.aacrjournals.org | no-pdf-link | 4136 + dlc.library.columbia.edu | no-pdf-link | 4085 + www.egms.de | no-pdf-link | 3304 + journals.lww.com | no-pdf-link | 3218 + journals.plos.org | no-pdf-link | 3005 + linkinghub.elsevier.com | gateway-timeout | 2833 + www.egms.de | redirect-loop | 2606 + (25 rows) + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + --------------------------------------+---------+------- + zenodo.org | success | 55549 + arxiv.org | success | 24450 + s3-eu-west-1.amazonaws.com | success | 18156 + res.mdpi.com | success | 13493 + www.degruyter.com | success | 12009 + journals.openedition.org | success | 11235 + www.jstage.jst.go.jp | success | 9460 + peer.asee.org | success | 9416 + 
www.e-periodica.ch | success | 8105 + ir.canterbury.ac.nz | success | 6381 + europepmc.org | success | 5670 + www.repository.cam.ac.uk | success | 4858 + assets.researchsquare.com | success | 4765 + fjfsdata01prod.blob.core.windows.net | success | 4130 + tidsskrift.dk | success | 3964 + research-journal.org | success | 3127 + ieeexplore.ieee.org | success | 2947 + dergipark.org.tr | success | 2892 + watermark.silverchair.com | success | 2315 + journals.plos.org | success | 2304 + journal.fi | success | 1996 + publications.rwth-aachen.de | success | 1954 + www.brazilianjournals.com | success | 1637 + article.sciencepublishinggroup.com | success | 1589 + revistas.upr.edu | success | 1467 + (25 rows) + +Casual take-aways: +- wonder what `apps.crossref.org` is +- sciencedirect crawling broken? +- figshare might be broken? or just very little success +- seems like a lot of journals.plos.org failures diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md new file mode 100644 index 0000000..a991025 --- /dev/null +++ b/notes/ingest/2020-10_unpaywall.md @@ -0,0 +1,286 @@ + +New snapshot released 2020-10-09. Want to do a mostly straight-forward +load/ingest/crawl. 
+ +Proposed changes this time around: + +- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture` + status, and to include those URLs in heritrix3 crawl +- tweak heritrix3 config for additional PDF URL extraction patterns, + particularly to improve OJS yield + + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json + => 28.3M 3:19:03 [2.37k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 28.3M 1:11:29 [ 6.6k/s] + => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2020-10-09' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json'; + => COPY 4216339 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json + => 4.22M 0:02:48 [ 25k/s] + +Start small, to test no-capture behavior: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +`no-capture` change looks good. 
Enqueue the whole batch: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 23661282 + no-capture | 3015447 + no-pdf-link | 2302102 + redirect-loop | 1542566 + terminal-bad-status | 1044676 + wrong-mimetype | 114315 + link-loop | 36358 + cdx-error | 20150 + null-body | 14513 + wayback-error | 13644 + gateway-timeout | 3776 + spn2-cdx-lookup-failure | 1260 + petabox-error | 1171 + redirects-exceeded | 752 + invalid-host-resolution | 464 + spn2-error | 147 + bad-redirect | 131 + spn2-error:job-failed | 91 + wayback-content-error | 45 + timeout | 19 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url 
NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + ) t1 + ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json'; + => 2,936,404 + + # TODO: in the future also exclude "www.archive.org" + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json + +And actually dump seedlist(s): + + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt + + wc -l unpaywall_seedlist_2020-11-02.*.txt + 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt + 2713866 unpaywall_seedlist_2020-11-02.url.txt + +With things like jsessionid, suspect that crawling just the terminal URLs is +going to work better than both full and terminal. 
+ +Finding a fraction of `no-capture` which have partial/stub URLs as terminal. + +TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP). + + +## Bulk Ingest and Status + +Note, removing archive.org links: + + cat /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json | rg -v www.archive.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Overall status (checked 2020-12-08): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 25004559 + no-pdf-link | 2531841 + redirect-loop | 1671375 + terminal-bad-status | 1389463 + no-capture | 893880 + wrong-mimetype | 119332 + link-loop | 66508 + wayback-content-error | 30339 + cdx-error | 21790 + null-body | 20710 + wayback-error | 13976 + gateway-timeout | 3775 + petabox-error | 2420 + spn2-cdx-lookup-failure | 1218 + redirects-exceeded | 889 + invalid-host-resolution | 464 + bad-redirect | 147 + spn2-error | 112 + spn2-error:job-failed | 91 + timeout | 21 + (20 rows) + +Ingest stats broken down by publication stage: + + SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + + release_stage | status | count + ---------------+-------------------------------------+---------- + 
accepted | success | 1101090 + accepted | no-pdf-link | 28590 + accepted | redirect-loop | 10923 + accepted | no-capture | 9540 + accepted | terminal-bad-status | 6339 + accepted | cdx-error | 952 + accepted | wrong-mimetype | 447 + accepted | link-loop | 275 + accepted | wayback-error | 202 + accepted | petabox-error | 177 + accepted | redirects-exceeded | 122 + accepted | null-body | 27 + accepted | wayback-content-error | 14 + accepted | spn2-cdx-lookup-failure | 5 + accepted | gateway-timeout | 4 + accepted | bad-redirect | 1 + published | success | 18595278 + published | no-pdf-link | 2434935 + published | redirect-loop | 1364110 + published | terminal-bad-status | 1185328 + published | no-capture | 718792 + published | wrong-mimetype | 112923 + published | link-loop | 63874 + published | wayback-content-error | 30268 + published | cdx-error | 17302 + published | null-body | 15209 + published | wayback-error | 10782 + published | gateway-timeout | 1966 + published | petabox-error | 1611 + published | spn2-cdx-lookup-failure | 879 + published | redirects-exceeded | 760 + published | invalid-host-resolution | 453 + published | bad-redirect | 115 + published | spn2-error:job-failed | 77 + published | spn2-error | 75 + published | timeout | 21 + published | bad-gzip-encoding | 5 + published | spn2-error:soft-time-limit-exceeded | 4 + published | spn2-error:pending | 1 + published | blocked-cookie | 1 + published | | 1 + published | pending | 1 + submitted | success | 5308166 + submitted | redirect-loop | 296322 + submitted | terminal-bad-status | 197785 + submitted | no-capture | 165545 + submitted | no-pdf-link | 68274 + submitted | wrong-mimetype | 5962 + submitted | null-body | 5474 + submitted | cdx-error | 3536 + submitted | wayback-error | 2992 + submitted | link-loop | 2359 + submitted | gateway-timeout | 1805 + submitted | petabox-error | 632 + submitted | spn2-cdx-lookup-failure | 334 + submitted | wayback-content-error | 57 + submitted | spn2-error | 37 
+ submitted | bad-redirect | 31 + submitted | spn2-error:job-failed | 14 + submitted | | 12 + submitted | invalid-host-resolution | 11 + submitted | redirects-exceeded | 7 + submitted | spn2-error:soft-time-limit-exceeded | 5 + submitted | bad-gzip-encoding | 1 + submitted | skip-url-blocklist | 1 + | no-pdf-link | 42 + | success | 25 + | redirect-loop | 20 + | terminal-bad-status | 11 + | no-capture | 3 + (70 rows) diff --git a/notes/ingest/2020-11-04_arxiv.md b/notes/ingest/2020-11-04_arxiv.md new file mode 100644 index 0000000..f9abe09 --- /dev/null +++ b/notes/ingest/2020-11-04_arxiv.md @@ -0,0 +1,12 @@ + +Ran a bulk dump using fatcat ingest tool several months ago, and had Martin run +a crawl. + +Crawl is now done, so going to ingest, hoping to get the majority of the +millions of remaining arxiv.org PDFs. + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | wc -l + => 1,288,559 + + zcat /grande/snapshots/fatcat_missing_arxiv_ingest_request.2020-08-21.json.gz | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md new file mode 100644 index 0000000..473dd0d --- /dev/null +++ b/notes/ingest/2020-11_doaj.md @@ -0,0 +1,295 @@ + +This is the first ingest (and crawl) of URLs from DOAJ article-level metadata. +It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in +the past. + +Working off a 2020-11-13 snapshot. 
+ +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json + => 6.7M 0:24:28 [4.57k/s] + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => ran into an error with blank `base_url` + +Second try after patches: + + zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json + => 6.7M 0:24:29 [4.56k/s] + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036}) + +## Check Pre-Crawl Status + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-------------------------+--------- + pdf | | 3711532 + html | | 2429003 + pdf | success | 454403 + pdf | redirect-loop | 48587 + pdf | no-pdf-link | 24901 + pdf | no-capture | 11569 + xml | | 9442 + pdf | link-loop | 8466 + pdf | terminal-bad-status | 2015 + pdf | wrong-mimetype | 1441 + pdf | null-body | 1057 + pdf | petabox-error | 299 + pdf | cdx-error | 124 + pdf | gateway-timeout | 114 + pdf | wayback-error | 77 + pdf | spn2-cdx-lookup-failure | 20 + pdf | invalid-host-resolution | 4 + pdf | spn2-error | 1 + (18 rows) + +## Dump new 
URLs, Transform, Bulk Ingest (PDF and XML only) + +Dump: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_request.link_source = 'doaj' + -- AND date(ingest_request.created) > '2020-12-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json'; + => COPY 3732543 + +Transform: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json + => 3.73M 0:02:18 [26.9k/s] + +Definitely some non-URL strings in there; should try to filter those out +earlier in the transform process. And/or have a constraint on the URL column in +the database. + +Enqueue the whole batch: + + cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started this batch off at 2020-11-19 18:10 (Pacific time) + +Stats after run: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 30; + +## Dump Seedlist + +After preliminary bulk ingest attempts, dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + AND (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + 
AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json'; + => 1,899,555 + +TODO: filter for valid URLs + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json + +And actually dump seedlist(s): + + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt + + wc -l doaj_seedlist_2020-11-19.*.txt + +## Post-Crawl Ingest + +Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ +identifiers are all in fatcat: + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + # started 2020-12-23 15:05 (Pacific) + # finished around 2020-12-31, after one long/slow partition + +Stats again after everything: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 50; + + ingest_type | status | count + -------------+--------------------------+--------- + html | wrong-scope | 1089423 + html | no-capture | 423917 + html | redirect-loop | 212910 + html | unknown-scope | 204069 + html | html-resource-no-capture | 165587 + html | success | 122937 + html | null-body | 100296 + html | wayback-content-error | 53918 + html | wrong-mimetype | 18908 + html | terminal-bad-status | 14059 + html | petabox-error | 13520 + html | cdx-error | 6823 + html | wayback-error | 890 + html | | 620 + html | blocked-cookie | 543 + html | blocked-captcha | 250 + html | redirects-exceeded | 135 + html | too-many-resources | 111 + html | max-hops-exceeded | 84 + html | bad-redirect | 3 + pdf | success | 2851324 + pdf | no-pdf-link | 529914 + pdf | redirect-loop | 349494 + pdf | no-capture | 272202 + pdf | null-body | 129027 + pdf | terminal-bad-status | 91796 + pdf | link-loop | 25267 + pdf | wrong-mimetype | 6504 + pdf | wayback-error | 2968 + pdf | | 2068 + pdf | wayback-content-error | 1548 + pdf | cdx-error | 1095 + pdf | petabox-error | 1024 + pdf | bad-redirect | 203 + pdf | redirects-exceeded | 135 + pdf | timeout | 20 + pdf | max-hops-exceeded | 19 + pdf | bad-gzip-encoding | 2 + xml | success | 6897 + xml | null-body | 2353 + xml | wrong-mimetype | 184 + xml | no-capture | 5 + xml | cdx-error | 3 + (43 rows) + + +And 
on filtered subset that we actually crawled: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + AND (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 50; + + ingest_type | status | count + -------------+-----------------------+--------- + pdf | success | 2851286 + pdf | no-pdf-link | 527495 + pdf | redirect-loop | 345138 + 
pdf | no-capture | 268140 + pdf | null-body | 129027 + pdf | terminal-bad-status | 91125 + pdf | link-loop | 25267 + pdf | wrong-mimetype | 6504 + pdf | wayback-error | 2907 + pdf | petabox-error | 363 + pdf | wayback-content-error | 242 + pdf | bad-redirect | 203 + pdf | redirects-exceeded | 135 + pdf | max-hops-exceeded | 19 + pdf | cdx-error | 15 + pdf | bad-gzip-encoding | 2 + xml | success | 6897 + xml | null-body | 2353 + xml | wrong-mimetype | 184 + xml | no-capture | 5 + (20 rows) + diff --git a/notes/ingest/2020-12-08_patch_crawl_notes.md b/notes/ingest/2020-12-08_patch_crawl_notes.md new file mode 100644 index 0000000..5979753 --- /dev/null +++ b/notes/ingest/2020-12-08_patch_crawl_notes.md @@ -0,0 +1,111 @@ + +Notes here about re-ingesting or re-crawling large batches. Goal around end of +2020 is to generate a broad patch crawl of terminal no-capture attempts for all +major sources crawled thus far. Have already tried running this process for unpaywall. + +For each, want filtered ingest request JSON objects (filtering out platforms +that don't crawl well, and possibly things like figshare+zenodo), and a broader +seedlist (including terminal URLs). Will de-dupe all the seedlist URLs and do a +heritrix crawl with new config, then re-ingest all the requests individually. + +Summary of what to do here: + + OA DOI: expecting some 2.4 million seeds + OAI-PMH: expecting some 5 million no-capture URLs, plus more from missing PDF URL not found + Unpaywall: another ~900k no-capture URLs (maybe filtered?) + +For all, re-attempt for these status codes: + + no-capture + cdx-error + wayback-error + petabox-error + gateway-timeout (?) + +And at least do bulk re-ingest for these, if updated before 2020-11-20 or so: + + no-pdf-link + +## OAI-PMH + +Need to re-ingest all of the (many!) no-capture and no-pdf-link + +TODO: repec-specific URL extraction? 
+ +Skip these OAI prefixes: + + kb.dk + bnf.fr + hispana.mcu.es + bdr.oai.bsb-muenchen.de + ukm.si + hsp.org + +Skip these domains: + + www.kb.dk (kb.dk) + kb-images.kb.dk (kb.dk) + mdz-nbn-resolving.de (TODO: what prefix?) + aggr.ukm.um.si (ukm.si) + +Check PDF link extraction for these prefixes, or skip them (TODO): + + repec (mixed success) + biodiversitylibrary.org + juser.fz-juelich.de + americanae.aecid.es + www.irgrid.ac.cn + hal + espace.library.uq.edu.au + igi.indrastra.com + invenio.nusl.cz + hypotheses.org + t2r2.star.titech.ac.jp + quod.lib.umich.edu + + domain: hemerotecadigital.bne.es + domain: bib-pubdb1.desy.de + domain: publikationen.bibliothek.kit.edu + domain: edoc.mpg.de + domain: bibliotecadigital.jcyl.es + domain: lup.lub.lu.se + domain: orbi.uliege.be + +TODO: +- consider deleting ingest requests from skipped prefixes (large database use) + + +## Unpaywall + +About 900k `no-capture`, and up to 2.5 million more `no-pdf-link`. + +Re-bulk-ingest filtered requests which hit `no-pdf-link` before 2020-11-20: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) < '2020-11-20' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE 
'%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + ) TO '/grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json'; + => COPY 1309990 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_nopdflink_2020-12-08.ingest_request.json + => 1.31M 0:00:51 [25.6k/s] + + cat /grande/snapshots/unpaywall_nopdflink_2020-12-08.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2021-04_unpaywall.md b/notes/ingest/2021-04_unpaywall.md new file mode 100644 index 0000000..d7643f4 --- /dev/null +++ b/notes/ingest/2021-04_unpaywall.md @@ -0,0 +1,368 @@ + +New snapshot released 2021-02-18, finally getting around to a crawl two months +later. + +Intend to do same style of crawl as in the past. One change is that +sandcrawler-db has moved to a focal VM. + + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18T160139.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json + => 30.0M 3:14:59 [2.57k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-02-18.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 30027007, 'insert-requests': 2703999, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 30027007, 'pushed': 30027007}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2021-01-01' + 
AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json'; + => COPY 3277484 + + # previous, 2020-10 run: COPY 4216339 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json + => 3.28M 0:01:42 [32.1k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-02-18.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 26385866 + no-pdf-link | 2132565 + no-capture | 2092111 + redirect-loop | 1732543 + terminal-bad-status | 1504555 + wayback-content-error | 357345 + wrong-mimetype | 126070 + link-loop | 76808 + cdx-error | 22756 + null-body | 22066 + wayback-error | 13768 + gateway-timeout | 3804 + petabox-error | 3608 + spn2-cdx-lookup-failure | 1225 + redirects-exceeded | 892 + invalid-host-resolution | 505 + bad-redirect | 151 + spn2-error | 108 + spn2-error:job-failed | 91 + bad-gzip-encoding | 27 + (20 rows) + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + 
AND date(ingest_request.created) > '2021-01-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 1348623 + no-capture | 1231582 + redirect-loop | 45622 + no-pdf-link | 37312 + terminal-bad-status | 24162 + wrong-mimetype | 6684 + link-loop | 5757 + null-body | 1288 + wayback-content-error | 1123 + cdx-error | 831 + petabox-error | 697 + wayback-error | 185 + invalid-host-resolution | 41 + gateway-timeout | 29 + blocked-cookie | 22 + bad-gzip-encoding | 20 + spn2-cdx-lookup-failure | 7 + bad-redirect | 4 + timeout | 3 + redirects-exceeded | 3 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json'; + => 2020-10: 2,936,404 + => 2021-04: 1,805,192 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json + => 1.81M 0:01:27 [20.6k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.*.txt + 6 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.no_terminal_url.txt + 1668524 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.terminal_url.txt + 1685717 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-02-18.url.txt + +## Post-Crawl Bulk Ingest + + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-02-18.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1,804,211 consumer group lag + +## Post-Ingest Stats + +Overall status (unpaywall, all time): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 27242251 + no-pdf-link | 2746237 + redirect-loop | 1821132 + terminal-bad-status | 1553441 + no-capture | 478559 + wayback-content-error | 357390 + wrong-mimetype | 127365 + link-loop | 79389 + cdx-error | 23170 + null-body | 23169 + wayback-error | 13704 + gateway-timeout | 3803 + petabox-error | 3642 + redirects-exceeded | 1427 + spn2-cdx-lookup-failure | 1214 + invalid-host-resolution | 505 + bad-redirect | 153 + spn2-error | 107 + spn2-error:job-failed | 91 + body-too-large | 84 + (20 rows) + +Ingest stats broken down by publication stage: + + SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + release_stage | status | count + ---------------+-------------------------------------+---------- + accepted | success | 1213335 + accepted | no-pdf-link | 29292 + accepted | redirect-loop | 12769 + accepted | terminal-bad-status | 11264 + accepted | no-capture | 10187 + accepted | cdx-error | 1015 + accepted | wayback-content-error | 757 + accepted | wrong-mimetype | 501 + 
accepted | link-loop | 407 + accepted | wayback-error | 207 + accepted | petabox-error | 189 + accepted | redirects-exceeded | 125 + accepted | null-body | 34 + accepted | spn2-cdx-lookup-failure | 5 + accepted | gateway-timeout | 4 + accepted | blocked-cookie | 2 + accepted | bad-redirect | 1 + accepted | body-too-large | 1 + published | success | 20196774 + published | no-pdf-link | 2647969 + published | redirect-loop | 1477558 + published | terminal-bad-status | 1320013 + published | wayback-content-error | 351931 + published | no-capture | 297603 + published | wrong-mimetype | 115440 + published | link-loop | 76431 + published | cdx-error | 18125 + published | null-body | 17559 + published | wayback-error | 10466 + published | petabox-error | 2684 + published | gateway-timeout | 1979 + published | redirects-exceeded | 947 + published | spn2-cdx-lookup-failure | 877 + published | invalid-host-resolution | 457 + published | bad-redirect | 120 + published | spn2-error:job-failed | 77 + published | spn2-error | 70 + published | body-too-large | 39 + published | bad-gzip-encoding | 24 + published | timeout | 24 + published | blocked-cookie | 23 + published | spn2-error:soft-time-limit-exceeded | 4 + published | | 2 + published | pending | 1 + published | spn2-error:pending | 1 + published | too-many-redirects | 1 + submitted | success | 5832117 + submitted | redirect-loop | 330785 + submitted | terminal-bad-status | 222152 + submitted | no-capture | 170766 + submitted | no-pdf-link | 68934 + submitted | wrong-mimetype | 11424 + submitted | null-body | 5576 + submitted | wayback-content-error | 4702 + submitted | cdx-error | 4030 + submitted | wayback-error | 3031 + submitted | link-loop | 2551 + submitted | gateway-timeout | 1820 + submitted | petabox-error | 769 + submitted | redirects-exceeded | 355 + submitted | spn2-cdx-lookup-failure | 332 + submitted | invalid-host-resolution | 48 + submitted | body-too-large | 44 + submitted | spn2-error | 37 + submitted | 
bad-redirect | 32 + submitted | spn2-error:job-failed | 14 + submitted | | 13 + submitted | spn2-error:soft-time-limit-exceeded | 5 + submitted | timeout | 4 + submitted | bad-gzip-encoding | 3 + submitted | skip-url-blocklist | 1 + | no-pdf-link | 42 + | success | 25 + | redirect-loop | 20 + | terminal-bad-status | 12 + | no-capture | 3 + (76 rows) + + +Only the recent updates: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 2192376 + no-capture | 152183 + no-pdf-link | 144174 + redirect-loop | 125988 + terminal-bad-status | 67307 + link-loop | 8292 + wrong-mimetype | 7942 + null-body | 2270 + cdx-error | 1223 + wayback-content-error | 1147 + petabox-error | 728 + wayback-error | 155 + body-too-large | 82 + invalid-host-resolution | 41 + gateway-timeout | 28 + blocked-cookie | 22 + bad-gzip-encoding | 20 + timeout | 7 + bad-redirect | 6 + redirects-exceeded | 4 + (20 rows) + +In total, this iteration of unpaywall ingest resulted in: + +- 2,703,999 raw ingest requests (new URLs total) +- 1,231,582 (45.5%) of these had not been seen/crawled from any source yet +- 843,753 (31.2%) success from new heritrix crawling +- 2,192,376 (81.1%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) diff --git a/notes/ingest/2021-05_daily_improvements.md b/notes/ingest/2021-05_daily_improvements.md new file mode 100644 index 0000000..e8748fa --- /dev/null +++ b/notes/ingest/2021-05_daily_improvements.md @@ -0,0 +1,480 @@ + +Summary of top large broken domains (2021-04-21 "30 
day" snapshot): + +## acervus.unicamp.br + + domain | status | count +---------------------------------------+-------------------------+-------- + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + +select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%acervus.unicamp.br%' and status = 'no-pdf-link' limit 5; + +http://acervus.unicamp.br/index.asp?codigo_sophia=963332 + +seems like many of these were captures with a blank page? or a redirect to +the homepage? + +http://web.archive.org/web/20200129110523/http://acervus.unicamp.br/index.html + +messy, going to move on. + + +## apex.ipk-gatersleben.de + +apex.ipk-gatersleben.de | | 1253 +apex.ipk-gatersleben.de | no-pdf-link | 1132 + +select * from ingest_file_result where updated >= '2021-03-01' and terminal_url like '%apex.ipk-gatersleben.de%' and status = 'no-pdf-link' limit 5; + +https://doi.org/10.25642/ipk/rescoll/4886 +https://apex.ipk-gatersleben.de/apex/f?p=PGRDOI:RESOLVE:::NO:RP:DOI:10.25642/IPK/RESCOLL/7331 + +seem to be datasets/species, not articles. + +prefix: 10.25642/ipk + +## crossref.org + + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + +https://doi.org/10.1515/9781501747045-013 +https://apps.crossref.org/coaccess/coaccess.html?doi=10.1515%2F9781501747045-013 + +Derp, they are doing a dynamic/AJAX thing, so access links are not in the HTML. 
+ +## openedition.org + + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + +https://doi.org/10.4000/books.pul.34492 +https://books.openedition.org/pul/34492 + +these are not actually OA books (or at least, not all are) + +## chemrxiv.org (figshare) + + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + +https://doi.org/10.26434/chemrxiv.14411081 +https://chemrxiv.org/articles/preprint/Prediction_and_Optimization_of_Ion_Transport_Characteristics_in_Nanoparticle-Based_Electrolytes_Using_Convolutional_Neural_Networks/14411081 + +these all seem to be *multi-file* entities, thus not good for single file ingest pipeline. + +## direct.mit.edu + + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + +https://doi.org/10.7551/mitpress/14056.003.0004 +https://direct.mit.edu/books/monograph/5111/chapter-abstract/3060134/Adding-Technology-to-Contact-Tracing?redirectedFrom=fulltext + +"not available" + +https://doi.org/10.7551/mitpress/12444.003.0004 + +"not available" + + +## dlc.library.columbia.edu + + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + +https://doi.org/10.7916/d8-506w-kk49 +https://dlc.library.columbia.edu/durst/cul:18931zcrk9 + +document repository. +this one goes to IA! actually many seem to. +added extractor, should re-ingest with: + + publisher:"Columbia University" doi_prefix:10.7916 !journal:* + +actually, that is like 600k+ results and many are not digitized, so perhaps not. + +## doi.ala.org.au + + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + +https://doi.org/10.26197/ala.811d55e3-2ff4-4501-b3e7-e19249507052 +https://doi.ala.org.au/doi/811d55e3-2ff4-4501-b3e7-e19249507052 + +this is a data repository, with filesets, not papers. datacite metadata is +incorrect. 
+ +## fldeploc.dep.state.fl.us + + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + + +https://doi.org/10.35256/ic29 +http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29 + +re-ingest with: + + # only ~800 works + doi_prefix:10.35256 publisher:Florida + +## geoscan.nrcan.gc.ca + + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + +https://doi.org/10.4095/295366 +https://geoscan.nrcan.gc.ca/starweb/geoscan/servlet.starweb?path=geoscan/fulle.web&search1=R=295366 + +this is a geographic repository, not papers. + +## kiss.kstudy.com + + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + +https://doi.org/10.22143/hss21.12.1.121 +http://kiss.kstudy.com/thesis/thesis-view.asp?key=3862523 + +Korean. seems to not actually be theses? can't download. + +## linkinghub.elsevier.com + + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + +skipping for now, looks like mostly 'forbidden'? + +## osf.io + +These are important! + + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + +https://doi.org/10.31219/osf.io/jux3w +https://accounts.osf.io/login?service=https://osf.io/jux3w/download + +many of these are 404s by browser as well. what does that mean? + +## peerj.com + + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + +https://doi.org/10.7287/peerj.11155v0.1/reviews/2 +https://peerj.com/articles/11155/reviews/ + +these are HTML reviews, not papers + +## preprints.jmir.org + + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + +https://doi.org/10.2196/preprints.22556 +https://preprints.jmir.org/preprint/22556 + +UGH, looks simple, but javascript. + +could try to re-write URL into S3 format? meh. + +## psyarxiv.com (OSF?) 
+ + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + +https://doi.org/10.31234/osf.io/5jaqg +https://psyarxiv.com/5jaqg/ + +Also infuriatingly Javascript, but can do URL hack. + +Should reingest, and potentially force-recrawl: + + # about 67k + publisher:"Center for Open Science" in_ia:false + +## publons.com + + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + +https://doi.org/10.1002/jmor.21338/v2/review1 +https://publons.com/publon/40260824/ + +These are just HTML reviews, not papers. + +## saemobilus.sae.org + + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + +https://doi.org/10.4271/as1426c +https://saemobilus.sae.org/content/as1426c + +These seem to be standards, and are not open access (paywall) + +## scholar.dkyobobook.co.kr + + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + +https://doi.org/10.22471/crisis.2021.6.1.18 +http://scholar.dkyobobook.co.kr/searchDetail.laf?barcode=4010028199536 + +Korean. complex javascript, skipping. + +## unreserved.rba.gov.au + + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + +https://doi.org/10.47688/rba_archives_2006/04129 +https://unreserved.rba.gov.au/users/login + +Don't need to login when I tried in browser? document repo, not papers. 
+ +## wayf.switch.ch + + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + +https://doi.org/10.24451/arbor.11128 +https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Farbor.bfh.ch%2Fshibboleth&return=https%3A%2F%2Farbor.bfh.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A5056fc0a97aeab16e5007ca63bede254cb5669d94173064d6c74c62a0f88b022 + +Loginwall + +## www.bloomsburycollections.com + + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + +https://doi.org/10.5040/9781849664264.0008 +https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries + +These are primarily not OA/available. + +## www.emc2020.eu + + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + +https://doi.org/10.22443/rms.emc2020.146 +https://www.emc2020.eu/abstract/evaluation-of-different-rectangular-scan-strategies-for-hrstem-imaging.html + +These are just abstracts, not papers. + +## Emerald + + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + +https://doi.org/10.1108/ramj-11-2020-0065 +https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html + +Note that these URLs are already HTML fulltext. but the PDF is also available and easy. 
+ +re-ingest: + + # only ~3k or so missing + doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true + +## www.humankineticslibrary.com + + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + +https://doi.org/10.5040/9781718206625.ch-002 +https://www.humankineticslibrary.com/encyclopedia-chapter?docid=b-9781718206625&tocid=b-9781718206625-chapter2 + +paywall + +## www.inderscience.com + + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + +https://doi.org/10.1504/ijdmb.2020.10036342 +https://www.inderscience.com/info/ingeneral/forthcoming.php?jcode=ijdmb + +paywall + +## www.ingentaconnect.com + + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + +https://doi.org/10.15258/sst.2021.49.1.07 +https://www.ingentaconnect.com/content/ista/sst/pre-prints/content-7_sst.2021.49.1_63-71;jsessionid=1joc5mmi1juht.x-ic-live-02 + +Annoying javascript, but easy to work around. + +re-ingest: + + # only a couple hundred; also re-ingest + doi_prefix:10.15258 in_ia:false year:>2018 + +## www.nomos-elibrary.de + + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + +https://doi.org/10.5771/9783748907084-439 +https://www.nomos-elibrary.de/10.5771/9783748907084-439/verzeichnis-der-autorinnen-und-autoren + +Javascript obfuscated download button? + +## www.oecd-ilibrary.org + + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + +https://doi.org/10.1787/543e84ed-en +https://www.oecd-ilibrary.org/development/applying-evaluation-criteria-thoughtfully_543e84ed-en + +Paywall. + +## www.osapublishing.org + + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + +https://doi.org/10.1364/boe.422199 +https://www.osapublishing.org/boe/abstract.cfm?doi=10.1364/BOE.422199 + +Some of these are "pre-registered" DOIs, not published yet. Many of the +remaining are actually HTML articles, and/or have some stuff in the +`citation_pdf_url`. A core problem is captchas. 
+ +Have started adding support to fatcat for HTML crawl type based on container. + +re-ingest: + + container_twtpsm6ytje3nhuqfu3pa7ca7u (optica) + container_cg4vcsfty5dfvgmat5wm62wgie (optics express) + +## www.oxfordscholarlyeditions.com + + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + +https://doi.org/10.1093/oseo/instance.00266789 +https://www.oxfordscholarlyeditions.com/view/10.1093/actrade/9780199593668.book.1/actrade-9780199593668-div1-27 + +loginwall/paywall + +## www.schweizerbart.de + + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + +https://doi.org/10.1127/zfg/40/1996/461 +https://www.schweizerbart.de/papers/zfg/detail/40/97757/Theoretical_model_of_surface_karstic_processes?af=crossref + +paywall + +## www.sciencedirect.com + + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + +https://doi.org/10.1016/j.landurbplan.2021.104104 +https://www.sciencedirect.com/science/article/pii/S0169204621000670 + +Bunch of crazy new hacks, but seems to be working! + +re-ingest: + + # to start! about 50k + doi_prefix:10.1016 is_oa:true year:2021 + +## www.sciendo.com + + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + +https://doi.org/10.2478/awutm-2019-0012 +https://www.sciendo.com/article/10.2478/awutm-2019-0012 + +uses lots of javascript, hard to scrape. 
+ + +## Others (for reference) + + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 + +## Re-ingests + +All the above combined: + + container_twtpsm6ytje3nhuqfu3pa7ca7u (optica) + container_cg4vcsfty5dfvgmat5wm62wgie (optics express) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id twtpsm6ytje3nhuqfu3pa7ca7u + => Counter({'ingest_request': 1142, 'elasticsearch_release': 1142, 'estimate': 1142, 'kafka': 1142}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --ingest-type html container --container-id cg4vcsfty5dfvgmat5wm62wgie + => Counter({'elasticsearch_release': 33482, 'estimate': 33482, 'ingest_request': 32864, 'kafka': 32864}) + + # only ~800 works + doi_prefix:10.35256 publisher:Florida + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --allow-non-oa query "doi_prefix:10.35256 publisher:Florida" + => Counter({'ingest_request': 843, 'elasticsearch_release': 843, 'estimate': 843, 'kafka': 843}) + + # only ~3k or so missing + doi_prefix:10.1108 publisher:emerald in_ia:false is_oa:true + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1108 publisher:emerald" + => Counter({'ingest_request': 3812, 'elasticsearch_release': 3812, 'estimate': 3812, 'kafka': 3812}) + + + # only a couple hundred; also re-ingest + doi_prefix:10.15258 in_ia:false year:>2018 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org 
--allow-non-oa --force-recrawl query "doi_prefix:10.15258 year:>2018" + => Counter({'ingest_request': 140, 'elasticsearch_release': 140, 'estimate': 140, 'kafka': 140}) + + # to start! about 50k + doi_prefix:10.1016 is_oa:true year:2020 + doi_prefix:10.1016 is_oa:true year:2021 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2020" + => Counter({'ingest_request': 75936, 'elasticsearch_release': 75936, 'estimate': 75936, 'kafka': 75936}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org query "doi_prefix:10.1016 year:2021" + => Counter({'ingest_request': 54824, 'elasticsearch_release': 54824, 'estimate': 54824, 'kafka': 54824}) + + pmcid:* year:2018 + pmcid:* year:2019 + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2018" + => Counter({'ingest_request': 25366, 'elasticsearch_release': 25366, 'estimate': 25366, 'kafka': 25366}) + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc263.us.archive.org --force-recrawl query "pmcid:* year:2019" + => Counter({'ingest_request': 55658, 'elasticsearch_release': 55658, 'estimate': 55658, 'kafka': 55658}) + diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md new file mode 100644 index 0000000..8b6ac09 --- /dev/null +++ b/notes/ingest/2021-07_unpaywall.md @@ -0,0 +1,320 @@ + +New snapshot released 2021-07-02. Should be "boring" ingest and crawl. 
+ + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02T151134.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json + => 32.2M 3:01:52 [2.95k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_2021-07-02.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 32196260, 'insert-requests': 3325954, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 32196260, 'pushed': 32196260}) + + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2021-01-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json'; + => COPY 3556146 + + # previous, 2020-10 run: COPY 4216339 + # previous, 2021-07 run: COPY 3277484 + +Oops, should have run instead, with the date filter: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json'; + +But didn't, so processed all instead. 
+ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json + => 3.56M 0:01:59 [29.8k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-02.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done, on 2021-07-13 + + +## Check Pre-Crawl Status + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 1831827 + success | 1343604 + redirect-loop | 103999 + terminal-bad-status | 19845 + no-pdf-link | 17448 + link-loop | 5027 + wrong-mimetype | 2270 + cdx-error | 523 + body-too-large | 321 + null-body | 298 + wayback-content-error | 242 + petabox-error | 155 + gateway-timeout | 138 + invalid-host-resolution | 120 + wayback-error | 109 + blocked-cookie | 9 + timeout | 7 + | 3 + bad-redirect | 3 + spn2-cdx-lookup-failure | 3 + (20 rows) + + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND date(ingest_request.created) > '2021-07-01' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR 
ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json'; + => COPY 1743186 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json + => 1.74M 0:01:33 [18.6k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg '"no-capture"' | jq -r 
.result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.*.txt + 1 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.no_terminal_url.txt + 1643963 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.terminal_url.txt + 1644028 /srv/sandcrawler/tasks/unpaywall_seedlist_2021-07-02.url.txt + 3287992 total + +Then run crawl (see `journal-crawls` git repo). + +## Post-Crawl Bulk Ingest + + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1.74M 0:01:59 [14.6k/s] + +## Post-Ingest Stats + +Only the recent updates: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 2690258 + redirect-loop | 227328 + no-capture | 157368 + terminal-bad-status | 118943 + no-pdf-link | 92698 + blocked-cookie | 19478 + link-loop | 9249 + wrong-mimetype | 4918 + cdx-error | 1786 + wayback-error | 1497 + null-body | 1302 + body-too-large | 433 + wayback-content-error | 245 + petabox-error | 171 + gateway-timeout | 138 + invalid-host-resolution | 120 + timeout | 12 + bad-redirect | 4 + | 3 + spn2-cdx-lookup-failure | 1 + (20 rows) + +Only the recent updates, by publication stage: + + SELECT 
ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + release_stage | status | count + ---------------+-------------------------+--------- + accepted | success | 103144 + accepted | no-pdf-link | 53981 + accepted | terminal-bad-status | 4102 + accepted | link-loop | 2799 + accepted | no-capture | 2315 + accepted | redirect-loop | 2171 + accepted | blocked-cookie | 234 + accepted | cdx-error | 140 + accepted | wayback-error | 101 + accepted | wrong-mimetype | 38 + accepted | null-body | 10 + accepted | petabox-error | 5 + accepted | wayback-content-error | 4 + accepted | gateway-timeout | 2 + accepted | body-too-large | 2 + published | success | 1919100 + published | no-capture | 130104 + published | redirect-loop | 127482 + published | terminal-bad-status | 43118 + published | no-pdf-link | 33505 + published | blocked-cookie | 19034 + published | link-loop | 6241 + published | wrong-mimetype | 4163 + published | null-body | 1195 + published | cdx-error | 1151 + published | wayback-error | 1105 + published | wayback-content-error | 197 + published | body-too-large | 195 + published | petabox-error | 118 + published | gateway-timeout | 35 + published | invalid-host-resolution | 13 + published | timeout | 8 + published | bad-redirect | 2 + published | spn2-cdx-lookup-failure | 1 + published | bad-gzip-encoding | 1 + submitted | success | 668014 + submitted | redirect-loop | 97675 + submitted | terminal-bad-status | 71723 + submitted | no-capture | 24949 + submitted | no-pdf-link | 5212 + submitted | wrong-mimetype | 717 + submitted | cdx-error | 495 + 
submitted | wayback-error | 291 + submitted | body-too-large | 236 + submitted | blocked-cookie | 210 + submitted | link-loop | 209 + submitted | invalid-host-resolution | 107 + submitted | gateway-timeout | 101 + submitted | null-body | 97 + submitted | petabox-error | 48 + submitted | wayback-content-error | 44 + submitted | timeout | 4 + submitted | | 3 + submitted | bad-redirect | 2 + submitted | remote-server-error | 1 + (55 rows) + +In total, this iteration of unpaywall ingest resulted in: + +- 3,325,954 raw ingest requests (new URLs total) +- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and attempted to crawl +- 1,346,654 (77% of crawled) success from new heritrix crawling +- 2,690,258 (80%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) + +## Live Ingest Follow-Up + +Will run SPN requests on the ~160k `no-capture` URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json'; + => COPY 157371 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json + => 157k 0:00:04 [31.6k/s] + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + => DONE diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md new file mode 100644 index 0000000..5f92196 --- /dev/null +++ b/notes/ingest/2021-08_mag.md @@ -0,0 +1,400 @@ + +Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest. +Also want to re-ingest some old/failed ingests, now that pipeline/code has +improved. + +Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs. + + +## Persist Ingest Requests + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000}) + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request - + => 22.5M 0:46:00 [8.16k/s] + => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585}) + +Roughly 8.6 million new URLs + +## Pre-Crawl Status Counts + +Status of combined old and new requests, with some large domains removed: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT 
LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + success | 26123975 + | 6664846 + no-pdf-link | 1859908 + redirect-loop | 1532405 + no-capture | 1199126 + link-loop | 1157010 + terminal-bad-status | 832362 + gateway-timeout | 202158 + spn2-cdx-lookup-failure | 81406 + wrong-mimetype | 69087 + invalid-host-resolution | 37262 + wayback-error | 21340 + petabox-error | 11237 + null-body | 9414 + wayback-content-error | 2199 + cdx-error | 1893 + spn2-error | 1741 + spn2-error:job-failed | 971 + blocked-cookie | 902 + spn2-error:invalid-url-syntax | 336 + (20 rows) + +And just the new URLs (note that domain filter shouldn't be required, but +keeping for consistency): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE 
'%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + | 6664780 + success | 1957844 + redirect-loop | 23357 + terminal-bad-status | 9385 + no-pdf-link | 8315 + no-capture | 6892 + link-loop | 4517 + wrong-mimetype | 3864 + cdx-error | 1749 + blocked-cookie | 842 + null-body | 747 + wayback-error | 688 + wayback-content-error | 570 + gateway-timeout | 367 + petabox-error | 340 + spn2-cdx-lookup-failure | 150 + read-timeout | 122 + not-found | 119 + invalid-host-resolution | 63 + spn2-error | 23 + (20 rows) + +## Dump Initial Bulk Ingest Requests + +Note that this is all-time, not just recent, and will re-process a lot of +"no-pdf-link": + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-pdf-link' + OR ingest_file_result.status = 'cdx-error' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + 
AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json'; + => COPY 8526647 + +Transform to ingest requests: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json + => 8.53M 0:03:40 + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + +Updated stats after running initial bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 5184994 + no-capture | 3284416 + redirect-loop | 98685 + terminal-bad-status | 28733 + link-loop | 28518 + 
blocked-cookie | 22338 + no-pdf-link | 19073 + wrong-mimetype | 9122 + null-body | 2793 + wayback-error | 2128 + wayback-content-error | 1233 + cdx-error | 1198 + petabox-error | 617 + gateway-timeout | 395 + not-found | 130 + read-timeout | 128 + | 111 + invalid-host-resolution | 63 + spn2-cdx-lookup-failure | 24 + spn2-error | 20 + (20 rows) + +## Generate Seedlist + +For crawling, do a similar (but not identical) dump: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json'; + => COPY 4599519 + +Prep ingest requests (for 
post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json + => 4.60M 0:02:55 [26.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + => DONE + + wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt + 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + +## Post-Crawl Bulk Re-Ingest + +Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by +hash, URL agnostic). + +Enqueue for bulk re-ingest: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => Thu 19 Aug 2021 09:10:59 PM UTC + + +## Post-Ingest Stats + +Just the new stuff (compare against above for delta): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 7748241 89.2% + no-capture | 429688 4.9% + redirect-loop | 172831 2.0% + terminal-bad-status | 94029 1.1% + no-pdf-link | 86437 1.0% + blocked-cookie | 67903 0.8% + link-loop | 50622 + wrong-mimetype | 21064 + null-body | 6650 + cdx-error | 3313 + wayback-error | 2630 + gateway-timeout | 399 + petabox-error | 268 + wayback-content-error | 170 + not-found | 130 + read-timeout | 128 + | 109 + invalid-host-resolution | 63 + bad-redirect | 39 + spn2-error | 20 + (20 rows) + +New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397 + +Overall success of new batch: 
7748241. / 8686315 = 89.2% + +And combined (old and new) status again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 31990062 + redirect-loop | 1704717 + no-capture | 1263462 + link-loop | 1218280 + blocked-cookie | 1213838 + no-pdf-link | 1096664 + terminal-bad-status | 960070 + gateway-timeout | 202190 + wrong-mimetype | 86557 + invalid-host-resolution | 37262 + null-body | 15443 + wayback-error | 12839 + cdx-error | 4047 + spn2-error | 1731 + spn2-error:job-failed | 962 + petabox-error | 463 + wayback-content-error | 379 + spn2-error:invalid-url-syntax | 336 + spn2-error:soft-time-limit-exceeded | 203 + | 175 + (20 rows) + +New success total: 31990062 - 26123975 = 5,866,087 + +A full 1,263,462 no-capture that could be attempted... though many of those may +be excluded for a specific reason. 
diff --git a/notes/ingest/2021-09-02_oai_pmh_patch.md b/notes/ingest/2021-09-02_oai_pmh_patch.md new file mode 100644 index 0000000..ac808dd --- /dev/null +++ b/notes/ingest/2021-09-02_oai_pmh_patch.md @@ -0,0 +1,1578 @@ + +Just a "patch" of previous OAI-PMH crawl/ingest: re-ingesting and potentially +re-crawling content which failed to ingest the first time. + +May fold this in with more general patch crawling. + +## Basic Counts + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 14145387 + no-pdf-link | 12063022 + no-capture | 5485640 + redirect-loop | 2092705 + terminal-bad-status | 747372 + wrong-mimetype | 597219 + link-loop | 542144 + null-body | 93566 + cdx-error | 19798 + petabox-error | 17943 + | 15283 + wayback-error | 13897 + gateway-timeout | 511 + skip-url-blocklist | 184 + wayback-content-error | 146 + bad-redirect | 137 + redirects-exceeded | 120 + bad-gzip-encoding | 116 + timeout | 80 + 
blocked-cookie | 64 + (20 rows) + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 40; + + + oai_prefix | success | total + ---------------------------+---------+--------- + repec | 1133175 | 2783448 + hal | 573218 | 1049607 + www.irgrid.ac.cn | 18007 | 748828 + cds.cern.ch | 74078 | 688091 + americanae.aecid.es | 71310 | 572792 + juser.fz-juelich.de | 23026 | 518551 + espace.library.uq.edu.au | 6649 | 508960 + igi.indrastra.com | 59629 | 478577 + archive.ugent.be | 65306 | 424014 + hrcak.srce.hr | 404085 | 414897 + zir.nsk.hr | 156753 | 397200 + renati.sunedu.gob.pe | 79362 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7997 | 354529 + generic.eprints.org | 263566 | 340470 + invenio.nusl.cz | 6340 | 325867 + evastar-karlsruhe.de | 62282 | 
317952 + quod.lib.umich.edu | 5 | 309135 + diva.org | 67917 | 298348 + t2r2.star.titech.ac.jp | 1085 | 289388 + edpsciences.org | 139495 | 284972 + repository.ust.hk | 10245 | 283417 + revues.org | 151156 | 277497 + pure.atira.dk | 13492 | 260754 + bibliotecadigital.jcyl.es | 50606 | 254134 + escholarship.org/ark | 140835 | 245203 + ojs.pkp.sfu.ca | 168029 | 229387 + lup.lub.lu.se | 49358 | 226602 + library.wur.nl | 15051 | 216738 + digitalrepository.unm.edu | 111704 | 211749 + infoscience.tind.io | 60166 | 207299 + edoc.mpg.de | 0 | 205252 + erudit.org | 168490 | 197803 + delibra.bg.polsl.pl | 38666 | 196652 + n/a | 0 | 193814 + aleph.bib-bvb.de | 4349 | 186666 + serval.unil.ch | 41643 | 186372 + orbi.ulg.ac.be | 2400 | 184551 + digitalcommons.unl.edu | 144025 | 184372 + bib-pubdb1.desy.de | 33525 | 182717 + (40 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' 
+ AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 50; + + oai_prefix | status | count + ---------------------------+---------------+--------- + repec | success | 1133175 + repec | no-pdf-link | 638105 + hal | success | 573218 + cds.cern.ch | no-capture | 540380 + repec | redirect-loop | 516451 + juser.fz-juelich.de | no-pdf-link | 477881 + americanae.aecid.es | no-pdf-link | 417766 + hrcak.srce.hr | success | 404085 + www.irgrid.ac.cn | no-pdf-link | 370908 + hal | no-pdf-link | 359252 + www.irgrid.ac.cn | no-capture | 355532 + espace.library.uq.edu.au | no-pdf-link | 320479 + igi.indrastra.com | no-pdf-link | 318242 + repec | no-capture | 316981 + invenio.nusl.cz | no-pdf-link | 309802 + rour.neicon.ru | redirect-loop | 300911 + hypotheses.org | no-pdf-link | 300251 + renati.sunedu.gob.pe | no-capture | 282800 + t2r2.star.titech.ac.jp | no-pdf-link | 272045 + generic.eprints.org | success | 263566 + quod.lib.umich.edu | no-pdf-link | 259661 + archive.ugent.be | no-capture | 256127 + evastar-karlsruhe.de | no-pdf-link | 248939 + zir.nsk.hr | link-loop | 226919 + repository.ust.hk | no-pdf-link | 208569 + edoc.mpg.de | no-pdf-link | 199758 + bibliotecadigital.jcyl.es | no-pdf-link | 188433 + orbi.ulg.ac.be | no-pdf-link | 172373 + diva.org | no-capture | 171115 + lup.lub.lu.se | no-pdf-link | 168652 + erudit.org | success | 168490 + ojs.pkp.sfu.ca | success | 168029 + lib.dr.iastate.edu | success | 158494 + zir.nsk.hr | success | 156753 + digital.kenyon.edu | success | 154900 + revues.org | success | 151156 + books.openedition.org | no-pdf-link | 149607 + freidok.uni-freiburg.de | no-pdf-link | 146837 + digitalcommons.unl.edu | success | 144025 + escholarship.org/ark | success | 140835 + culeuclid | link-loop | 140291 + edpsciences.org | success | 139495 + serval.unil.ch | no-pdf-link | 138644 + bib-pubdb1.desy.de | no-pdf-link | 133815 + krm.or.kr | no-pdf-link | 132461 + pure.atira.dk | 
no-pdf-link | 132179 + oai-gms.dimdi.de | redirect-loop | 131409 + aleph.bib-bvb.de | no-capture | 128261 + library.wur.nl | no-pdf-link | 124718 + lirias2repo.kuleuven.be | no-capture | 123106 + (50 rows) + +Note: could just delete the "excluded" rows? and not harvest them in the +future, and filter them at ingest time (in transform script). + + + +## Investigate no-pdf-link sandcrawler improvements + +Do some spot-sampling of 'no-pdf-link' domains, see if newer sandcrawler works: + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source_id LIKE 'oai:library.wur.nl:%' + ORDER BY random() + LIMIT 10; + +Random sampling of *all* 'no-pdf-link' URLs (see if newer sandcrawler works): + + \x auto + + SELECT + ingest_request.link_source_id AS oai_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + 
LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_file_result.status = 'no-pdf-link' + ORDER BY random() + LIMIT 30; + +### repec (SKIP-PREFIX) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | 
oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jmacro:v:54:y:2017:i:pb:p:332-351 +base_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +terminal_url | http://www.sciencedirect.com/science/article/pii/S0164070417301593 +-[ RECORD 2 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:eee:jomega:v:16:y:1988:i:2:p:107-115 +base_url | http://www.sciencedirect.com/science/article/pii/0305-0483(88)90041-2 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/0305048388900412 +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------- +-------------------------------------- +oai_id | oai:repec:sgm:pzwzuw:v:14:i:59:y:2016:p:73-92 +base_url | http://pz.wz.uw.edu.pl/en +terminal_url | http://pz.wz.uw.edu.pl:80/en +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:erv:rccsrc:y:2016:i:2016_11:35 +base_url | http://www.eumed.net/rev/caribe/2016/11/estructura.html +terminal_url | http://www.eumed.net:80/rev/caribe/2016/11/estructura.html +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:pio:envira:v:33:y:2001:i:4:p:629-647 +base_url | http://www.envplan.com/epa/fulltext/a33/a3319.pdf +terminal_url | http://uk.sagepub.com:80/en-gb/eur/pion-journals-published +-[ RECORD 6 
]+---------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repec:tpr:qjecon:v:100:y:1985:i:3:p:651-75 +base_url | http://links.jstor.org/sici?sici=0033-5533%28198508%29100%3A3%3C651%3ATCOCEA%3E2.0.CO%3B2-2&origin=repec +terminal_url | https://www.jstor.org/stable/1884373 + +Huh! This is just a catalog of other domains. Should probably skip + +DONE: skip/filter repec + +### juser.fz-juelich.de (SCOPE) + +-[ RECORD 1 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:132217 +base_url | http://juser.fz-juelich.de/record/132217 +terminal_url | http://juser.fz-juelich.de/record/132217 + +Poster; no files. + +-[ RECORD 2 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:268598 +base_url | http://juser.fz-juelich.de/record/268598 +terminal_url | http://juser.fz-juelich.de/record/268598 + +Journal. + +-[ RECORD 3 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:126613 +base_url | http://juser.fz-juelich.de/record/126613 +terminal_url | http://juser.fz-juelich.de/record/126613 + +-[ RECORD 4 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:67362 +base_url | http://juser.fz-juelich.de/record/67362 +terminal_url | http://juser.fz-juelich.de/record/67362 +-[ RECORD 5 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:869189 +base_url | http://juser.fz-juelich.de/record/869189 +terminal_url | http://juser.fz-juelich.de/record/869189 +-[ RECORD 6 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:810746 +base_url | http://juser.fz-juelich.de/record/810746 +terminal_url | http://juser.fz-juelich.de/record/810746 +-[ RECORD 7 ]+------------------------------------------------------------ +oai_id | 
oai:juser.fz-juelich.de:52897 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-52897%22 +-[ RECORD 8 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:114755 +base_url | http://juser.fz-juelich.de/record/114755 +terminal_url | http://juser.fz-juelich.de/record/114755 +-[ RECORD 9 ]+------------------------------------------------------------ +oai_id | oai:juser.fz-juelich.de:58025 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-58025%22 + +The search URLs seem redundant? Not going to try to handle those. + +"Powered by Invenio v1.1.7" + +All of these examples seem to be not papers. Maybe we can filter these better +at the harvest or transform stage? + +### americanae.aecid.es (MIXED) + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:502896 +base_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai +terminal_url | http://biblioteca.clacso.edu.ar/gsdl/cgi-bin/library.cgi?a=d&c=mx/mx-010&d=60327292007oai + +just a metadata record? 
links to redalyc + +METADATA-ONLY + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:534600 +base_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +terminal_url | http://bdh-rd.bne.es/viewer.vm?id=0000077778&page=1 +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:524567 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=524567 + +NOT-FOUND (404) + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:378914 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=378914 + +Some single-page image archival thing? bespoke, skipping. 
+ +SKIP-BESPOKE + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:526142 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=526142 + +NOT-FOUND (404) + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------- +oai_id | oai:americanae.aecid.es:373408 +base_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 +terminal_url | http://americanae.aecid.es/americanae/es/registros/registro.do?tipoRegistro=MTD&idBib=373408 + +NOT-FOUND (404) + +### www.irgrid.ac.cn (SKIP-PREFIX) + +Chinese Academy of Sciences Institutional Repositories Grid + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1749980 +base_url | http://www.irgrid.ac.cn/handle/1471x/1749980 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1749980 + +Can't access + +FORBIDDEN + +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/857397 +base_url | http://www.irgrid.ac.cn/handle/1471x/857397 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/857397 + +Just linking to another IR; skip it. 
+ +http://ir.ipe.ac.cn/handle/122111/10608 + +requires login + +DONE: '/password-login;jsessionid' as a loginwall URL pattern + http://ir.ipe.ac.cn/handle/122111/10608 + http://ir.ipe.ac.cn/bitstream/122111/10608/2/%e9%92%9d%e9%a1%b6%e8%9e%ba%e6%97%8b%e8%97%bb%e5%9c%a8%e4%b8%8d%e5%90%8c%e5%85%89%e7%85%a7%e6%9d%a1%e4%bb%b6%e4%b8%8b%e7%9a%84%e6%94%be%e6%b0%a7%e7%89%b9%e6%80%a7_%e8%96%9b%e5%8d%87%e9%95%bf.pdf + +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1060447 +base_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1060447 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1671377 +base_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +terminal_url | http://ir.iggcas.ac.cn/handle/132A11/68622 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/1178430 +base_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/1178430 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2488017 +base_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/2488017 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/977147 +base_url | http://www.irgrid.ac.cn/handle/1471x/977147 +terminal_url | http://www.irgrid.ac.cn/handle/1471x/977147 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:www.irgrid.ac.cn:1471x/2454503 +base_url | http://ir.nwipb.ac.cn/handle/363003/9957 +terminal_url | http://ir.nwipb.ac.cn/handle/363003/9957 + +this domain is a disappointment :( + +should continue crawling, as the metadata is open and good, but won't get fulltext? 
+ +### hal (FIXED-PARTIAL) + +-[ RECORD 1 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00744951v1 +base_url | https://hal.archives-ouvertes.fr/hal-00744951 +terminal_url | https://hal.archives-ouvertes.fr/hal-00744951 + +Off-site OA link. + +FIXED-HAL + +-[ RECORD 2 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-01065398v1 +base_url | https://hal.archives-ouvertes.fr/hal-01065398/file/AbstractSGE14_B_assaad.pdf +terminal_url | https://hal.archives-ouvertes.fr/index/index +-[ RECORD 3 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:lirmm-00371599v1 +base_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 +terminal_url | https://hal-lirmm.ccsd.cnrs.fr/lirmm-00371599 + +To elsevier :( + +-[ RECORD 4 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00284780v1 +base_url | https://hal.archives-ouvertes.fr/hal-00284780 +terminal_url | https://hal.archives-ouvertes.fr/hal-00284780 + +METADATA-ONLY + +-[ RECORD 5 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00186151v1 +base_url | https://hal.archives-ouvertes.fr/hal-00186151 +terminal_url | https://hal.archives-ouvertes.fr/hal-00186151 + +METADATA-ONLY + +-[ RECORD 6 ]+------------------------------------------------------------------------------ +oai_id | oai:hal:hal-00399754v1 +base_url | https://hal.archives-ouvertes.fr/hal-00399754 +terminal_url | https://hal.archives-ouvertes.fr/hal-00399754 + +METADATA-ONLY + + +### espace.library.uq.edu.au (SKIP) + +-[ RECORD 1 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:136497 +base_url | https://espace.library.uq.edu.au/view/UQ:136497 +terminal_url | https://espace.library.uq.edu.au/view/UQ:136497 +-[ RECORD 2 ]+------------------------------------------------ 
+oai_id | oai:espace.library.uq.edu.au:uq:411389 +base_url | https://espace.library.uq.edu.au/view/UQ:411389 +terminal_url | https://espace.library.uq.edu.au/view/UQ:411389 +-[ RECORD 3 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:401773 +base_url | https://espace.library.uq.edu.au/view/UQ:401773 +terminal_url | https://espace.library.uq.edu.au/view/UQ:401773 +-[ RECORD 4 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:675334 +base_url | https://espace.library.uq.edu.au/view/UQ:675334 +terminal_url | https://espace.library.uq.edu.au/view/UQ:675334 +-[ RECORD 5 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:312311 +base_url | https://espace.library.uq.edu.au/view/UQ:312311 +terminal_url | https://espace.library.uq.edu.au/view/UQ:312311 +-[ RECORD 6 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:209401 +base_url | https://espace.library.uq.edu.au/view/UQ:209401 +terminal_url | https://espace.library.uq.edu.au/view/UQ:209401 +-[ RECORD 7 ]+------------------------------------------------ +oai_id | oai:espace.library.uq.edu.au:uq:327188 +base_url | https://espace.library.uq.edu.au/view/UQ:327188 +terminal_url | https://espace.library.uq.edu.au/view/UQ:327188 + +Very javascript heavy (skeletal HTML). And just links to fulltext on publisher +sites. 
+ +### igi.indrastra.com (METADATA-ONLY) + +-[ RECORD 1 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:267221 +base_url | http://igi.indrastra.com/items/show/267221 +terminal_url | http://igi.indrastra.com/items/show/267221 +-[ RECORD 2 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:181799 +base_url | http://igi.indrastra.com/items/show/181799 +terminal_url | http://igi.indrastra.com/items/show/181799 +-[ RECORD 3 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:125382 +base_url | http://igi.indrastra.com/items/show/125382 +terminal_url | http://igi.indrastra.com/items/show/125382 +-[ RECORD 4 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:47266 +base_url | http://igi.indrastra.com/items/show/47266 +terminal_url | http://igi.indrastra.com/items/show/47266 +-[ RECORD 5 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:12872 +base_url | http://igi.indrastra.com/items/show/12872 +terminal_url | http://igi.indrastra.com/items/show/12872 +-[ RECORD 6 ]+--------------------------------------------------------- +oai_id | oai:igi.indrastra.com:231620 +base_url | http://igi.indrastra.com/items/show/231620 +terminal_url | http://igi.indrastra.com/items/show/231620 + +"Proudly powered by Omeka" + +### invenio.nusl.cz (METADATA-ONLY) + + oai_id | base_url | terminal_url +----------------------------+------------------------------------+-------------------------------------- + oai:invenio.nusl.cz:237409 | http://www.nusl.cz/ntk/nusl-237409 | http://invenio.nusl.cz/record/237409 + oai:invenio.nusl.cz:180783 | http://www.nusl.cz/ntk/nusl-180783 | http://invenio.nusl.cz/record/180783 + oai:invenio.nusl.cz:231961 | http://www.nusl.cz/ntk/nusl-231961 | http://invenio.nusl.cz/record/231961 + oai:invenio.nusl.cz:318800 | http://www.nusl.cz/ntk/nusl-318800 | 
http://invenio.nusl.cz/record/318800 + oai:invenio.nusl.cz:259695 | http://www.nusl.cz/ntk/nusl-259695 | http://invenio.nusl.cz/record/259695 + oai:invenio.nusl.cz:167393 | http://www.nusl.cz/ntk/nusl-167393 | http://invenio.nusl.cz/record/167393 + oai:invenio.nusl.cz:292987 | http://www.nusl.cz/ntk/nusl-292987 | http://invenio.nusl.cz/record/292987 + oai:invenio.nusl.cz:283396 | http://www.nusl.cz/ntk/nusl-283396 | http://invenio.nusl.cz/record/283396 + oai:invenio.nusl.cz:241512 | http://www.nusl.cz/ntk/nusl-241512 | http://invenio.nusl.cz/record/241512 + oai:invenio.nusl.cz:178631 | http://www.nusl.cz/ntk/nusl-178631 | http://invenio.nusl.cz/record/178631 + +Metadata only (at least this set) + +### hypotheses.org + +-[ RECORD 1 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:mittelalter/9529 +base_url | http://mittelalter.hypotheses.org/9529 +terminal_url | https://mittelalter.hypotheses.org/9529 +-[ RECORD 2 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/18638 +base_url | http://archivalia.hypotheses.org/18638 +terminal_url | https://archivalia.hypotheses.org/18638 +-[ RECORD 3 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivalia/13614 +base_url | http://archivalia.hypotheses.org/13614 +terminal_url | https://archivalia.hypotheses.org/13614 +-[ RECORD 4 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:teteschercheuses/2785 +base_url | http://teteschercheuses.hypotheses.org/2785 +terminal_url | https://teteschercheuses.hypotheses.org/2785 +-[ RECORD 5 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:altervsego/608 +base_url | http://altervsego.hypotheses.org/608 +terminal_url | http://altervsego.hypotheses.org/608 +-[ RECORD 6 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:archivewk1/21905 +base_url | http://archivewk1.hypotheses.org/21905 +terminal_url | 
https://archivewk1.hypotheses.org/21905 +-[ RECORD 7 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:slkdiaspo/3321 +base_url | http://slkdiaspo.hypotheses.org/3321 +terminal_url | https://slkdiaspo.hypotheses.org/3321 +-[ RECORD 8 ]+--------------------------------------------- +oai_id | oai:hypotheses.org:diga/280 +base_url | http://diga.hypotheses.org/280 +terminal_url | https://diga.hypotheses.org/280 + +These are all a big mix... basically blogs. Should continue crawling, but expect no yield. + +### t2r2.star.titech.ac.jp (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00105099 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100499795 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00101346 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100495549 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50161100 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100632554 +-[ RECORD 4 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:00232407 +base_url | 
http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100527528 +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50120040 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100612598 +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50321440 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100713492 +-[ RECORD 7 ]+---------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50235666 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100668778 + + +### quod.lib.umich.edu + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0015.003-2 +base_url | http://name.umdl.umich.edu/acf2679.0015.003 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0015.003 +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:b14970.0001.001 +base_url | http://name.umdl.umich.edu/B14970.0001.001 +terminal_url | 
https://quod.lib.umich.edu/cgi/t/text/text-idx?c=eebo2;idno=B14970.0001.001 +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acf2679.0009.010-3 +base_url | http://name.umdl.umich.edu/ACF2679-1623SOUT-209 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acf2679.0009.010;node=acf2679.0009.010:3 +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-16.006-43 +base_url | http://name.umdl.umich.edu/acg2248.1-16.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-16.006 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg2248.1-14.011-9 +base_url | http://name.umdl.umich.edu/ACG2248-1489LADI-364 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg2248.1-14.011;node=acg2248.1-14.011:9 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:acg1336.1-24.006-9 +base_url | http://name.umdl.umich.edu/acg1336.1-24.006 +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=moajrnl;idno=acg1336.1-24.006 +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------- +oai_id | oai:quod.lib.umich.edu:africanamer.0002.32a +base_url | http://name.umdl.umich.edu/africanamer.0002.32a +terminal_url | https://quod.lib.umich.edu/cgi/t/text/text-idx?c=africanamer;idno=africanamer.0002.32a + +These are... issues of journals? Should continue to crawl, but not expect much. 
+ +### evastar-karlsruhe.de (METADATA-ONLY) + +-[ RECORD 1 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:270011444 +base_url | https://publikationen.bibliothek.kit.edu/270011444 +terminal_url | https://publikationen.bibliothek.kit.edu/270011444 +-[ RECORD 2 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000050117 +base_url | https://publikationen.bibliothek.kit.edu/1000050117 +terminal_url | https://publikationen.bibliothek.kit.edu/1000050117 +-[ RECORD 3 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:362296 +base_url | https://publikationen.bibliothek.kit.edu/362296 +terminal_url | https://publikationen.bibliothek.kit.edu/362296 +-[ RECORD 4 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:23042000 +base_url | https://publikationen.bibliothek.kit.edu/23042000 +terminal_url | https://publikationen.bibliothek.kit.edu/23042000 +-[ RECORD 5 ]+---------------------------------------------------- +oai_id | oai:evastar-karlsruhe.de:1000069945 +base_url | https://publikationen.bibliothek.kit.edu/1000069945 +terminal_url | https://publikationen.bibliothek.kit.edu/1000069945 + + +### repository.ust.hk + +-[ RECORD 1 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-67233 +base_url | http://repository.ust.hk/ir/Record/1783.1-67233 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-67233 +-[ RECORD 2 
]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-63232 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=A1981KV47900017 +terminal_url | http://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253AA1981KV47900017%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=http%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com +-[ RECORD 3 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-2891 +base_url | http://gateway.isiknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=LinksAMR&SrcApp=PARTNER_APP&DestLinkType=FullRecord&DestApp=WOS&KeyUT=000240035400103 +terminal_url | https://login.webofknowledge.com/error/Error?Src=IP&Alias=WOK5&Error=IPError&Params=DestParams%3D%253FUT%253DWOS%253A000240035400103%2526customersID%253DLinksAMR%2526product%253DWOS%2526action%253Dretrieve%2526mode%253DFullRecord%26DestApp%3DWOS%26SrcApp%3DPARTNER_APP%26SrcAuth%3DLinksAMR&PathInfo=%2F&RouterURL=https%3A%2F%2Fwww.webofknowledge.com%2F&Domain=.webofknowledge.com 
+-[ RECORD 4 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-56231 +base_url | http://repository.ust.hk/ir/Record/1783.1-56231 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-56231 + +[...] + +-[ RECORD 6 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-24872 +base_url | http://repository.ust.hk/ir/Record/1783.1-24872 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-24872 +-[ RECORD 7 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-3457 +base_url | http://lbdiscover.ust.hk/uresolver?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +terminal_url | 
http://lbdiscover.ust.hk/uresolver/?url_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:journal&rfr_id=info:sid/HKUST:SPI&rft.genre=article&rft.issn=0003-6870&rft.volume=40&rft.issue=2&rft.date=2009&rft.spage=267&rft.epage=279&rft.aulast=Witana&rft.aufirst=Channa+R.&rft.atitle=Effects+of+surface+characteristics+on+the+plantar+shape+of+feet+and+subjects'+perceived+sensations +-[ RECORD 8 ]+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-73215 +base_url | http://repository.ust.hk/ir/Record/1783.1-73215 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-73215 + +DONE: gateway.isiknowledge.com is bogus/blocking? + + +### edoc.mpg.de (SKIP-DEPRECATED) + + oai_id | base_url | terminal_url +------------------------+---------------------------+--------------------------- + oai:edoc.mpg.de:416650 | http://edoc.mpg.de/416650 | http://edoc.mpg.de/416650 + oai:edoc.mpg.de:8195 | http://edoc.mpg.de/8195 | http://edoc.mpg.de/8195 + oai:edoc.mpg.de:379655 | http://edoc.mpg.de/379655 | http://edoc.mpg.de/379655 + oai:edoc.mpg.de:641179 | http://edoc.mpg.de/641179 | http://edoc.mpg.de/641179 + oai:edoc.mpg.de:607141 | http://edoc.mpg.de/607141 | http://edoc.mpg.de/607141 + oai:edoc.mpg.de:544412 | http://edoc.mpg.de/544412 | http://edoc.mpg.de/544412 + oai:edoc.mpg.de:314531 | http://edoc.mpg.de/314531 | http://edoc.mpg.de/314531 + oai:edoc.mpg.de:405047 | http://edoc.mpg.de/405047 | http://edoc.mpg.de/405047 + oai:edoc.mpg.de:239650 | http://edoc.mpg.de/239650 | http://edoc.mpg.de/239650 + oai:edoc.mpg.de:614852 | http://edoc.mpg.de/614852 | http://edoc.mpg.de/614852 + +This whole instance seems to have been 
replaced + +### bibliotecadigital.jcyl.es (SKIP-DIGITIZED) + +-[ RECORD 1 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000039962 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10044664 +-[ RECORD 2 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14075 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14075 +-[ RECORD 3 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:4842 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=4842 +-[ RECORD 4 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:14799 +base_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/consulta/registro.cmd?id=14799 +-[ RECORD 5 ]+-------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:821 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=1003474 + +Digitized images as pages; too much to deal with for now. 
+ +### orbi.ulg.ac.be + +-[ RECORD 1 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/128079 +base_url | https://orbi.uliege.be/handle/2268/128079 +terminal_url | https://orbi.uliege.be/handle/2268/128079 +-[ RECORD 2 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/67659 +base_url | https://orbi.uliege.be/handle/2268/67659 +terminal_url | https://orbi.uliege.be/handle/2268/67659 +-[ RECORD 3 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/35521 +base_url | https://orbi.uliege.be/handle/2268/35521 +terminal_url | https://orbi.uliege.be/handle/2268/35521 +-[ RECORD 4 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/107922 +base_url | https://orbi.uliege.be/handle/2268/107922 +terminal_url | https://orbi.uliege.be/handle/2268/107922 +-[ RECORD 5 ]+---------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/215694 +base_url | https://orbi.uliege.be/handle/2268/215694 +terminal_url | https://orbi.uliege.be/handle/2268/215694 + +Described below. 
+ +### library.wur.nl (FIXED-BESPOKE) + + oai_id | base_url | terminal_url + -----------------------------------+------------------------------------------------+------------------------------------------------ + oai:library.wur.nl:wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 | https://library.wur.nl/WebQuery/wurpubs/440939 + oai:library.wur.nl:wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 | https://library.wur.nl/WebQuery/wurpubs/427707 + oai:library.wur.nl:wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 | https://library.wur.nl/WebQuery/wurpubs/359208 + oai:library.wur.nl:wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 | https://library.wur.nl/WebQuery/wurpubs/433378 + oai:library.wur.nl:wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 | https://library.wur.nl/WebQuery/wurpubs/36416 + oai:library.wur.nl:wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 | https://library.wur.nl/WebQuery/wurpubs/469930 + oai:library.wur.nl:wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 | https://library.wur.nl/WebQuery/wurpubs/350076 + oai:library.wur.nl:wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 | https://library.wur.nl/WebQuery/wurpubs/19109 + oai:library.wur.nl:wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 | https://library.wur.nl/WebQuery/wurpubs/26146 + oai:library.wur.nl:wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 | https://library.wur.nl/WebQuery/wurpubs/529922 + (10 rows) + +Seems like a one-off site? But added a pattern. 
+ +### pure.atira.dk + +-[ RECORD 1 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/a27762fd-0919-4753-af55-00b9b26d02e0 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/hightech-cities-and-the-primitive-jungle-visionary-urbanism-in-europe-and-japan-of-the-1960s(a27762fd-0919-4753-af55-00b9b26d02e0).html +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/215c8b96-a821-4947-bee4-c7470e9fbaf8 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/service-recovery-in-health-services--understanding-the-desired-qualities-and-behaviours-of-general-practitioners-during-service-recovery-encounters(215c8b96-a821-4947-bee4-c7470e9fbaf8).html +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | 
oai:pure.atira.dk:publications/95d4920a-12c7-4e25-b86c-5f075ea23a38 +base_url | https://www.tandfonline.com/doi/full/10.1080/03057070.2016.1197694 +terminal_url | https://www.tandfonline.com/action/cookieAbsent +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:pure.atira.dk:publications/8a2508ee-14c9-4c6a-851a-6db442090f41 +base_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html +terminal_url | https://www.research.manchester.ac.uk/portal/en/publications/microstructure-and-grain-size-dependence-of-ferroelectric-properties-of-batio3-thin-films-on-lanio3-buffered-si(8a2508ee-14c9-4c6a-851a-6db442090f41).html + +Metadata only + +DONE: /cookieAbsent is cookie block + https://www.tandfonline.com/action/cookieAbsent + +### bib-pubdb1.desy.de (FIXED-INVENIO) + +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:96756 +base_url | http://bib-pubdb1.desy.de/record/96756 +terminal_url | http://bib-pubdb1.desy.de/record/96756 + +Metadata only. + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:416556 +base_url | http://bib-pubdb1.desy.de/record/416556 +terminal_url | http://bib-pubdb1.desy.de/record/416556 + +Fixed! 
+ +-[ RECORD 4 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:414545 +base_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +terminal_url | http://bib-pubdb1.desy.de/search?p=id:%22PUBDB-2018-04027%22 +-[ RECORD 5 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:170169 +base_url | http://bib-pubdb1.desy.de/record/170169 +terminal_url | http://bib-pubdb1.desy.de/record/170169 +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:191154 +base_url | http://bib-pubdb1.desy.de/record/191154 +terminal_url | http://bib-pubdb1.desy.de/record/191154 + +Metadata only + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:155092 +base_url | http://bib-pubdb1.desy.de/record/155092 +terminal_url | http://bib-pubdb1.desy.de/record/155092 + +Fixed! 
+ +-[ RECORD 8 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bib-pubdb1.desy.de:97158 +base_url | http://bib-pubdb1.desy.de/record/97158 +terminal_url | http://bib-pubdb1.desy.de/record/97158 + +Metadata only + +"Powered by Invenio v1.1.7" + +Can/should skip the "search" URLs + +### serval.unil.ch + +-[ RECORD 1 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_60346fc75171 +base_url | https://serval.unil.ch/notice/serval:BIB_60346FC75171 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_60346FC75171 +-[ RECORD 2 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_4db47fc4b593 +base_url | https://serval.unil.ch/notice/serval:BIB_4DB47FC4B593 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_4DB47FC4B593 +-[ RECORD 3 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_57aac24fe115 +base_url | http://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +terminal_url | https://nbn-resolving.org/urn/resolver.pl?urn=urn:nbn:ch:serval-BIB_57AAC24FE1154 +-[ RECORD 4 
]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_deabae6baf6c +base_url | https://serval.unil.ch/notice/serval:BIB_DEABAE6BAF6C +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DEABAE6BAF6C +-[ RECORD 5 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_a5ec0df1370f +base_url | https://serval.unil.ch/notice/serval:BIB_A5EC0DF1370F +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253Aed270c26d4a36cefd1bf6a840472abe0ee5556cb5f3b42de708f3ea984775dfd +-[ RECORD 6 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_080300c2e23c +base_url | https://serval.unil.ch/resource/serval:BIB_080300C2E23C.P001/REF.pdf +terminal_url | https://wayf.switch.ch/SWITCHaai/WAYF?entityID=https%3A%2F%2Fmy.unil.ch%2Fshibboleth&return=https%3A%2F%2Fserval.unil.ch%2FShibboleth.sso%2FLogin%3FSAMLDS%3D1%26target%3Dss%253Amem%253A154453d78a0fb75ffa220f7b6fe73b29447fa6ed048addf31897b41001f44679 +-[ RECORD 7 
]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_de777dd2b07f +base_url | https://serval.unil.ch/notice/serval:BIB_DE777DD2B07F +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_DE777DD2B07F +-[ RECORD 8 ]+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:serval.unil.ch:bib_5e824e244c27 +base_url | https://serval.unil.ch/notice/serval:BIB_5E824E244C27 +terminal_url | https://serval.unil.ch/en/notice/serval:BIB_5E824E244C27 + +Metadata only? See elsewhere. + +### Random Links + +-[ RECORD 1 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dbc.wroc.pl:41031 +base_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 +terminal_url | https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031 + +This is some platform/package thing. PDF is in an iframe. Platform is "DLibra". +FIXED-DLIBRA + +-[ RECORD 2 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:orbi.ulg.ac.be:2268/174291 +base_url | https://orbi.uliege.be/handle/2268/174291 +terminal_url | https://orbi.uliege.be/handle/2268/174291 + +DSpace platform. There are multiple files, and little to "select" on. 
+ +https://orbi.uliege.be/handle/2268/174200 has only single PDF and easier to work with + +PARTIAL-DSPACE + +-[ RECORD 3 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.tue.nl:664163 +base_url | http://repository.tue.nl/664163 +terminal_url | http://repository.tue.nl/664163 + +Ah, this is the Pure platform from Elsevier. +Redirects to: https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance + +FIXED-PURE + + +-[ RECORD 4 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:juser.fz-juelich.de:49579 +base_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 +terminal_url | http://juser.fz-juelich.de/search?p=id:%22PreJuSER-49579%22 + +(handled above) + +-[ RECORD 5 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/97937 +base_url | https://orcid.org/0000-0002-2066-2082 +terminal_url | https://orcid.org/0000-0002-2066-2082 + +ORCID! Skip it. + +DONE: skip orcid.org in `terminal_url`, and/or at harvest/transform time. + +-[ RECORD 6 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:edoc.mpg.de:360269 +base_url | http://edoc.mpg.de/360269 +terminal_url | http://edoc.mpg.de/360269 + +Seems like this whole repo has disappeared, or been replaced by... pure? maybe a different pure?
+ +DONE: edoc.mpg.de -> pure.mpg.de + +-[ RECORD 7 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:books.openedition.org:msha/17716 +base_url | http://books.openedition.org/msha/17716 +terminal_url | https://books.openedition.org/msha/17716 + +Open edition is free to read HTML, but not PDF (or epub, etc). + +TODO: for some? all? openedition books records, try HTML ingest (not PDF ingest) + +HTML-WORKED + +-[ RECORD 8 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epub.oeaw.ac.at:0x003aba48 +base_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf +terminal_url | http://epub.oeaw.ac.at/?arp=8609-0inhalt/B02_2146_FP_Flores%20Castillo.pdf + +requires login + +FORBIDDEN + +-[ RECORD 9 ]+--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.mit.edu:1721.1/88986 +base_url | https://orcid.org/0000-0002-4147-2560 +terminal_url | https://orcid.org/0000-0002-4147-2560 + +DONE: skip orcids + +-[ RECORD 10 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.ust.hk:1783.1-28786 +base_url | http://repository.ust.hk/ir/Record/1783.1-28786 +terminal_url | http://repository.ust.hk/ir/Record/1783.1-28786 + +Generator: VuFind 5.1.1 +just a metadata record + +METADATA-ONLY + +-[ RECORD 11 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:rcin.org.pl:50797 +base_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 +terminal_url | http://195.187.71.10/ipac20/ipac.jsp?profile=iblpan&index=BOCLC&term=cc95215472 + +Seems like a software platform? not sure. 
+ +METADATA-ONLY + +-[ RECORD 12 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dea.lib.unideb.hu:2437/69641 +base_url | http://webpac.lib.unideb.hu:8082/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 +terminal_url | https://webpac.lib.unideb.hu/WebPac/CorvinaWeb?action=cclfind&resultview=long&ccltext=idno+bibFSZ1008709 + +-[ RECORD 13 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/64871 +base_url | http://handle.unsw.edu.au/1959.4/64871 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_62832&context=L + +-[ RECORD 14 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:www.wbc.poznan.pl:225930 +base_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 +terminal_url | https://www.wbc.poznan.pl/dlibra/docmetadata?showContent=true&id=225930 + +SOFT-404 + +-[ RECORD 15 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:repository.erciyes.edu.tr:105 +base_url | http://repository.erciyes.edu.tr/bilimname/items/show/105 +terminal_url | http://repository.erciyes.edu.tr:80/bilimname/items/show/105 + +GONE (domain not registered) + +-[ RECORD 16 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:37500 +base_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 +terminal_url | https://archivum-laureshamense-digital.de/view/sad_a1_nr_20_13 + +Seems like a bespoke site + +SKIP-BESPOKE + +-[ RECORD 17 
]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:t2r2.star.titech.ac.jp:50401364 +base_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 +terminal_url | http://t2r2.star.titech.ac.jp/cgi-bin/publicationinfo.cgi?q_publication_content_number=CTT100758313 + +METADATA-ONLY + +-[ RECORD 18 ]--------------------------------------------------------------------------------------------------------------------- +oai_id | oai:epubs.cclrc.ac.uk:work/4714 +base_url | http://purl.org/net/epubs/work/4714 +terminal_url | https://epubs.stfc.ac.uk/work/4714 + +It's got a purl! haha. + +METADATA-ONLY + +------ + +Another batch! With some repeat domains removed. + +-[ RECORD 1 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:cris.vtt.fi:persons/142c030f-ba7b-491a-8669-a361088355cc +base_url | https://cris.vtt.fi/en/persons/142c030f-ba7b-491a-8669-a361088355cc +terminal_url | https://cris.vtt.fi/en/persons/oleg-antropov + +SKIP + +-[ RECORD 2 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:etd.adm.unipi.it:etd-05302014-183910 +base_url | http://etd.adm.unipi.it/theses/available/etd-05302014-183910/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-05302014-183910/ + +Some software platform? 
Pretty basic/bespoke + +FIXED-PARTIAL + +-[ RECORD 3 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:bibliotecadigital.jcyl.es:10000098246 +base_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 +terminal_url | http://bibliotecadigital.jcyl.es/i18n/catalogo_imagenes/grupo.cmd?path=10316451 + +SKIP (see elsewhere) + +-[ RECORD 7 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:elektra.cdaea.es:documento.29259 +base_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 +terminal_url | https://www.juntadeandalucia.es/cultura/cdaea/elektra/catalogo_execute.html?tipoObjeto=1&id=29259 + +Photo. + +SKIP-SCOPE + +-[ RECORD 9 ]+----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:unsworks.library.unsw.edu.au:1959.4/unsworks_60829 +base_url | http://handle.unsw.edu.au/1959.4/unsworks_60829 +terminal_url | https://www.unsworks.unsw.edu.au/primo-explore/fulldisplay?vid=UNSWORKS&docid=unsworks_modsunsworks_60829&context=L + +METADATA-ONLY + +-[ RECORD 12 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:pure.leuphana.de:publications/7d040cf2-b3b5-4671-8906-76b5bc8d870a +base_url | http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html +terminal_url | 
http://fox.leuphana.de/portal/de/publications/studies-in-childrens-literature-1500--2000-editors-celia-keenan-(7d040cf2-b3b5-4671-8906-76b5bc8d870a).html + +unsure + +-[ RECORD 16 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:library.wur.nl:wurpubs/369344 +base_url | https://library.wur.nl/WebQuery/wurpubs/369344 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/369344 + +this specific record not OA (but site is fine/fixed) + +-[ RECORD 17 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:escholarship.umassmed.edu:oapubs-2146 +base_url | https://escholarship.umassmed.edu/oapubs/1147 +terminal_url | http://escholarship.umassmed.edu/oapubs/1147/ + +just links to publisher (no content in repo) + +-[ RECORD 18 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digitalcommons.usu.edu:wild_facpub-1010 +base_url | https://digitalcommons.usu.edu/wild_facpub/11 +terminal_url | http://digitalcommons.usu.edu/wild_facpub/11/ + +also just links to publisher (no content in repo) + +-[ RECORD 25 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:igi.indrastra.com:306768 +base_url | http://igi.indrastra.com/items/show/306768 +terminal_url | http://igi.indrastra.com/items/show/306768 + +(see elsewhere) + +-[ RECORD 26 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:fau.digital.flvc.org:fau_9804 +base_url | http://purl.flvc.org/fcla/dt/12932 
+terminal_url | http://fau.digital.flvc.org/islandora/object/fau%3A9804 + +Islandora. + +-[ RECORD 27 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:dspace.lu.lv:7/16019 +base_url | https://dspace.lu.lv/dspace/handle/7/16019 +terminal_url | https://dspace.lu.lv/dspace/handle/7/16019 + +LOGINWALL + +-[ RECORD 28 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:zir.nsk.hr:umas_218 +base_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 +terminal_url | https://repozitorij.svkst.unist.hr/islandora/object/umas:218 + +REMOVED + + +-[ RECORD 29 ]----------------------------------------------------------------------------------------------------------------------------------------------------------------- +oai_id | oai:digi.ub.uni-heidelberg.de:36390 +base_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 +terminal_url | https://digi.hadw-bw.de/view/sbhadwmnkl_a_1917_5 + +Book, with chapters, not an individual work. 
+ +-[ RECORD 2 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:krm.or.kr:10056135m201r +base_url | https://www.krm.or.kr/krmts/link.html?dbGubun=SD&m201_id=10056135&res=y +terminal_url | https://www.krm.or.kr/krmts/search/detailview/research.html?dbGubun=SD&category=Research&m201_id=10056135 + +research results repository; keep crawling + +SKIP-SCOPE + +-[ RECORD 3 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:www.db-thueringen.de:dbt_mods_00005191 +base_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 +terminal_url | https://www.db-thueringen.de/receive/dbt_mods_00005191 + +powered by "MyCoRe" + +FIXED-MYCORE + +-[ RECORD 6 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bibliotecavirtualandalucia.juntadeandalucia.es:1017405 +base_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 +terminal_url | http://www.bibliotecavirtualdeandalucia.es/catalogo/es/consulta/registro.cmd?id=1017405 + +seems to be a general purpose regional library? 
not research-specific + +SKIP-UNSURE + +-[ RECORD 7 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:etd.adm.unipi.it:etd-02272019-123644 +base_url | http://etd.adm.unipi.it/theses/available/etd-02272019-123644/ +terminal_url | https://etd.adm.unipi.it/theses/available/etd-02272019-123644/ + +This specific URL is not available (FORBIDDEN) + +others have multiple files, not just a single PDF: +https://etd.adm.unipi.it/t/etd-09102013-124430/ + +SKIP-UNSURE + +-[ RECORD 9 ]+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:commons.ln.edu.hk:sw_master-5408 +base_url | https://commons.ln.edu.hk/sw_master/4408 +terminal_url | https://commons.ln.edu.hk/sw_master/4408/ + +worth crawling I guess + +METADATA-ONLY + +-[ RECORD 10 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:mouseion.jax.org:ssbb1976-1224 +base_url | https://mouseion.jax.org/ssbb1976/225 +terminal_url | https://mouseion.jax.org/ssbb1976/225/ + +METADATA-ONLY + +-[ RECORD 13 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aleph.bib-bvb.de:bvb01-016604343 +base_url | http://bvbm1.bib-bvb.de/webclient/DeliveryManager?pid=176332&custom_att_2=simple_viewer +terminal_url | http://digital.bib-bvb.de/view/action/singleViewer.do?dvs=1593269021002~476&locale=en_US&VIEWER_URL=/view/action/singleViewer.do?&DELIVERY_RULE_ID=31&frameId=1&usePid1=true&usePid2=true + +SOFT-404 / FORBIDDEN (cookie timeout) + +-[ RECORD 14 
]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:bivaldi.gva.es:11740 +base_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 +terminal_url | https://bivaldi.gva.es/es/consulta/registro.do?id=11740 + + +-[ RECORD 16 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:library.wur.nl:wurpubs/443282 +base_url | https://library.wur.nl/WebQuery/wurpubs/443282 +terminal_url | https://library.wur.nl/WebQuery/wurpubs/443282 + +DIGIBIS platform (like some others) + +FIXED-PARTIAL + +-[ RECORD 18 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:hal:in2p3-00414135v1 +base_url | http://hal.in2p3.fr/in2p3-00414135 +terminal_url | http://hal.in2p3.fr:80/in2p3-00414135 + +METADATA-ONLY + +-[ RECORD 19 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:aaltodoc.aalto.fi:123456789/13201 +base_url | https://aaltodoc.aalto.fi/handle/123456789/13201 +terminal_url | https://aaltodoc.aalto.fi/handle/123456789/13201 + +This specific record is not accessible. 
+Another: https://aaltodoc.aalto.fi/handle/123456789/38002 + +DSpace 5.4 + +Worked (from recent changes) + + +-[ RECORD 20 ]------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +oai_id | oai:sedici.unlp.edu.ar:10915/40144 +base_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view +terminal_url | http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view + +This is a journal! Cool. Plone software platform. + +FIXED + +## Top no-capture Domains + +Top terminal no-capture domains: + + SELECT domain, COUNT(domain) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_file_result.status = 'no-capture' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 30; + + domain | count + -----------------------------------+------- + digitalrepository.unm.edu | 94087 + escholarship.org | 80632 + ir.opt.ac.cn | 70504 + idus.us.es | 67908 + www.cambridge.org | 56376 + www.ssoar.info | 52534 + rep.bntu.by | 52127 + scholarworks.umt.edu | 48546 + publikationen.ub.uni-frankfurt.de | 46987 + dk.um.si | 45753 + repositorio.uladech.edu.pe | 37028 + uu.diva-portal.org | 34929 + digitalcommons.law.byu.edu | 31732 + sedici.unlp.edu.ar | 31233 + elib.sfu-kras.ru | 29131 + jyx.jyu.fi | 28144 + www.repository.cam.ac.uk | 27728 + nagoya.repo.nii.ac.jp | 26673 + www.duo.uio.no | 25258 + www.persee.fr | 24968 + www2.senado.leg.br | 24426 + tesis.ucsm.edu.pe | 24049 + digitalcommons.unl.edu | 
21974 + www.degruyter.com | 21940 + www.igi-global.com | 20736 + thekeep.eiu.edu | 20712 + docs.lib.purdue.edu | 20538 + repositorio.cepal.org | 20280 + elib.bsu.by | 19620 + minds.wisconsin.edu | 19473 + (30 rows) + +These all seem worth crawling. A couple publishers (cambridge.org), and +persee.fr will probably fail, but not too many URLs. + +## Summary of Filtered Prefixes and Domains (OAI-PMH) + +oai:kb.dk: + too large and generic +oai:bdr.oai.bsb-muenchen.de: + too large and generic +oai:hispana.mcu.es: + too large and generic +oai:bnf.fr: + too large and generic +oai:ukm.si: + too large and generic +oai:biodiversitylibrary.org: + redundant with other ingest and archive.org content +oai:hsp.org: + large; historical content only +oai:repec: + large; mostly (entirely?) links to publisher sites +oai:n/a: + meta? +oai:quod.lib.umich.edu: + entire issues? hard to crawl so skip for now +oai:hypotheses.org: + HTML, not PDF +oai:americanae.aecid.es: + large, complex. skip for now +oai:www.irgrid.ac.cn: + aggregator of other IRs +oai:espace.library.uq.edu.au: + large; metadata only; javascript heavy (poor heritrix crawling) +oai:edoc.mpg.de: + deprecated domain, with no redirects +oai:bibliotecadigital.jcyl.es: + digitized historical docs; hard to crawl, skip for now +oai:repository.erciyes.edu.tr: + gone (domain lapsed) +oai:krm.or.kr: + "research results repository" (metadata only) + +www.kb.dk + large, general purpose, scope +kb-images.kb.dk + deprecated +mdz-nbn-resolving.de + multiple prefixes end up here. 
historical docs, scope +aggr.ukm.um.si + large, out of scope +edoc.mpg.de + deprecated domain +doaj.org + index (metadata only) +orcid.org + out of scope +gateway.isiknowledge.com + clarivate login/paywall (skipping in ingest) + +Needs filtering to a subset of records (by 'set' or other filtering?): + +oai:igi.indrastra.com: +oai:invenio.nusl.cz: +oai:t2r2.star.titech.ac.jp: +oai:evastar-karlsruhe.de: +oai:repository.ust.hk: +oai:serval.unil.ch: +oai:pure.atira.dk: + +Filters in SQL syntax: + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu.au:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND
ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + +and in some contexts (PDFs; switch to HTML): + + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + +## Overall Summary of OAI-PMH Stuff + +Big picture is that the majority of `no-pdf-link` crawl statuses are because of +repository scope, record scope, or content format issues. That being said, +there was a sizable fraction of sites which were platforms (like DSpace) which +were not ingesting well. + +A significant fraction of records are "metadata only" (of papers), or non-paper +entity types (like persons, grants, or journal titles), and a growing fraction +(?) are metadata plus link to OA publisher fulltext (offsite). Might be +possible to detect these at ingest time, or earlier at OAI-PMH +harvest/transform time and filter them out. + +It may be worthwhile to attempt ingest of multiple existing captures +(timestamps) in the ingest pipeline. E.g., instead of choosing a single "best" +capture, if there are multiple HTTP 200 status captures, try ingest with each +(or at least a couple). This is because repository software gets upgraded, so +old "no-capture" or "not found" or "link loop" type captures may work when +recrawled.
+ +New summary with additional filters: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -----------------------+---------- + success | 12872279 + no-pdf-link | 9329602 + no-capture | 4696362 + redirect-loop | 1541458 + terminal-bad-status | 660418 + link-loop | 452831 + wrong-mimetype | 434868 + null-body | 71065 + cdx-error | 17005 + | 15275 + petabox-error | 12743 + wayback-error | 11759 + skip-url-blocklist | 182 + gateway-timeout | 122 + redirects-exceeded | 120 + bad-redirect | 117 + bad-gzip-encoding | 111 + wayback-content-error | 102 + timeout | 72 + blocked-cookie | 62 + (20 rows) + diff --git a/notes/ingest/2021-09-03_daily_improvements.md b/notes/ingest/2021-09-03_daily_improvements.md new file mode 100644 index 0000000..a0bb0c5 --- /dev/null +++ b/notes/ingest/2021-09-03_daily_improvements.md @@ -0,0 +1,1021 @@ + +Periodic check-in of daily crawling/ingest. + +Overall ingest status, past 30 days: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + + ingest_type | status | count + -------------+-------------------------+-------- + pdf | no-pdf-link | 158474 + pdf | spn2-cdx-lookup-failure | 135344 + pdf | success | 127938 + pdf | spn2-error | 65411 + pdf | gateway-timeout | 63112 + pdf | blocked-cookie | 26338 + pdf | terminal-bad-status | 24853 + pdf | link-loop | 15699 + pdf | spn2-error:job-failed | 13862 + pdf | redirect-loop | 11432 + pdf | cdx-error | 2376 + pdf | too-many-redirects | 2186 + pdf | wrong-mimetype | 2142 + pdf | 
forbidden | 1758 + pdf | spn2-error:no-status | 972 + pdf | not-found | 820 + pdf | bad-redirect | 536 + pdf | read-timeout | 392 + pdf | wayback-error | 251 + pdf | remote-server-error | 220 + (20 rows) + +Hrm, that is a healthy fraction of `no-pdf-link`. + +Broken domains, past 30 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -------------------------+-------------------------+------- + zenodo.org | no-pdf-link | 39678 + osf.io | gateway-timeout | 29809 + acervus.unicamp.br | no-pdf-link | 21978 + osf.io | terminal-bad-status | 18727 + zenodo.org | spn2-cdx-lookup-failure | 17008 + doi.org | spn2-cdx-lookup-failure | 15503 + www.degruyter.com | no-pdf-link | 15122 + ieeexplore.ieee.org | spn2-error:job-failed | 12921 + osf.io | spn2-cdx-lookup-failure | 11123 + www.tandfonline.com | blocked-cookie | 8096 + www.morressier.com | no-pdf-link | 4655 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 4580 + pubs.acs.org | blocked-cookie | 4415 + www.frontiersin.org | no-pdf-link | 4163 + www.degruyter.com | spn2-cdx-lookup-failure | 3788 + www.taylorfrancis.com | no-pdf-link | 3568 + www.sciencedirect.com | no-pdf-link | 3128 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 3116 + acervus.unicamp.br | spn2-cdx-lookup-failure | 2797 + www.mdpi.com | 
spn2-cdx-lookup-failure | 2719 + brill.com | link-loop | 2681 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 2657 + www.sciencedirect.com | spn2-cdx-lookup-failure | 2546 + apps.crossref.org | no-pdf-link | 2537 + onlinelibrary.wiley.com | blocked-cookie | 2528 + (25 rows) + +Summary of significant domains and status, past 30 days, minus spn2-cdx-lookup-failure: + + SELECT domain, status, count + FROM ( + SELECT domain, status, COUNT((domain, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_file_result.status != 'spn2-cdx-lookup-failure' + ) t1 + WHERE t1.domain != '' + GROUP BY CUBE (domain, status) + ) t2 + WHERE count > 200 + ORDER BY domain ASC , count DESC; + + + domain | status | count + -----------------------------------------------------------------+-----------------------+-------- + academic.oup.com | | 2405 + academic.oup.com | no-pdf-link | 1240 + academic.oup.com | link-loop | 1010 + acervus.unicamp.br | | 21980 + acervus.unicamp.br | no-pdf-link | 21978 ** + aclanthology.org | | 208 + acp.copernicus.org | | 365 + acp.copernicus.org | success | 356 + aip.scitation.org | | 1071 + aip.scitation.org | blocked-cookie | 843 + aip.scitation.org | redirect-loop | 227 + apps.crossref.org | | 2537 + apps.crossref.org | no-pdf-link | 2537 + arxiv.org | | 17817 + arxiv.org | success | 17370 + arxiv.org | terminal-bad-status | 320 + asmedigitalcollection.asme.org | | 401 + asmedigitalcollection.asme.org | link-loop | 364 + assets.researchsquare.com | | 3706 + 
assets.researchsquare.com | success | 3706 + avmj.journals.ekb.eg | | 605 + avmj.journals.ekb.eg | success | 595 + bfa.journals.ekb.eg | | 224 + bfa.journals.ekb.eg | success | 214 + biorxiv.org | redirect-loop | 895 + biorxiv.org | | 895 + birdsoftheworld.org | | 286 + birdsoftheworld.org | no-pdf-link | 285 + bmjopen.bmj.com | success | 232 + bmjopen.bmj.com | | 232 + books.openedition.org | | 396 + books.openedition.org | no-pdf-link | 396 + brill.com | | 4272 + brill.com | link-loop | 2681 + brill.com | no-pdf-link | 1410 + cas.columbia.edu | | 1038 + cas.columbia.edu | no-pdf-link | 1038 ** + cdr.lib.unc.edu | | 513 + cdr.lib.unc.edu | success | 469 + chemrxiv.org | | 278 + chemrxiv.org | success | 275 + classiques-garnier.com | | 531 + classiques-garnier.com | no-pdf-link | 487 * + content.iospress.com | | 275 + content.iospress.com | link-loop | 230 + cris.maastrichtuniversity.nl | | 318 + cris.maastrichtuniversity.nl | success | 284 + cyberleninka.ru | | 1165 + cyberleninka.ru | success | 1134 + deepblue.lib.umich.edu | | 289 + dergipark.org.tr | | 1185 + dergipark.org.tr | success | 774 + dergipark.org.tr | no-pdf-link | 320 + didaktorika.gr | | 688 + didaktorika.gr | redirect-loop | 688 + digi.ub.uni-heidelberg.de | | 292 + digi.ub.uni-heidelberg.de | no-pdf-link | 292 + direct.mit.edu | | 236 + direct.mit.edu | no-pdf-link | 207 * + dl.acm.org | | 2319 + dl.acm.org | blocked-cookie | 2230 + dmtcs.episciences.org | | 733 + dmtcs.episciences.org | success | 730 + doi.ala.org.au | no-pdf-link | 2373 ** + doi.ala.org.au | | 2373 + doi.org | | 732 + doi.org | terminal-bad-status | 673 + downloads.hindawi.com | success | 1452 + downloads.hindawi.com | | 1452 + drive.google.com | | 216 + drive.google.com | no-pdf-link | 211 + dtb.bmj.com | | 674 + dtb.bmj.com | link-loop | 669 + easy.dans.knaw.nl | no-pdf-link | 261 * + easy.dans.knaw.nl | | 261 + ebooks.marilia.unesp.br | | 688 + ebooks.marilia.unesp.br | no-pdf-link | 688 * + ehp.niehs.nih.gov | | 766 + 
ehp.niehs.nih.gov | blocked-cookie | 765 + ejournal.mandalanursa.org | | 307 + ejournal.mandalanursa.org | success | 305 + elib.spbstu.ru | | 264 + elib.spbstu.ru | redirect-loop | 257 + elibrary.ru | | 1367 + elibrary.ru | redirect-loop | 1169 + elibrary.vdi-verlag.de | | 1251 + elibrary.vdi-verlag.de | no-pdf-link | 646 + elibrary.vdi-verlag.de | link-loop | 537 + elifesciences.org | | 328 + elifesciences.org | success | 323 + figshare.com | | 803 + figshare.com | no-pdf-link | 714 * + files.osf.io | | 745 + files.osf.io | success | 614 + hammer.purdue.edu | | 244 + hammer.purdue.edu | no-pdf-link | 243 + heiup.uni-heidelberg.de | | 277 + heiup.uni-heidelberg.de | no-pdf-link | 268 + hkvalidate.perfdrive.com | no-pdf-link | 370 * + hkvalidate.perfdrive.com | | 370 + ieeexplore.ieee.org | | 16675 + ieeexplore.ieee.org | spn2-error:job-failed | 12927 + ieeexplore.ieee.org | success | 1952 + ieeexplore.ieee.org | too-many-redirects | 1193 + ieeexplore.ieee.org | no-pdf-link | 419 + jamanetwork.com | | 339 + jamanetwork.com | success | 216 + jmstt.ntou.edu.tw | | 244 + jmstt.ntou.edu.tw | success | 241 + journal.ipb.ac.id | | 229 + journal.ipb.ac.id | success | 206 + journal.nafe.org | | 221 + journals.aps.org | | 614 + journals.aps.org | gateway-timeout | 495 + journals.asm.org | | 463 + journals.asm.org | blocked-cookie | 435 + journals.flvc.org | | 230 + journals.lww.com | | 1300 + journals.lww.com | link-loop | 1284 + journals.openedition.org | | 543 + journals.openedition.org | success | 311 + journals.ub.uni-heidelberg.de | | 357 + journals.ub.uni-heidelberg.de | success | 311 + jov.arvojournals.org | | 431 + jov.arvojournals.org | no-pdf-link | 422 * + kiss.kstudy.com | | 303 + kiss.kstudy.com | no-pdf-link | 303 * + library.iated.org | | 364 + library.iated.org | redirect-loop | 264 + library.seg.org | blocked-cookie | 301 + library.seg.org | | 301 + link.aps.org | redirect-loop | 442 + link.aps.org | | 442 + linkinghub.elsevier.com | | 515 + 
linkinghub.elsevier.com | gateway-timeout | 392 + mc.sbm.org.br | | 224 + mc.sbm.org.br | success | 224 + mdpi-res.com | | 742 + mdpi-res.com | success | 742 + mdsoar.org | | 220 + mediarep.org | | 269 + mediarep.org | success | 264 + medrxiv.org | redirect-loop | 290 + medrxiv.org | | 290 + muse.jhu.edu | | 429 + muse.jhu.edu | terminal-bad-status | 391 + mvmj.journals.ekb.eg | | 306 + oapub.org | | 292 + oapub.org | success | 289 + onepetro.org | | 426 + onepetro.org | link-loop | 406 + onlinelibrary.wiley.com | | 2835 + onlinelibrary.wiley.com | blocked-cookie | 2531 + onlinelibrary.wiley.com | redirect-loop | 264 + open.library.ubc.ca | | 569 + open.library.ubc.ca | no-pdf-link | 425 * + opendata.uni-halle.de | | 407 + opendata.uni-halle.de | success | 263 + osf.io | | 49022 + osf.io | gateway-timeout | 29810 + osf.io | terminal-bad-status | 18731 + osf.io | spn2-error | 247 + osf.io | not-found | 205 + oxford.universitypressscholarship.com | | 392 + oxford.universitypressscholarship.com | link-loop | 233 + panor.ru | no-pdf-link | 433 * + panor.ru | | 433 + papers.ssrn.com | | 1630 + papers.ssrn.com | link-loop | 1598 + pdf.sciencedirectassets.com | | 3063 + pdf.sciencedirectassets.com | success | 3063 + peerj.com | | 464 + peerj.com | no-pdf-link | 303 * + periodicos.ufpe.br | | 245 + periodicos.ufpe.br | success | 232 + periodicos.unb.br | | 230 + periodicos.unb.br | success | 221 + preprints.jmir.org | | 548 + preprints.jmir.org | cdx-error | 499 + publications.rwth-aachen.de | | 213 + publikationen.bibliothek.kit.edu | | 346 + publikationen.bibliothek.kit.edu | success | 314 + publikationen.uni-tuebingen.de | | 623 + publikationen.uni-tuebingen.de | no-pdf-link | 522 * + publons.com | no-pdf-link | 934 * + publons.com | | 934 + pubs.acs.org | | 4507 + pubs.acs.org | blocked-cookie | 4406 + pubs.rsc.org | | 1638 + pubs.rsc.org | link-loop | 1054 + pubs.rsc.org | redirect-loop | 343 + pubs.rsc.org | success | 201 + repositorio.ufu.br | | 637 + 
repositorio.ufu.br | success | 607 + repository.dri.ie | | 1852 + repository.dri.ie | no-pdf-link | 1852 ** + repository.library.brown.edu | | 293 + repository.library.brown.edu | no-pdf-link | 291 * + res.mdpi.com | | 10367 + res.mdpi.com | success | 10360 + retrovirology.biomedcentral.com | | 230 + revistas.ufrj.br | | 284 + revistas.ufrj.br | success | 283 + revistas.uptc.edu.co | | 385 + revistas.uptc.edu.co | success | 344 + royalsocietypublishing.org | | 231 + rsdjournal.org | | 347 + rsdjournal.org | success | 343 + s3-ap-southeast-2.amazonaws.com | | 400 + s3-ap-southeast-2.amazonaws.com | success | 392 + s3-eu-west-1.amazonaws.com | | 2096 + s3-eu-west-1.amazonaws.com | success | 2091 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 289 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 286 + s3.ca-central-1.amazonaws.com | | 202 + sage.figshare.com | | 242 + sage.figshare.com | no-pdf-link | 241 + sajeb.org | | 246 + sajeb.org | no-pdf-link | 243 + scholar.dkyobobook.co.kr | | 332 + scholar.dkyobobook.co.kr | no-pdf-link | 328 * + search.mandumah.com | | 735 + search.mandumah.com | redirect-loop | 726 + secure.jbs.elsevierhealth.com | | 1112 + secure.jbs.elsevierhealth.com | blocked-cookie | 1108 + stm.bookpi.org | no-pdf-link | 468 * + stm.bookpi.org | | 468 + storage.googleapis.com | | 1012 + storage.googleapis.com | success | 1012 + tandf.figshare.com | | 469 + tandf.figshare.com | no-pdf-link | 466 + teses.usp.br | | 739 + teses.usp.br | success | 730 + tidsskrift.dk | | 360 + tidsskrift.dk | success | 346 + tiedejaedistys.journal.fi | | 224 + tind-customer-agecon.s3.amazonaws.com | success | 332 + tind-customer-agecon.s3.amazonaws.com | | 332 + valep.vc.univie.ac.at | no-pdf-link | 280 + valep.vc.univie.ac.at | | 280 + watermark.silverchair.com | | 1729 + watermark.silverchair.com | success | 1719 + www.academia.edu | | 387 + www.academia.edu | no-pdf-link | 386 + www.ahajournals.org | | 430 + 
www.ahajournals.org | blocked-cookie | 413 + www.atenaeditora.com.br | | 572 + www.atenaeditora.com.br | terminal-bad-status | 513 + www.atlantis-press.com | success | 722 + www.atlantis-press.com | | 722 + www.aup-online.com | | 419 + www.aup-online.com | no-pdf-link | 419 * + www.beck-elibrary.de | | 269 + www.beck-elibrary.de | no-pdf-link | 268 * + www.biodiversitylibrary.org | no-pdf-link | 528 * + www.biodiversitylibrary.org | | 528 + www.bloomsburycollections.com | | 623 + www.bloomsburycollections.com | no-pdf-link | 605 * + www.cabi.org | | 2191 + www.cabi.org | no-pdf-link | 2186 * + www.cairn.info | | 1283 + www.cairn.info | no-pdf-link | 713 + www.cairn.info | link-loop | 345 + www.cambridge.org | | 4128 + www.cambridge.org | no-pdf-link | 1531 + www.cambridge.org | success | 1441 + www.cambridge.org | link-loop | 971 + www.cureus.com | no-pdf-link | 526 * + www.cureus.com | | 526 + www.dbpia.co.kr | | 637 + www.dbpia.co.kr | redirect-loop | 631 + www.deboni.he.com.br | | 382 + www.deboni.he.com.br | success | 381 + www.degruyter.com | | 17783 + www.degruyter.com | no-pdf-link | 15102 + www.degruyter.com | success | 2584 + www.dovepress.com | | 480 + www.dovepress.com | success | 472 + www.e-manuscripta.ch | | 1350 + www.e-manuscripta.ch | no-pdf-link | 1350 * + www.e-periodica.ch | | 1276 + www.e-periodica.ch | no-pdf-link | 1275 + www.e-rara.ch | | 202 + www.e-rara.ch | no-pdf-link | 202 + www.elgaronline.com | | 495 + www.elgaronline.com | link-loop | 290 + www.elibrary.ru | | 922 + www.elibrary.ru | no-pdf-link | 904 + www.emerald.com | | 2155 + www.emerald.com | no-pdf-link | 1936 * + www.emerald.com | success | 219 + www.eurekaselect.com | | 518 + www.eurekaselect.com | no-pdf-link | 516 * + www.frontiersin.org | | 4163 + www.frontiersin.org | no-pdf-link | 4162 ** + www.hanser-elibrary.com | | 444 + www.hanser-elibrary.com | blocked-cookie | 444 + www.hanspub.org | | 334 + www.hanspub.org | no-pdf-link | 314 + www.idunn.no | | 1736 + www.idunn.no 
| link-loop | 596 + www.idunn.no | success | 577 + www.idunn.no | no-pdf-link | 539 + www.igi-global.com | terminal-bad-status | 458 + www.igi-global.com | | 458 + www.ijcai.org | | 533 + www.ijcai.org | success | 532 + www.ijraset.com | success | 385 + www.ijraset.com | | 385 + www.inderscience.com | | 712 + www.inderscience.com | no-pdf-link | 605 * + www.ingentaconnect.com | | 456 + www.ingentaconnect.com | no-pdf-link | 413 * + www.internationaljournalssrg.org | | 305 + www.internationaljournalssrg.org | no-pdf-link | 305 * + www.isca-speech.org | | 2392 + www.isca-speech.org | no-pdf-link | 2391 ** + www.journals.uchicago.edu | | 228 + www.journals.uchicago.edu | blocked-cookie | 227 + www.jstage.jst.go.jp | | 1492 + www.jstage.jst.go.jp | success | 1185 + www.jstage.jst.go.jp | no-pdf-link | 289 + www.jstor.org | | 301 + www.jurology.com | | 887 + www.jurology.com | redirect-loop | 887 + www.karger.com | | 318 + www.liebertpub.com | | 507 + www.liebertpub.com | blocked-cookie | 496 + www.morressier.com | | 4781 + www.morressier.com | no-pdf-link | 4655 ** + www.ncl.ecu.edu | | 413 + www.ncl.ecu.edu | success | 413 + www.nomos-elibrary.de | | 526 + www.nomos-elibrary.de | no-pdf-link | 391 + www.oecd-ilibrary.org | no-pdf-link | 1170 ** + www.oecd-ilibrary.org | | 1170 + www.openagrar.de | no-pdf-link | 221 + www.openagrar.de | | 221 + www.osapublishing.org | | 900 + www.osapublishing.org | link-loop | 615 + www.osapublishing.org | no-pdf-link | 269 + www.osti.gov | | 630 + www.osti.gov | link-loop | 573 + www.oxfordlawtrove.com | no-pdf-link | 476 * + www.oxfordlawtrove.com | | 476 + www.pdcnet.org | | 298 + www.pdcnet.org | terminal-bad-status | 262 + www.pedocs.de | | 203 + www.pnas.org | | 222 + www.preprints.org | | 372 + www.preprints.org | success | 366 + www.repository.cam.ac.uk | | 801 + www.repository.cam.ac.uk | success | 359 + www.repository.cam.ac.uk | no-pdf-link | 239 + www.research-collection.ethz.ch | | 276 + www.research-collection.ethz.ch | 
terminal-bad-status | 274 + www.revistas.usp.br | | 207 + www.revistas.usp.br | success | 204 + www.rina.org.uk | no-pdf-link | 1009 ** + www.rina.org.uk | | 1009 + www.schweizerbart.de | no-pdf-link | 202 + www.schweizerbart.de | | 202 + www.scielo.br | | 544 + www.scielo.br | redirect-loop | 526 + www.sciencedirect.com | | 3901 + www.sciencedirect.com | no-pdf-link | 3127 ** + www.sciencedirect.com | link-loop | 701 + www.sciendo.com | | 384 + www.sciendo.com | success | 363 + www.sciengine.com | | 225 + www.scirp.org | | 209 + www.spandidos-publications.com | | 205 + www.tandfonline.com | | 8925 + www.tandfonline.com | blocked-cookie | 8099 + www.tandfonline.com | terminal-bad-status | 477 + www.tandfonline.com | redirect-loop | 322 + www.taylorfrancis.com | | 6119 + www.taylorfrancis.com | no-pdf-link | 3567 + www.taylorfrancis.com | link-loop | 2169 + www.taylorfrancis.com | terminal-bad-status | 353 + www.thieme-connect.de | | 1047 + www.thieme-connect.de | redirect-loop | 472 + www.thieme-connect.de | spn2-error:job-failed | 343 + www.tib.eu | | 206 + www.trp.org.in | | 311 + www.trp.org.in | success | 311 + www.un-ilibrary.org | no-pdf-link | 597 * + www.un-ilibrary.org | | 597 + www.vr-elibrary.de | | 775 + www.vr-elibrary.de | blocked-cookie | 774 + www.wjgnet.com | | 204 + www.wjgnet.com | no-pdf-link | 204 + www.worldscientific.com | | 974 + www.worldscientific.com | blocked-cookie | 971 + www.worldwidejournals.com | | 242 + www.worldwidejournals.com | no-pdf-link | 203 + www.wto-ilibrary.org | no-pdf-link | 295 + www.wto-ilibrary.org | | 295 + www.zora.uzh.ch | | 222 + zenodo.org | | 49460 + zenodo.org | no-pdf-link | 39721 + zenodo.org | success | 8954 + zenodo.org | wrong-mimetype | 562 + | | 445919 + | no-pdf-link | 168035 + | success | 140875 + | gateway-timeout | 31809 + | blocked-cookie | 26431 + | terminal-bad-status | 25625 + | link-loop | 19006 + | spn2-error:job-failed | 13962 + | redirect-loop | 12512 + | wrong-mimetype | 2302 + | spn2-error 
| 1689 + | too-many-redirects | 1203 + | bad-redirect | 732 + | cdx-error | 539 + | not-found | 420 + | spn2-error:no-status | 256 + (419 rows) + +Get random subsets by terminal domain: + + \x auto + SELECT + ingest_request.link_source_id AS link_source_id, + ingest_request.base_url as base_url , + ingest_file_result.terminal_url as terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%//DOMAIN/%' + ORDER BY random() + LIMIT 5; + +## acervus.unicamp.br + +Previously flagged as messy (2021-05_daily_improvements.md) + +## cas.columbia.edu + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-2ety-qm51 +base_url | https://doi.org/10.7916/d8-2ety-qm51 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-0zf6-d167 +base_url | https://doi.org/10.7916/d8-0zf6-d167 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-k6ha-sn43 +base_url | https://doi.org/10.7916/d8-k6ha-sn43 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ 
RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-bj6t-eb07 +base_url | https://doi.org/10.7916/d8-bj6t-eb07 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.7916/d8-xjac-j502 +base_url | https://doi.org/10.7916/d8-xjac-j502 +terminal_url | https://cas.columbia.edu/cas/login?TARGET=https%3A%2F%2Fdlc.library.columbia.edu%2Fusers%2Fauth%2Fsaml%2Fcallback + +these are not public (loginwalls) + +DONE: '/login?TARGET=' as a login wall pattern + +## doi.ala.org.au + +Previously flagged as dataset repository; datacite metadata is wrong. (2021-05_daily_improvements.md) + +NOTE: look at ingesting datasets + +## www.isca-speech.org + +-[ RECORD 1 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2014-84 +base_url | https://doi.org/10.21437/interspeech.2014-84 +terminal_url | https://www.isca-speech.org/archive/interspeech_2014/li14b_interspeech.html +-[ RECORD 2 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2004-319 +base_url | https://doi.org/10.21437/interspeech.2004-319 +terminal_url | https://www.isca-speech.org/archive/interspeech_2004/delcroix04_interspeech.html +-[ RECORD 3 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2006-372 +base_url | https://doi.org/10.21437/interspeech.2006-372 +terminal_url | https://www.isca-speech.org/archive/interspeech_2006/lei06c_interspeech.html +-[ RECORD 4 ]--+---------------------------------------------------------------------------------- +link_source_id | 
10.21437/interspeech.2015-588 +base_url | https://doi.org/10.21437/interspeech.2015-588 +terminal_url | https://www.isca-speech.org/archive/interspeech_2015/polzehl15b_interspeech.html +-[ RECORD 5 ]--+---------------------------------------------------------------------------------- +link_source_id | 10.21437/interspeech.2006-468 +base_url | https://doi.org/10.21437/interspeech.2006-468 +terminal_url | https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html + +Bespoke site. Added rule to sandcrawler. + +NOTE: re-ingest/recrawl all isca-speech.org no-pdf-link terminal URLs (fatcat-ingest?) + +## www.morressier.com + + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0002858v +base_url | https://doi.org/10.1115/1.0002858v +terminal_url | https://www.morressier.com/article/development-new-single-highdensity-heatflux-gauges-unsteady-heat-transfer-measurements-rotating-transonic-turbine/60f162805d86378f03b49af5 +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0003896v +base_url | https://doi.org/10.1115/1.0003896v +terminal_url | https://www.morressier.com/article/experimental-investigation-proton-exchange-membrane-fuel-cell-platinum-nafion-along-inplane-direction/60f16d555d86378f03b50038 +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0004476v +base_url | https://doi.org/10.1115/1.0004476v +terminal_url | https://www.morressier.com/article/effect-air-release-agents-performance-results-fabric-lined-bushings/60f16d585d86378f03b502d5 +-[ 
RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0001286v +base_url | https://doi.org/10.1115/1.0001286v +terminal_url | https://www.morressier.com/article/development-verification-modelling-practice-cfd-calculations-obtain-current-loads-fpso/60f15d3fe537565438d70ece +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.1115/1.0000315v +base_url | https://doi.org/10.1115/1.0000315v +terminal_url | https://www.morressier.com/article/fire-event-analysis-fire-frequency-estimation-japanese-nuclear-power-plant/60f15a6f5d86378f03b43874 + +Many of these seem to be presentations, as both video and slides. PDFs seem broken though. + +NOTE: add to list of interesting rich media to crawl/preserve (video+slides+data) + +## www.oecd-ilibrary.org + +Paywall (2021-05_daily_improvements.md) + +## www.rina.org.uk + +-[ RECORD 1 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.ws.2002.10 +base_url | https://doi.org/10.3940/rina.ws.2002.10 +terminal_url | https://www.rina.org.uk/showproducts.html?product=4116 +-[ RECORD 2 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.pass.2003.16 +base_url | https://doi.org/10.3940/rina.pass.2003.16 +terminal_url | https://www.rina.org.uk/showproducts.html?product=3566 +-[ RECORD 3 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.icsotin.2013.15 +base_url | https://doi.org/10.3940/rina.icsotin.2013.15 +terminal_url | https://www.rina.org.uk/showproducts.html?product=8017 +-[ RECORD 4 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.wfa.2010.23 +base_url | 
https://doi.org/10.3940/rina.wfa.2010.23 +terminal_url | https://www.rina.org.uk/showproducts.html?product=8177 +-[ RECORD 5 ]--+------------------------------------------------------- +link_source_id | 10.3940/rina.icsotin15.2015.01 +base_url | https://doi.org/10.3940/rina.icsotin15.2015.01 +terminal_url | https://www.rina.org.uk/showproducts.html?product=7883 + +Site is broken in some way + +## www.sciencedirect.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.jhlste.2021.100332 +base_url | https://doi.org/10.1016/j.jhlste.2021.100332 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S1473837621000332 +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.hazadv.2021.100006 +base_url | https://doi.org/10.1016/j.hazadv.2021.100006 +terminal_url | https://www.sciencedirect.com/science/article/pii/S2772416621000061/pdfft?md5=e51bfd495bb53073c7a379d25cb11a32&pid=1-s2.0-S2772416621000061-main.pdf +-[ RECORD 3 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/b978-0-12-822844-9.00009-8 +base_url | https://doi.org/10.1016/b978-0-12-822844-9.00009-8 +terminal_url | https://www.sciencedirect.com/science/article/pii/B9780128228449000098 +-[ RECORD 4 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/j.colcom.2021.100490 +base_url | https://doi.org/10.1016/j.colcom.2021.100490 +terminal_url | https://www.sciencedirect.com/science/article/abs/pii/S2215038221001308 +-[ RECORD 5 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1016/b978-0-323-85245-6.00012-6 +base_url | https://doi.org/10.1016/b978-0-323-85245-6.00012-6 +terminal_url | https://www.sciencedirect.com/science/article/pii/B9780323852456000126 + +These `no-pdf-link` ones seem to simply not be OA, which is expected for much of the +domain. + +## repository.dri.ie + + link_source_id | base_url | terminal_url +-----------------------+---------------------------------------+--------------------------------------------- + 10.7486/dri.t148v5941 | https://doi.org/10.7486/dri.t148v5941 | https://repository.dri.ie/catalog/t148v5941 + 10.7486/dri.2z119c98f | https://doi.org/10.7486/dri.2z119c98f | https://repository.dri.ie/catalog/2z119c98f + 10.7486/dri.qf8621102 | https://doi.org/10.7486/dri.qf8621102 | https://repository.dri.ie/catalog/qf8621102 + 10.7486/dri.js95m457t | https://doi.org/10.7486/dri.js95m457t | https://repository.dri.ie/catalog/js95m457t + 10.7486/dri.c534vb726 | https://doi.org/10.7486/dri.c534vb726 | https://repository.dri.ie/catalog/c534vb726 + +"Digital repository of Ireland" + +Historical scanned content. Bespoke site. Fixed. 
+ +NOTE: recrawl/retry this domain + +## www.frontiersin.org + +-[ RECORD 1 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/978-2-88971-147-5 +base_url | https://doi.org/10.3389/978-2-88971-147-5 +terminal_url | https://www.frontiersin.org/research-topics/9081/neuroimaging-approaches-to-the-study-of-tinnitus-and-hyperacusis +-[ RECORD 2 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fnins.2021.722592 +base_url | https://doi.org/10.3389/fnins.2021.722592 +terminal_url | https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full +-[ RECORD 3 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fcell.2021.683209 +base_url | https://doi.org/10.3389/fcell.2021.683209 +terminal_url | https://www.frontiersin.org/articles/10.3389/fcell.2021.683209/full +-[ RECORD 4 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fmicb.2021.692474 +base_url | https://doi.org/10.3389/fmicb.2021.692474 +terminal_url | https://www.frontiersin.org/articles/10.3389/fmicb.2021.692474/full +-[ RECORD 5 ]--+------------------------------------------------------------------------------------------------------------------ +link_source_id | 10.3389/fneur.2021.676527 +base_url | https://doi.org/10.3389/fneur.2021.676527 +terminal_url | https://www.frontiersin.org/articles/10.3389/fneur.2021.676527/full + +All the `/research-topics/` URLs are out of scope. + +NOTE: recrawl missing frontiersin.org articles for PDFs +NOTE: recrawl missing frontiersin.org articles for XML (?) 
+ +------- + +## direct.mit.edu + +Previously "not available" (2021-05_daily_improvements.md) + +## figshare.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15052236.v6 +base_url | https://doi.org/10.6084/m9.figshare.15052236.v6 +terminal_url | https://figshare.com/articles/software/RCL-tree_rar/15052236/6 +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.14907846.v5 +base_url | https://doi.org/10.6084/m9.figshare.14907846.v5 +terminal_url | https://figshare.com/articles/book/Conservation_of_Limestone_Ecosystems_of_Malaysia_Part_I_Acknowledgements_Methodology_Overview_of_limestone_outcrops_in_Malaysia_References_Detailed_information_on_limestone_outcrops_of_the_states_Johor_Negeri_Sembilan_Terengganu_Selangor_Pe/14907846/5 +-[ RECORD 3 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15157614.v1 +base_url | https://doi.org/10.6084/m9.figshare.15157614.v1 +terminal_url | https://figshare.com/articles/software/code_for_NN-A72265C/15157614/1 +-[ RECORD 4 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.15172926.v1 +base_url | https://doi.org/10.6084/m9.figshare.15172926.v1 +terminal_url | https://figshare.com/articles/preprint/History_of_the_internet/15172926/1 +-[ RECORD 5 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.6084/m9.figshare.16532574.v1 +base_url | https://doi.org/10.6084/m9.figshare.16532574.v1 +terminal_url | https://figshare.com/articles/media/Helen_McConnell_How_many_trees_do_you_think_you_have_planted_/16532574/1 + +NOTE: can determine from the redirect URL, I guess. This is helpful for ingest! +Could also potentially correct fatcat release_type using this info. + +We seem to be getting the ones we can (eg, papers) just fine + +## hkvalidate.perfdrive.com + +Should be skipping/bailing on this domain, but not for some reason. 
+ +-[ RECORD 1 ]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac05cc +base_url | https://doi.org/10.3847/1538-4357/ac05cc +terminal_url | https://hkvalidate.perfdrive.com/?ssa=1716a049-aeaa-4a89-8f82-bd733adaa2e7&ssb=43981203877&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05cc&ssi=0774dd12-8427-4e27-a2ac-759c8cc2ec0e&ssk=support@shieldsquare.com&ssm=07370915269044035109047683305266&ssn=e69c743cc3d66619f960f924b562160d637e8d7f1b0f-d3bb-44d4-b075ed&sso=75a8bd85-4a097fb40f99bfb9c97b0a4ca0a38fd6d79513a466e82cc7&ssp=92054607321628531005162856888275586&ssq=33809984098158010864140981653938424553916&ssr=MjA3LjI0MS4yMjUuMTM5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 2 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac0429 +base_url | https://doi.org/10.3847/1538-4357/ac0429 +terminal_url | https://hkvalidate.perfdrive.com/?ssa=12bca70d-0af4-4241-9c9b-384befd96a88&ssb=92559232428&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac0429&ssi=cff72ab0-8427-4acd-a0e7-db1b04cf7ce7&ssk=support@shieldsquare.com&ssm=27895673282814430105287068829605&ssn=9af36a8e10efd239c9367a2f31dde500f7455c4d5f45-bf11-4b99-ad29ea&sso=26bd22d2-b23e1bd9558f2fd9ed0768ef1acecb24715d1d463328a229&ssp=16502500621628222613162823304820671&ssq=11469693950387070477339503456478590533604&ssr=MjA3LjI0MS4yMjUuMTYw&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 3 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.1149/1945-7111/ac1a85 +base_url | https://doi.org/10.1149/1945-7111/ac1a85 +terminal_url | https://hkvalidate.perfdrive.com/?ssa=b0fef51a-0f44-476e-b951-3341bde6aa67&ssb=84929220393&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.1149%2F1945-7111%2Fac1a85&ssi=48c05577-8427-4421-acd3-735ca29a46e6&ssk=support@shieldsquare.com&ssm=81129482524077974103852241068134&ssn=cf6c261d2b20d518b2ebe57e40ffaec9ab4cd1955dcb-7877-4f5b-bc3b1e&sso=1d196cae-6850f1ed8143e460f2bfbb61a8ae15cfe6b53d3bcdc528ca&ssp=99289867941628195224162819241830491&ssq=16897595632212421273956322948987630170313&ssr=MjA3LjI0MS4yMjUuMjM2&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 4 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.35848/1882-0786/ac1b0d +base_url | https://doi.org/10.35848/1882-0786/ac1b0d +terminal_url | https://hkvalidate.perfdrive.com/?ssa=6debdd23-c46b-4b40-b73c-d5540f04454e&ssb=95627212532&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.35848%2F1882-0786%2Fac1b0d&ssi=78b34ff9-8427-4d07-a0db-78a3aa2c7332&ssk=support@shieldsquare.com&ssm=54055111549093989106852695053789&ssn=cb51949e15a02cb99a8d0b57c4d06327b72e8d5c87a8-d006-4ffa-939ffb&sso=1b7fd62d-8107746fe28fca252fd45ffa403937e272bf75b452b68d4a&ssp=77377533171628212164162820021422494&ssq=02679025218797637682252187852000657274192&ssr=MjA3LjI0MS4yMzMuMTIx&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= +-[ RECORD 5 
]--+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.3847/1538-4357/ac05ba +base_url | https://doi.org/10.3847/1538-4357/ac05ba +terminal_url | https://hkvalidate.perfdrive.com/?ssa=f127eb3d-6a05-459d-97f2-499715c04b13&ssb=06802230353&ssc=https%3A%2F%2Fiopscience.iop.org%2Farticle%2F10.3847%2F1538-4357%2Fac05ba&ssi=8d087719-8427-4046-91fb-5e96af401560&ssk=support@shieldsquare.com&ssm=21056861072205974105064006574997&ssn=d05a73cff6d9af57acd6e2c366e716176752e1164d39-b9a7-408c-837d11&sso=d3f38d1e-a562a19195042d7e471a5e4fab03b6ca16ff1711c7c61804&ssp=68781137401628744693162877909483738&ssq=79454859841502433261398415426689546750534&ssr=MjA3LjI0MS4yMzIuMTg5&sst=Mozilla/5.0%20(Windows%20NT%2010.0;%20Win64;%20x64)%20AppleWebKit/537.36%20(KHTML,%20like%20Gecko)%20Chrome/74.0.3729.169%20Safari/537.36&ssv=&ssw= + +Was failing to check against blocklist again at the end of attempts. + +Could retry all these to update status, but probably not worth it. 
+ +## jov.arvojournals.org + + link_source_id | base_url | terminal_url +-----------------------+---------------------------------------+------------------------------------------------------------- + 10.1167/jov.21.9.1933 | https://doi.org/10.1167/jov.21.9.1933 | https://jov.arvojournals.org/article.aspx?articleid=2777021 + 10.1167/jov.21.9.2910 | https://doi.org/10.1167/jov.21.9.2910 | https://jov.arvojournals.org/article.aspx?articleid=2777561 + 10.1167/jov.21.9.1895 | https://doi.org/10.1167/jov.21.9.1895 | https://jov.arvojournals.org/article.aspx?articleid=2777057 + 10.1167/jov.21.9.2662 | https://doi.org/10.1167/jov.21.9.2662 | https://jov.arvojournals.org/article.aspx?articleid=2777793 + 10.1167/jov.21.9.2246 | https://doi.org/10.1167/jov.21.9.2246 | https://jov.arvojournals.org/article.aspx?articleid=2777441 + +These seem to just not be published/available yet. + +But they also use watermark.silverchair.com + +NOTE: re-crawl (force-retry?) all non-recent papers with fatcat-ingest +NOTE: for watermark.silverchair.com terminal bad-status, re-crawl from initial URL (base_url) using heritrix + +## kiss.kstudy.com + +Previously unable to download (2021-05_daily_improvements.md) + +## open.library.ubc.ca + + link_source_id | base_url | terminal_url +--------------------+------------------------------------+---------------------------------------------------------------------------------- + 10.14288/1.0400664 | https://doi.org/10.14288/1.0400664 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400664 + 10.14288/1.0401189 | https://doi.org/10.14288/1.0401189 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401189 + 10.14288/1.0401487 | https://doi.org/10.14288/1.0401487 | https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487 + 10.14288/1.0400994 | https://doi.org/10.14288/1.0400994 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0400994 + 10.14288/1.0401312 | 
https://doi.org/10.14288/1.0401312 | https://open.library.ubc.ca/collections/bcnewspapers/nelsondaily/items/1.0401312 + +Historical newspapers, out of scope? + +Video content: +https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401487 + +Another video: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 + +NOTE: add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +NOTE: handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 + + +## panor.ru + + link_source_id | base_url | terminal_url +-------------------------+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 10.33920/med-14-2108-06 | https://doi.org/10.33920/med-14-2108-06 | https://panor.ru/articles/otsenka-dinamiki-pokazateley-morfofunktsionalnykh-kharakteristik-kozhi-upatsientov-s-spr-pod-vliyaniem-kompleksnoy-fototerapii/66351.html + 10.33920/nik-02-2105-01 | https://doi.org/10.33920/nik-02-2105-01 | https://panor.ru/articles/innovatsionnost-obrazovatelnykh-tekhnologiy-kak-istoricheski-oposredovannyy-fenomen/65995.html + 10.33920/pro-1-2101-10 | https://doi.org/10.33920/pro-1-2101-10 | https://panor.ru/articles/obespechenie-bezopasnosti-na-promyshlennykh-predpriyatiyakh-s-pomoshchyu-sredstv-individualnoy-zashchity/66299.html + 10.33920/sel-4-2008-04 | https://doi.org/10.33920/sel-4-2008-04 | https://panor.ru/articles/osobennosti-regulirovaniya-zemelnykh-otnosheniy-na-prigranichnykh-territoriyakh-rossiyskoy-federatsii/66541.html + 10.33920/pro-2-2104-03 | https://doi.org/10.33920/pro-2-2104-03 | https://panor.ru/articles/organizatsiya-samorazvivayushchegosya-proizvodstva-v-realnykh-usloviyakh/65054.html + +"The full version of the article is available only to subscribers of the journal" + +Paywall + +## 
peerj.com + +Previously: this is HTML of reviews (2021-05_daily_improvements.md) + +NOTE: Should be HTML ingest, possibly special case scope + +## publons.com + +Previously: this is HTML (2021-05_daily_improvements.md) + +NOTE: Should be HTML ingest, possibly special case scope (length of works) + +## stm.bookpi.org + + link_source_id | base_url | terminal_url +-----------------------------+---------------------------------------------+---------------------------------------------------- + 10.9734/bpi/nfmmr/v7/11547d | https://doi.org/10.9734/bpi/nfmmr/v7/11547d | https://stm.bookpi.org/NFMMR-V7/article/view/3231 + 10.9734/bpi/ecafs/v1/9773d | https://doi.org/10.9734/bpi/ecafs/v1/9773d | https://stm.bookpi.org/ECAFS-V1/article/view/3096 + 10.9734/bpi/mpebm/v5/3391f | https://doi.org/10.9734/bpi/mpebm/v5/3391f | https://stm.bookpi.org/MPEBM-V5/article/view/3330 + 10.9734/bpi/castr/v13/3282f | https://doi.org/10.9734/bpi/castr/v13/3282f | https://stm.bookpi.org/CASTR-V13/article/view/2810 + 10.9734/bpi/hmms/v13 | https://doi.org/10.9734/bpi/hmms/v13 | https://stm.bookpi.org/HMMS-V13/issue/view/274 + +These are... just abstracts of articles within a book? Weird. Maybe sketchy? 
DOIs via Crossref + +## www.cabi.org + + link_source_id | base_url | terminal_url +--------------------------+------------------------------------------+---------------------------------------------------- + 10.1079/dfb/20133414742 | https://doi.org/10.1079/dfb/20133414742 | https://www.cabi.org/cabreviews/review/20133414742 + 10.1079/dmpd/20056500471 | https://doi.org/10.1079/dmpd/20056500471 | https://www.cabi.org/cabreviews/review/20056500471 + 10.1079/dmpp/20056600544 | https://doi.org/10.1079/dmpp/20056600544 | https://www.cabi.org/cabreviews/review/20056600544 + 10.1079/dmpd/20056500117 | https://doi.org/10.1079/dmpd/20056500117 | https://www.cabi.org/cabreviews/review/20056500117 + 10.1079/dmpp20056600337 | https://doi.org/10.1079/dmpp20056600337 | https://www.cabi.org/cabreviews/review/20056600337 + +Reviews? but just abstracts? + +## www.cureus.com + +-[ RECORD 1 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17547 +base_url | https://doi.org/10.7759/cureus.17547 +terminal_url | https://www.cureus.com/articles/69542-tramadol-induced-jerks +-[ RECORD 2 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.16867 +base_url | https://doi.org/10.7759/cureus.16867 +terminal_url | https://www.cureus.com/articles/66793-advanced-squamous-cell-carcinoma-of-gall-bladder-masquerading-as-liver-abscess-with-review-of-literature-review-on-advanced-biliary-tract-cancer +-[ RECORD 3 
]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17425 +base_url | https://doi.org/10.7759/cureus.17425 +terminal_url | https://www.cureus.com/articles/67438-attitudes-and-knowledge-of-medical-students-towards-healthcare-for-lesbian-gay-bisexual-and-transgender-seniors-impact-of-a-case-based-discussion-with-facilitators-from-the-community +-[ RECORD 4 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.17313 +base_url | https://doi.org/10.7759/cureus.17313 +terminal_url | https://www.cureus.com/articles/67258-utilizing-google-trends-to-track-online-interest-in-elective-hand-surgery-during-the-covid-19-pandemic +-[ RECORD 5 ]--+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +link_source_id | 10.7759/cureus.16943 +base_url | https://doi.org/10.7759/cureus.16943 +terminal_url | https://www.cureus.com/articles/19364-small-bowel-obstruction-a-rare-presentation-of-the-inferior-pancreaticoduodenal-artery-pseudoaneurysm-bleed + +Ugh, stupid "email to get PDF". but ingest seems to work anyways? 
+ +NOTE: re-crawl/re-ingest all (eg, fatcat-ingest or similar) + +## www.e-manuscripta.ch + + link_source_id | base_url | terminal_url +------------------------------+----------------------------------------------+------------------------------------------------------------------- + 10.7891/e-manuscripta-114031 | https://doi.org/10.7891/e-manuscripta-114031 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114031 + 10.7891/e-manuscripta-112064 | https://doi.org/10.7891/e-manuscripta-112064 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112064 + 10.7891/e-manuscripta-112176 | https://doi.org/10.7891/e-manuscripta-112176 | https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176 + 10.7891/e-manuscripta-115200 | https://doi.org/10.7891/e-manuscripta-115200 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-115200 + 10.7891/e-manuscripta-114008 | https://doi.org/10.7891/e-manuscripta-114008 | https://www.e-manuscripta.ch/swa/doi/10.7891/e-manuscripta-114008 + +Historical docs, single pages, but do have full PDF downloads. 
+ +NOTE: re-ingest + +## www.inderscience.com + +Previously: paywall (2021-05_daily_improvements.md) + +## www.un-ilibrary.org + + link_source_id | base_url | terminal_url +----------------------------+--------------------------------------------+------------------------------------------------------------- + 10.18356/9789210550307 | https://doi.org/10.18356/9789210550307 | https://www.un-ilibrary.org/content/books/9789210550307 + 10.18356/9789210586719c011 | https://doi.org/10.18356/9789210586719c011 | https://www.un-ilibrary.org/content/books/9789210586719c011 + 10.18356/9789210058575c014 | https://doi.org/10.18356/9789210058575c014 | https://www.un-ilibrary.org/content/books/9789210058575c014 + 10.18356/9789210550307c020 | https://doi.org/10.18356/9789210550307c020 | https://www.un-ilibrary.org/content/books/9789210550307c020 + 10.18356/9789213631423c005 | https://doi.org/10.18356/9789213631423c005 | https://www.un-ilibrary.org/content/books/9789213631423c005 + +Books and chapters. Doesn't seem to have actual download ability? + +# Re-Ingest / Re-Crawl + +Using fatcat-ingest helper tool. 
+ +- www.isca-speech.org doi_prefix:10.21437 + doi:* doi_prefix:10.21437 in_ia:false + 9,233 + ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.21437' > /srv/fatcat/tasks/2021-09-03_ingest_isca.json + => Counter({'ingest_request': 9221, 'elasticsearch_release': 9221, 'estimate': 9221}) +- repository.dri.ie doi_prefix:10.7486 + doi:* in_ia:false doi_prefix:10.7486 + 56,532 + ./fatcat_ingest.py --allow-non-oa query 'doi:* doi_prefix:10.7486' > /srv/fatcat/tasks/2021-09-03_ingest_dri.json + => Counter({'ingest_request': 56532, 'elasticsearch_release': 56532, 'estimate': 56532}) +- *.arvojournals.org doi_prefix:10.1167 (force recrawl if no-pdf-link) + 25,598 + many are meeting abstracts + ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.1167 > /srv/fatcat/tasks/2021-09-03_ingest_arvo.json + => Counter({'ingest_request': 25598, 'elasticsearch_release': 25598, 'estimate': 25598}) +- www.cureus.com doi_prefix:10.7759 + 1,537 + ./fatcat_ingest.py --allow-non-oa query doi_prefix:10.7759 > /srv/fatcat/tasks/2021-09-03_ingest_cureus.json + => Counter({'ingest_request': 1535, 'elasticsearch_release': 1535, 'estimate': 1535}) +- www.e-manuscripta.ch doi_prefix:10.7891 10.7891/e-manuscripta + 110,945 + TODO: all are marked 'unpublished', but that is actually probably right? +- www.frontiersin.org doi_prefix:10.3389 (both PDF and XML!) + doi:* in_ia:false doi_prefix:10.3389 + 212,370 + doi:10.3389/conf.* => most seem to be just abstracts? how many like this? + container_id:kecnf6vtpngn7j2avgfpdyw5ym => "topics" (2.2k) + fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz + => 191k + but many might be components? 
this is actually kind of a mess + fatcat-cli search release 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' --index-json -n0 | jq '[.ident, .container_id, .doi] | @tsv' -r | rg -v 10.3389/conf | pv -l | gzip > frontiers_to_crawl.tsv.gz + => 19.2k + ./fatcat_ingest.py --allow-non-oa query 'doi:* in_ia:false doi_prefix:10.3389 !container_id:kecnf6vtpngn7j2avgfpdyw5ym !type:component stage:published' | rg -v 10.3389/conf > /srv/fatcat/tasks/2021-09-03_frontiers.json + +# Remaining Tasks / Domains (TODO) + +more complex crawling/content: +- add video link to alternative content demo ingest: https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0400764 +- watermark.silverchair.com: if terminal-bad-status, then do recrawl via heritrix with base_url +- www.morressier.com: interesting site for rich web crawling/preservation (video+slides+data) +- doi.ala.org.au: possible dataset ingest source +- peerj.com, at least reviews, should be HTML ingest? or are some PDF? +- publons.com should be HTML ingest, possibly special case for scope +- frontiersin.org: any 'component' releases with PDF file are probably a metadata bug + +other tasks: +- handle this related withdrawn notice? https://open.library.ubc.ca/cIRcle/collections/48630/items/1.0401512 +- push/deploy sandcrawler changes diff --git a/notes/ingest/2021-09-03_patch_crawl.md b/notes/ingest/2021-09-03_patch_crawl.md new file mode 100644 index 0000000..d36f427 --- /dev/null +++ b/notes/ingest/2021-09-03_patch_crawl.md @@ -0,0 +1,678 @@ + +Going to run a combined crawl for `no-capture`, `no-pdf-link` and similar URL +statuses. + +As a reminder, significant refactor of PDF URL extraction happened around +Oct/Nov 2020, so things not re-ingested since then should be retried. + +1. first bulk re-process `no-pdf-link` statuses from past OAI-PMH and OA DOI crawls +2. then heritrix crawl of old URLs from all sources (see status codes below) +3. 
bulk ingest specific sources and statuses (see below) + +Status codes to crawl, potentially split into separate batches: + + no-capture + IA errors + cdx-error + wayback-error + wayback-content-error + petabox-error + spn2-cdx-lookup-failure + gateway-timeout + +Then, bulk ingest from these sources matching the above patterns, in this order: + +- OA DOI (fatcat-ingest or fatcat-changelog source; will result in import) +- unpaywall (will result in import) +- OAI-PMH +- MAG + +Current combined domain skip list (SQL filter syntax), for which we don't want +to bother retrying: + + '%journals.sagepub.com%' + '%pubs.acs.org%' + '%ahajournals.org%' + '%www.journal.csj.jp%' + '%aip.scitation.org%' + '%academic.oup.com%' + '%tandfonline.com%' + '%://orcid.org/%' + '%://doaj.org/%' + '%://archive.org/%' + '%://web.archive.org/%' + '%://www.archive.org/%' + +## DOI Ingest Status (2021-09-08) + +Recently did some analysis of OAI-PMH overall status, so can re-do comparisons +there easily. What about overall DOI ingest? Would like counts so we can +compare before/after. 
+ + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + no-pdf-link | 10516478 + success | 5690862 + redirect-loop | 1827192 + no-capture | 1215179 + terminal-bad-status | 650104 + link-loop | 610251 + blocked-cookie | 353681 + gateway-timeout | 341319 + too-many-redirects | 307895 + forbidden | 306710 + spn2-cdx-lookup-failure | 282955 + not-found | 273667 + cdx-error | 269082 + skip-url-blocklist | 265689 + spn2-error | 87759 + wrong-mimetype | 68993 + spn2-error:too-many-redirects | 58064 + wayback-error | 54152 + spn2-wayback-error | 51752 + remote-server-error | 45683 + (20 rows) + +## `no-pdf-link` re-try bulk ingest + +Specifically for past OAI-PMH and OA DOI crawls. + +What are top terminal domains that would be retried? So that we can filter out +large ones we don't want to bother retrying. 
+ + SELECT domain, COUNT(domain) + FROM ( + SELECT + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND 
ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + ) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 40; + + domain | count + ---------------------------------------+-------- + ssl.fao.org | 862277 + www.e-periodica.ch | 828110 + zenodo.org | 686701 + plutof.ut.ee | 685440 + www.gbif.org | 669727 + dlc.library.columbia.edu | 536018 + figshare.com | 383181 + juser.fz-juelich.de | 351519 + statisticaldatasets.data-planet.com | 320415 + espace.library.uq.edu.au | 310767 + invenio.nusl.cz | 309731 + doi.pangaea.de | 306311 + igi.indrastra.com | 297872 + bib-pubdb1.desy.de | 273565 + t2r2.star.titech.ac.jp | 271907 + digi.ub.uni-heidelberg.de | 265519 + www.sciencedirect.com | 263847 + publikationen.bibliothek.kit.edu | 229960 + www.plate-archive.org | 209231 + www.degruyter.com | 189776 + spectradspace.lib.imperial.ac.uk:8443 | 187086 + hal.archives-ouvertes.fr | 185513 + open.library.ubc.ca | 172821 + lup.lub.lu.se | 170063 + books.openedition.org | 169501 + orbi.uliege.be | 161443 + freidok.uni-freiburg.de | 150310 + library.wur.nl | 124318 + digital.library.pitt.edu | 116406 + www.research.manchester.ac.uk | 115869 + www.bibliotecavirtualdeandalucia.es | 114527 + repository.tue.nl | 112157 + www.google.com | 111569 + easy.dans.knaw.nl | 109608 + springernature.figshare.com | 108597 + nbn-resolving.org | 107544 + scholarbank.nus.edu.sg | 107299 + bibliotecavirtualdefensa.es | 105501 + biblio.ugent.be | 100854 + ruj.uj.edu.pl | 99500 + (40 rows) + +For a number of these domains, we do not expect any PDFs to be found, but are +going to re-ingest anyways so they get marked as 'blocked-*' in result table: + +- 
ssl.fao.org +- plutof.ut.ee +- www.gbif.org + +But some we are just going to skip anyways, because there *could* be PDFs, but +probably *aren't*: + +- zenodo.org +- t2r2.star.titech.ac.jp +- www.google.com +- figshare.com +- springernature.figshare.com + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json'; + => COPY 18040676 + +Transform and start ingest: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.rows.json | pv -l | shuf 
> /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json + => 18.0M 0:06:45 [44.5k/s] + + cat /srv/sandcrawler/tasks/patch_ingest_request_2021-09-08.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + => DONE + +## Progress Check + +OAI-PMH query: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + 
AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 13258356 + no-pdf-link | 8685519 + no-capture | 4765663 + redirect-loop | 1557731 + terminal-bad-status | 803373 + link-loop | 453999 + wrong-mimetype | 440230 + null-body | 71457 + cdx-error | 18426 + | 15275 + petabox-error | 13408 + wayback-error | 11845 + blocked-cookie | 11580 + skip-url-blocklist | 7761 + wayback-content-error | 383 + spn2-cdx-lookup-failure | 362 + gateway-timeout | 320 + body-too-large | 207 + spn2-error:job-failed | 191 + redirects-exceeded | 120 + (20 rows) + +OAI-PMH compared to a couple weeks ago: + + 13258356-12872279 = +386,077 success + 8685519-9329602 = -644,083 no-pdf-link + 4765663-4696362 = +69,301 no-capture + 803373-660418 = +142,955 terminal-bad-status + +OA DOI ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------+--------- + no-pdf-link | 6693547 + success | 5979016 + skip-url-blocklist | 3080986 + no-capture | 1876914 + redirect-loop | 1872817 + terminal-bad-status | 656674 + link-loop | 624290 + blocked-cookie | 448001 + gateway-timeout | 351896 + 
too-many-redirects | 307895 + forbidden | 306710 + spn2-cdx-lookup-failure | 301312 + cdx-error | 279766 + not-found | 273667 + wrong-mimetype | 83289 + spn2-error | 76806 + spn2-error:too-many-redirects | 58064 + wayback-error | 54278 + spn2-wayback-error | 51768 + remote-server-error | 45683 + (20 rows) + +OA DOI changes: + + 5979016-5690862 = +288,154 success + 6693547-10516478 = -3,822,931 no-pdf-link (still many!) + 1876914-1215179 = +661,735 no-capture + 3080986-265689 = +2,815,297 skip-url-blocklist + +Overall about half a million new 'success', pretty good. over 750k new +no-capture for crawling. + +## Seedlist Dumps + +Note that this is just seedlists, not full ingest requests. + + COPY ( + SELECT ingest_file_result.terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + ) TO '/srv/sandcrawler/tasks/patch_2021-09-16_terminal_seedlist.txt'; + => 6,354,365 + +Then run the actual patch crawl! 
+ +## Ingest Requests for Bulk Retry (2022-01-06) + +Crawl has just about completed, so running another round of bulk ingest +requests, slightly updated to allow `https://doi.org/10*` in terminal URL: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.updated <= '2022-01-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + ) + AND ( + ingest_request.link_source = 'oai' + OR ( + ingest_request.link_source = 'doi' + AND ( + ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-changelog' + ) + ) + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + AND 
ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json'; + => 4,488,193 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-01-06.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => TIMEDOUT + => (probably due to re-assignment) + => DONE + +## Stats Again (just OAI-PMH) + +OAI-PMH query: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 
'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.base_url NOT LIKE '%www.kb.dk%' + AND ingest_request.base_url NOT LIKE '%kb-images.kb.dk%' + AND ingest_request.base_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_request.base_url NOT LIKE '%aggr.ukm.um.si%' + AND ingest_request.base_url NOT LIKE '%edoc.mpg.de%' + AND ingest_request.base_url NOT LIKE '%doaj.org%' + AND ingest_request.base_url NOT LIKE '%orcid.org%' + AND ingest_request.base_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + +On 2022-02-08: + + status | count + -----------------------+---------- + success | 13505143 + no-pdf-link | 8741007 + no-capture | 4429986 + redirect-loop | 1566611 + terminal-bad-status | 816162 + link-loop | 459006 + wrong-mimetype | 448983 + null-body | 71871 + cdx-error | 19055 + | 15275 + petabox-error | 11713 + blocked-cookie | 11664 + wayback-error | 8745 + skip-url-blocklist | 7828 + max-hops-exceeded | 2031 + wayback-content-error | 338 + body-too-large | 280 + spn2-error:job-failed | 191 + bad-redirect | 134 + redirects-exceeded | 120 + (20 rows) + + +On 2022-02-28, after bulk ingest completed: + + status | count + -----------------------+---------- + success | 14668123 + no-pdf-link | 8822460 + no-capture | 2987565 + redirect-loop | 1629015 + terminal-bad-status | 917851 + wrong-mimetype | 466512 + link-loop | 460941 + null-body | 71457 + cdx-error | 19636 + petabox-error | 16198 + | 15275 + blocked-cookie | 11885 + wayback-error | 8779 + skip-url-blocklist | 7838 + empty-blob | 5906 + max-hops-exceeded | 5563 + wayback-content-error | 355 + body-too-large | 329 + spn2-error:job-failed | 191 + bad-redirect | 137 + (20 rows) + + +Comparing to a couple months ago: + + 14668123-13258356 = +1,409,767 success + 
8822460-8685519 = + 136,941 no-pdf-link + 2987565-4765663 = -1,778,098 no-capture + 917851-803373 = + 114,478 terminal-bad-status + diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md new file mode 100644 index 0000000..786c3b2 --- /dev/null +++ b/notes/ingest/2021-12-13_datasets.md @@ -0,0 +1,504 @@ + +First round of production dataset ingest. Aiming to get one or two small +repositories entirely covered, and a few thousand datasets from all supported +platforms. + +Planning to run with sandcrawler in batch mode on `wbgrp-svc263`, expecting up +to a TByte of content locally (on spinning disk). For successful output, will +run through fatcat import; for a subset of unsuccessful, will start a small +heritrix crawl. + + +## Ingest Generation + +Summary: + + wc -l /srv/fatcat/tasks/ingest_dataset_*pilot.json + 2 /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + 1702 /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + 2975 /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + 10000 /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +All the below ingest requests were combined into a single large file: + + cat /srv/fatcat/tasks/ingest_dataset*pilot.json | shuf | pv -l | gzip > /srv/fatcat/tasks/ingest_dataset_combined.json.gz + # 24.7k 0:00:00 [91.9k/s] + +### Figshare + +- sample 10k datasets (not other types) +- want only "versioned" DOIs; use regex on DOI to ensure + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.6084 type:dataset' \ + | rg '10\.6084/m9\.figshare\.\d+.v\d+' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_figshare_pilot.json + # Counter({'estimate': 505968, 'ingest_request': 50000, 'elasticsearch_release': 50000}) + +### Zenodo + +- has DOIs (of course) +- want only "versioned" DOIs? how to skip? 
+- sample 10k + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.5281 type:dataset' \ + | rg '10\.5281/zenodo' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_zenodo_pilot.json + +### Goettingen Research Online + +- <https://data.goettingen-research-online.de/> +- Dataverse instance, not harvard-hosted +- ~1,400 datasets, ~10,500 files +- has DOIs +- `doi_prefix:10.25625`, then filter to only one slash + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query 'doi_prefix:10.25625 type:dataset' \ + | rg -v '10\.25625/[a-z0-9]+/[a-z0-9]' \ + | shuf \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_goettingen_pilot.json + # Counter({'ingest_request': 12739, 'elasticsearch_release': 12739, 'estimate': 12739}) # 1.7k 0:01:29 [ 19 /s] + +### Harvard Dataverse + +- main harvard dataverse instance, many "sub-dataverses" +- ~137,000 datasets, ~1,400,000 files +- 10k sample + + ./fatcat_ingest.py --limit 50000 --ingest-type dataset --allow-non-oa query 'doi_prefix:10.7910 type:dataset' \ + | rg '10\.7910/dvn/[a-z0-9]{6}' \ + | rg -v '10\.7910/dvn/[a-z0-9]{6}/[a-z0-9]' \ + | shuf -n10000 \ + | pv -l \ + > /srv/fatcat/tasks/ingest_dataset_dataverse_harvard_pilot.json + # Counter({'estimate': 660979, 'ingest_request': 50000, 'elasticsearch_release': 50000}) # 2.97k 0:03:26 [14.4 /s] + +Note that this was fewer than expected, but moving on anyways. + +### archive.org + +A couple hand-filtered items. 
+ +"CAT" dataset +- item: <https://archive.org/details/CAT_DATASET> +- fatcat release (for paper): `release_36vy7s5gtba67fmyxlmijpsaui` + +"The Representativeness of Automated Web Crawls as a Surrogate for Human Browsing" +- https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62 +- https://fatcat.wiki/release/7owybd2hrvdmdpm4zpo7hkn2pu (paper) + + + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/CAT_DATASET", + "release_stage": "published", + "fatcat": { + "release_ident": "36vy7s5gtba67fmyxlmijpsaui", + "work_ident": "ycqtbhnfmzamheq2amztiwbsri" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "36vy7s5gtba67fmyxlmijpsaui" + } + { + "ingest_type": "dataset", + "ingest_request_source": "savepapernow", + "base_url": "https://archive.org/details/academictorrents_5e9ef2b5531ce3b965681be6eccab1fbd114af62", + "release_stage": "published", + "fatcat": { + "release_ident": "7owybd2hrvdmdpm4zpo7hkn2pu", + "work_ident": "3xkz7iffwbdfhbwhnd73iu66cu" + }, + "ext_ids": {}, + "link_source": "spn", + "link_source_id": "7owybd2hrvdmdpm4zpo7hkn2pu" + } + + # paste and then Ctrl-D: + cat | jq . -c > /srv/fatcat/tasks/ingest_dataset_dataverse_archiveorg_pilot.json + + +## Ingest Command + +On `wbgrp-svc263`. + +In the current version of tool, `skip_cleanup_local_files=True` by default, so +files will stick around. + +Note that `--no-spn2` is passed, so we are expecting a lot of `no-capture` in the output. 
+ + + # first a small sample + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | head -n5 \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.ramp.json + + # ok, run the whole batch through + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | pv -l \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 - \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.json + +Got an error: + + internetarchive.exceptions.AuthenticationError: No access_key or secret_key set! Have you run `ia configure`? + +Did a hot patch to try to have the uploads happen under a session, with config from ENV, but didn't work: + + AttributeError: 'ArchiveSession' object has no attribute 'upload' + +Going to hack with config in homedir for now. + +Extract URLs for crawling: + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.base_url.txt + + cat /srv/sandcrawler/tasks/ingest_dataset_combined_results*.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist.manifest_terminal.txt + +### Exceptions Encountered + + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 193, in process + internetarchive.upload + [...] 
+ ConnectionResetError: [Errno 104] Connection reset by peer + urllib3.exceptions.ProtocolError + requests.exceptions.ConnectionError: (ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), 'https://s3.us.archive.org/zenodo.org-3275525/rhOverM_Asymptotic_GeometricUnits_CoM.h5') + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 375, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 130, in process + r.raise_for_status() + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/requests/models.py", line 953, in raise_for_status + raise HTTPError(http_error_msg, response=self) + requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://ndownloader.figshare.com/files/5474201 + +Downloads sometimes just slowly time out, e.g. after a day or more. + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 381, in process + archive_result = strategy_helper.process(dataset_meta) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_strategies.py", line 155, in process + file_meta = gen_file_metadata_path(local_path, allow_empty=True) + File "/srv/sandcrawler/src/python/sandcrawler/misc.py", line 89, in gen_file_metadata_path + mimetype = magic.Magic(mime=True).from_file(path) + File "/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/magic/__init__.py", line 
111, in from_file + with _real_open(filename): + FileNotFoundError: [Errno 2] No such file or directory: '/tmp/sandcrawler/figshare.com-7925396-v1/HG02070.dedup.realigned.recalibrated.hc.g.vcf.gz' + + + Traceback (most recent call last): + File "./ingest_tool.py", line 208, in <module> + main() + File "./ingest_tool.py", line 204, in main + args.func(args) + File "./ingest_tool.py", line 57, in run_requests + result = fileset_worker.process(request) + File "/srv/sandcrawler/src/python/sandcrawler/ingest_fileset.py", line 314, in process + dataset_meta = platform_helper.process_request(request, resource, html_biblio) + File "/srv/sandcrawler/src/python/sandcrawler/fileset_platforms.py", line 208, in process_request + obj_latest = obj["data"]["latestVersion"] + KeyError: 'latestVersion' + +Fixed the above, trying again: + + git log | head -n1 + # commit ffdc901fa067db55fe6cfeb8d0c3807d29df092c + + Wed Dec 15 21:57:42 UTC 2021 + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results4.json + +Zenodo seems really slow, let's try filtering those out: + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results5.json + # 3.76k 15:12:53 [68.7m/s] + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | rg -v 10.5281 \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results6.json + +## Fatcat Import + + wc -l ingest_dataset_combined_results*.json + 126 ingest_dataset_combined_results2.json + 153 
ingest_dataset_combined_results3.json + 275 ingest_dataset_combined_results4.json + 3762 ingest_dataset_combined_results5.json + 7736 ingest_dataset_combined_results6.json + 182 ingest_dataset_combined_results.json + 5 ingest_dataset_combined_results.ramp.json + 12239 total + + cat ingest_dataset_combined_results*.json \ + | rg '^\{' \ + | jq '[.request.fatcat.release_ident, . | tostring] | @tsv' -r \ + | sort \ + | uniq --check-chars 26 \ + | cut -f2 \ + | rg -v '\\\\' \ + | pv -l \ + > uniq_ingest_dataset_combined_results.json + # 9.48k 0:00:06 [1.54k/s] + + cat uniq_ingest_dataset_combined_results.json | jq .status -r | sort | uniq -c | sort -nr + 7941 no-capture + 374 platform-404 + 369 terminal-bad-status + 348 success-file + 172 success + 79 platform-scope + 77 error-platform-download + 47 empty-manifest + 27 platform-restricted + 20 too-many-files + 12 redirect-loop + 6 error-archiveorg-upload + 3 too-large-size + 3 mismatch + 1 no-platform-match + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success") | .' -c \ + > uniq_ingest_dataset_combined_results.success.json + + cat uniq_ingest_dataset_combined_results.json \ + | rg '"success' \ + | jq 'select(.status == "success-file") | .' -c \ + > uniq_ingest_dataset_combined_results.success-file.json + +On fatcat QA instance: + + git log | head -n1 + # commit cca680e2cc4768a4d45e199f6256a433b25b4075 + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /tmp/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-ingest-type': 10, 'insert': 0, 'update': 0, 'exists': 0}) + +Need to update fatcat file worker to support single-file filesets... was that the plan? 
+ + head /tmp/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-no-access-url': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + # Counter({'total': 10, 'insert': 10, 'skip': 0, 'update': 0, 'exists': 0}) + +Trying again 2022-03-23: + + git log | head -n1 + # commit 134cb050988be2c545af89e0a67c4998307bb819 + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'skip': 10, 'skip-single-file': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'skip': 10, 'skip-status': 10, 'insert': 0, 'update': 0, 'exists': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 10, 'exists': 10, 'skip': 0, 'insert': 0, 'update': 0}) + + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'skip': 20, 'skip-release-has-fileset': 20, 'exists': 10, 'insert': 0, 'update': 0}) + + head -n200 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 172, 'skip': 162, 'skip-release-has-fileset': 162, 'exists': 10, 'insert': 0, 'update': 0}) + + head /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success-file.json \ + | ./fatcat_import.py ingest-fileset-file-results - + # Counter({'total': 10, 'insert': 8, 'skip': 2, 'skip-bad-hashes': 2, 'update': 0, 'exists': 0}) + +Fixed a small logic error in insert path. 
+ + head -n30 /srv/fatcat/datasets/uniq_ingest_dataset_combined_results.success.json \ + | ./fatcat_import.py ingest-fileset-results - + # Counter({'total': 30, 'insert': 20, 'exists': 10, 'skip': 0, 'update': 0}) + +archive.org datasets are *not* getting uploaded with the correct path: path +directory prefixes are getting clobbered. + +## Summary + +As follow-up, it may be worth doing another manual round of ingest requests. +After that, would be good to fill in "glue" code so that this can be done with +kafka workers, and do re-tries/dumps using sandcrawler SQL database. Then can +start scaling up more ingest, using ingest tool, "bulk mode" processing, +heritrix crawls from `no-capture` dumps, etc, similar to bulk file ingest +process. + +For scaling, let's do a "full" ingest request generation of all datasets, and +crawl the base URL with heritrix, in fast/direct mode. Expect this to be tens +of millions of mostly DOIs (doi.org URLs), which should crawl quickly. + +Then, do bulk downloading with ingest worker, perhaps on misc-vm or aitio, +uploading large datasets to archive.org, but not doing SPN web requests. Feed +the resulting huge file seedlist into a heritrix crawl to download web files. + +Will need to add support for more specific platforms. + + +### Huge Bulk Ingest Prep + +On prod instance: + + ./fatcat_ingest.py --ingest-type dataset --allow-non-oa query type:dataset \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_dataset_bulk.2022-01-05.json.gz + # Expecting 11264787 release objects in search queries + # TIMEOUT ERROR + # 6.07M 19:13:02 [87.7 /s] (partial) + +As follow-up, should do a full batch (not partial). For now search index is too +unreliable (read timeouts). + + zcat ingest_dataset_bulk.2022-01-05.partial.json.gz \ + | jq .base_url -r \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > ingest_dataset_bulk.2022-01-05.partial.schedule + +## Retries (2022-01-12) + +This is after having done a bunch of crawling. 
+ + cat ingest_dataset_combined_results6.json \ + | rg '"no-capture"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request -c \ + | pv -l \ + > ingest_dataset_retry.json + => 6.51k 0:00:01 [3.55k/s] + + cat /srv/sandcrawler/tasks/ingest_dataset_retry.json \ + | parallel -j4 --linebuffer --round-robin --pipe ./ingest_tool.py requests --no-spn2 --enable-sentry - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json + +## Retries (2022-02) + +Finally got things to complete end to end for this batch! + + cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr + 3220 terminal-bad-status + 2120 no-capture + 380 empty-manifest + 264 success-file + 251 success + 126 success-existing + 39 mismatch + 28 error-platform-download + 24 too-many-files + 20 platform-scope + 13 platform-restricted + 13 mismatch-size + 6 too-large-size + 3 transfer-encoding-error + 2 no-platform-match + 2 error-archiveorg-upload + 1 redirect-loop + 1 empty-blob + +Some more URLs to crawl: + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt + # 1.00 + # just a single DOI that failed to crawl, for whatever reason + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt + +These are ready to crawl, in the existing dataset crawl. + + cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule + +## Running Uploads Again + +Looks like the temporary download files got wiped on `wbgrp-svc263`. 
This is a +big bummer! Will need to download many of these over again. + + # sandcrawler git: c69a8dadb0426fec10fe38474c2f37ceaebdf316 + # skip_cleanup_local_files=True is still default + + zcat /srv/sandcrawler/tasks/ingest_dataset_combined.json.gz \ + | shuf \ + | parallel -j8 --linebuffer --round-robin --pipe ./ingest_tool.py --enable-sentry requests --no-spn2 - \ + | pv -l \ + > /srv/sandcrawler/tasks/ingest_dataset_combined_results.2022-04-04.json + + # filter out zenodo, very slow: + # rg -v 10.5281 \ diff --git a/notes/ingest/2022-01-06_patch_crawl.md b/notes/ingest/2022-01-06_patch_crawl.md new file mode 100644 index 0000000..941519f --- /dev/null +++ b/notes/ingest/2022-01-06_patch_crawl.md @@ -0,0 +1,398 @@ + +Starting another paper fulltext patch crawl, targeting recent OA content which +has failed to ingest, and platforms (arxiv, etc). + +Specifically: + +- "daily" changelog ingest requests from all time, which failed with various status codes +- pdf no-capture +- SPN errors +- terminal-bad-status with 5xx, 429 +- gateway-timeout +- html no-capture +- html-resource-no-capture + +Most of these are dumped in a single complex query (below). + +TODO: html-resource-no-capture (from error message? or do SPN requests separately?) 
+ + +## Initial 'no-capture' Seedlist + +Dump terminal URLs (will do ingest requests later, using similar command): + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id 
NOT LIKE 'oai:quod.lib.umich.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND 
ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-01-12.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-01-12.txt'; + => COPY 6389683 + +TODO: filter out archive.org/www.archive.org + + cat patch_terminal_url.2022-01-12.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-01-12.uniq.txt + => 5.73M 0:00:47 [ 120k/s] + + # note: tweaks and re-ran the above after inspecting this output + cut -f3 -d/ patch_terminal_url.2022-01-12.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 799045 doi.org + 317557 linkinghub.elsevier.com + 211091 arxiv.org + 204334 iopscience.iop.org + 139758 dialnet.unirioja.es + 130331 www.scielo.br + 124626 www.persee.fr + 85764 digitalrepository.unm.edu + 83913 www.mdpi.com + 79662 www.degruyter.com + 75703 www.e-periodica.ch + 72206 dx.doi.org + 69068 escholarship.org + 67848 idus.us.es + 57907 zenodo.org + 56624 ir.opt.ac.cn + 54983 projecteuclid.org + 52226 rep.bntu.by + 48376 osf.io + 48009 pubs.rsc.org + 46947 publikationen.ub.uni-frankfurt.de + 45564 www.research-collection.ethz.ch + 45153 dk.um.si + 43313 www.ssoar.info + 40543 scholarworks.umt.edu + +TODO: cleanup ingest request table in sandcrawler-db: +- remove filtered OAI-PMH prefixes +- remove any invalid `base_url` (?) 
+ +## More Seedlist (2022-02-08) + + COPY ( + SELECT ingest_file_result.terminal_url + -- SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'oai' + OR ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' 
+ AND ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + -- ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-08.rows.json'; + ) TO '/srv/sandcrawler/tasks/patch_terminal_url.2022-02-08.txt'; + => COPY 444764 + + cat patch_terminal_url.2022-02-08.txt \ + | rg -v www.archive.org \ + | rg '://' \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | rg -i '^http' \ + | sort -u -S 4G \ + | pv -l \ + > patch_terminal_url.2022-02-08.uniq.txt + => 426k 0:00:04 [ 103k/s] + + cut -f3 -d/ patch_terminal_url.2022-02-08.uniq.txt | sort | uniq -c | sort -nr | head -n25 + 60123 www.degruyter.com + 59314 arxiv.org + 43674 zenodo.org + 17771 doi.org + 9501 linkinghub.elsevier.com + 9379 www.mdpi.com + 5691 opendata.uni-halle.de + 5578 scholarlypublishingcollective.org + 5451 era.library.ualberta.ca + 4982 www.cairn.info + 4306 www.taylorfrancis.com + 4189 papers.ssrn.com + 4157 apps.crossref.org + 4089 www.sciencedirect.com + 4033 mdpi-res.com + 3763 dlc.mpg.de + 3408 osf.io + 2603 www.frontiersin.org + 2594 watermark.silverchair.com + 2569 journals.lww.com + 1787 underline.io + 1680 archiviostorico.fondazione1563.it + 1658 www.jstage.jst.go.jp + 1611 cyberleninka.ru + 1535 www.schoeningh.de + + cat patch_terminal_url.2022-02-08.txt | awk '{print "F+ " $1}' > patch_terminal_url.2022-02-08.schedule + => Done + +Copied to crawler svc206 and added to frontier. + + +## Bulk Ingest Requests (2022-02-28) + +Note that we are skipping OAI-PMH here, because we just did a separate ingest +for those. + +This is going to dump many duplicate lines (same `base_url`, multiple +requests), but that is fine. Expecting something like 7 million rows. 
+ + COPY ( + -- SELECT ingest_file_result.terminal_url + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + AND ingest_file_result.updated <= '2022-02-08' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + -- ingest_request.link_source = 'oai' + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + OR ingest_request.link_source = 'pmc' + ) + + AND ingest_request.link_source_id NOT LIKE 'oai:kb.dk:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bdr.oai.bsb-muenchen.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hispana.mcu.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bnf.fr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:ukm.si:%' + AND ingest_request.link_source_id NOT LIKE 'oai:biodiversitylibrary.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hsp.org:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repec:%' + AND ingest_request.link_source_id NOT LIKE 'oai:n/a:%' + AND ingest_request.link_source_id NOT LIKE 'oai:quod.lib.umich.edu:%' + AND 
ingest_request.link_source_id NOT LIKE 'oai:americanae.aecid.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:www.irgrid.ac.cn:%' + AND ingest_request.link_source_id NOT LIKE 'oai:espace.library.uq.edu:%' + AND ingest_request.link_source_id NOT LIKE 'oai:edoc.mpg.de:%' + AND ingest_request.link_source_id NOT LIKE 'oai:bibliotecadigital.jcyl.es:%' + AND ingest_request.link_source_id NOT LIKE 'oai:repository.erciyes.edu.tr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:krm.or.kr:%' + AND ingest_request.link_source_id NOT LIKE 'oai:hypotheses.org:%' + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND 
ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json'; + # COPY 3053219 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json + => DONE + + cat /srv/sandcrawler/tasks/patch_ingest_request_2022-02-28.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + diff --git a/notes/ingest/2022-01-13_doi_crawl.md b/notes/ingest/2022-01-13_doi_crawl.md new file mode 100644 index 0000000..a6f08dd --- /dev/null +++ b/notes/ingest/2022-01-13_doi_crawl.md @@ -0,0 +1,248 @@ + +Could roll this in to current patch crawl instead of starting a new crawl from scratch. + +This file is misnamed; these are mostly non-DOI-specific small updates. + +## KBART "almost complete" experimentation + +Random 10 releases: + + cat missing_releases.json | shuf -n10 | jq .ident -r | awk '{print "https://fatcat.wiki/release/" $1}' + https://fatcat.wiki/release/suggmo4fnfaave64frttaqqoja - domain gone + https://fatcat.wiki/release/uw2dq2p3mzgolk4alze2smv7bi - DOAJ, then OJS PDF link. sandcrawler failed, fixed + https://fatcat.wiki/release/fjamhzxxdndq5dcariobxvxu3u - OJS; sandcrawler fix works + https://fatcat.wiki/release/z3ubnko5ifcnbhhlegc24kya2u - OJS; sandcrawler failed, fixed (separate pattern) + https://fatcat.wiki/release/pysc3w2cdbehvffbyca4aqex3i - DOAJ, OJS bilingual, failed with 'redirect-loop'. 
force re-crawl worked for one copy + https://fatcat.wiki/release/am2m5agvjrbvnkstke3o3xtney - not attempted previously (?), success + https://fatcat.wiki/release/4zer6m56zvh6fd3ukpypdu7ita - cover page of journal (not an article). via crossref + https://fatcat.wiki/release/6njc4rdaifbg5jye3bbfdhkbsu - OJS; success + https://fatcat.wiki/release/jnmip3z7xjfsdfeex4piveshvu - OJS; not crawled previously; success + https://fatcat.wiki/release/wjxxcknnpjgtnpbzhzge6rkndi - no-pdf-link, fixed + +Try some more! + + https://fatcat.wiki/release/ywidvbhtfbettmfj7giu2htbdm - not attempted, success + https://fatcat.wiki/release/ou2kqv5k3rbk7iowfohpitelfa - OJS, not attempted, success? + https://fatcat.wiki/release/gv2glplmofeqrlrvfs524v5qa4 - scirp.org; 'redirect-loop'; HTML/PDF/XML all available; then 'gateway-timeout' on retry + https://fatcat.wiki/release/5r5wruxyyrf6jneorux3negwpe - gavinpublishers.com; broken site + https://fatcat.wiki/release/qk4atst6svg4hb73jdwacjcacu - horyzonty.ignatianum.edu.pl; broken DOI + https://fatcat.wiki/release/mp5ec3ycrjauxeve4n4weq7kqm - old cert; OJS; success + https://fatcat.wiki/release/sqnovcsmizckjdlwg3hipxrfqm - not attempted, success + https://fatcat.wiki/release/42ruewjuvbblxgnek6fpj5lp5m - OJS URL, but domain broken + https://fatcat.wiki/release/crg6aiypx5enveldvmwy5judp4 - volume/cover (stub) + https://fatcat.wiki/release/jzih3vvxj5ctxk3tbzyn5kokha - success + + +## Seeds: fixed OJS URLs + +Made some recent changes to sandcrawler, should re-attempt OJS URLs, particularly from DOI or DOAJ, with pattern like: + +- `no-pdf-link` with terminal URL like `/article/view/` +- `redirect-loop` with terminal URL like `/article/view/` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 
'no-pdf-link' + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json'; + => COPY 326577 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.rows.json > /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json + cat /srv/sandcrawler/tasks/retry_ojs_nopdflink.2022-01-13.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Done/running. + + COPY ( + SELECT ingest_file_result.terminal_url + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ( + ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'link-loop' + ) + AND ( + ingest_file_result.terminal_url LIKE '%/article/view/%' + OR ingest_file_result.terminal_url LIKE '%/article/download/%' + ) + ) TO '/srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt'; + => COPY 342415 + + cat /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/retry_ojs_loop.2022-01-13.schedule + +Done/seeded. 
+ +## Seeds: scitemed.com + +Batch retry sandcrawler `no-pdf-link` with terminal URL like: `scitemed.com/article` + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_file_result.status = 'no-pdf-link' + AND ingest_file_result.terminal_url LIKE '%/article/view/%' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_scitemed.2022-01-13.rows.json'; + # SKIPPED + +Actually there are very few of these. + +## Seeds: non-OA paper DOIs + +There are many DOIs out there which are likely to be from small publishers, on +the web, and would ingest just fine (eg, in OJS). + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' --count + 30,938,106 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'preservation:none' --count + 6,664,347 + + fatcat-cli search release in_ia:false is_oa:false 'doi:*' release_type:article-journal 'container_id:*' '!publisher_type:big5' 'in_kbart:false' --count + 8,258,111 + +Do the 8 million first, then maybe try the 30.9 million later? Do sampling to +see how many are actually accessible? From experience with KBART generation, +many of these are likely to crawl successfully. 
+ + ./fatcat_ingest.py --ingest-type pdf --allow-non-oa query 'in_ia:false is_oa:false doi:* release_type:article-journal container_id:* !publisher_type:big5 in_kbart:false' \ + | pv -l \ + | gzip \ + > /srv/fatcat/tasks/ingest_nonoa_doi.json.gz + # re-running 2022-02-08 after this VM was upgraded + # Expecting 8321448 release objects in search queries + # DONE + +This is large enough that it will probably be a bulk ingest, and then probably +a follow-up crawl. + +## Seeds: HTML and XML links from HTML biblio + + kafkacat -C -b wbgrp-svc284.us.archive.org:9092 -t sandcrawler-prod.ingest-file-results -e \ + | pv -l \ + | rg '"(html|xml)_fulltext_url"' \ + | rg '"no-pdf-link"' \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.json.gz + + # cut this off at some point? gzip is terminated weird + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz | wc -l + # gzip: ingest_file_result_fulltext_urls.2022-01-13.json.gz: unexpected end of file + # 2,538,433 + +Prepare seedlists (to include in heritrix patch crawl): + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \ + | jq .html_biblio.xml_fulltext_url -r \ + | rg '://' \ + | sort -u -S 4G \ + | pv -l \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz + # 1.24M 0:01:35 [12.9k/s] + + zcat ingest_file_result_fulltext_urls.2022-01-13.json.gz \ + | jq .html_biblio.html_fulltext_url -r \ + | rg '://' \ + | sort -u -S 4G \ + | pv -l \ + | gzip \ + > ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz + # 549k 0:01:27 [6.31k/s] + + zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \ + | cut -f3 -d/ \ + | sort -S 4G \ + | uniq -c \ + | sort -nr \ + | head -n20 + + 534005 dlc.library.columbia.edu + 355319 www.degruyter.com + 196421 zenodo.org + 101450 serval.unil.ch + 100631 biblio.ugent.be + 47986 digi.ub.uni-heidelberg.de + 39187 www.emerald.com + 33195 www.cairn.info + 25703 boris.unibe.ch + 
19516 journals.openedition.org + 15911 academic.oup.com + 11091 repository.dl.itc.u-tokyo.ac.jp + 9847 oxfordworldsclassics.com + 9698 www.thieme-connect.de + 9552 www.idunn.no + 9265 www.zora.uzh.ch + 8030 www.scielo.br + 6543 www.hanspub.org + 6229 asmedigitalcollection.asme.org + 5651 brill.com + + zcat ingest_file_result_fulltext_urls.2022-01-13.xml_urls.txt.gz ingest_file_result_fulltext_urls.2022-01-13.html_urls.txt.gz \ + | awk '{print "F+ " $1}' \ + > ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + + wc -l ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + 1785901 ingest_file_result_fulltext_urls.2022-01-13.xml_and_html.schedule + +Added to `JOURNALS-PATCH-CRAWL-2022-01` + +## Seeds: most doi.org terminal non-success + +Unless it is a 404, should retry. + +TODO: generate this list + +## Non-OA DOI Bulk Ingest + +Had previously run: + + cat ingest_nonoa_doi.json.gz \ + | rg -v "doi.org/10.2139/" \ + | rg -v "doi.org/10.1021/" \ + | rg -v "doi.org/10.1121/" \ + | rg -v "doi.org/10.1515/" \ + | rg -v "doi.org/10.1093/" \ + | rg -v "europepmc.org" \ + | pv -l \ + | gzip \ + > nonoa_doi.filtered.ingests.json.gz + # 7.35M 0:01:13 [99.8k/s] + +Starting a bulk ingest of these on 2022-03-18, which is *before* the crawl has +entirely finished, but after almost all queues (domains) have been done for +several days. + + zcat nonoa_doi.filtered.ingests.json.gz \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Looks like many jstage `no-capture` status; these are still (slowly) crawling. 
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md new file mode 100644 index 0000000..9722459 --- /dev/null +++ b/notes/ingest/2022-03_doaj.md @@ -0,0 +1,278 @@ + +plan: +- usual setup and dump ingest requests +- filter ingest requests to targeted ccTLDs, and add those to crawl first + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz' + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz + # 9.08M 0:37:38 [4.02k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373}) + + +## Check Pre-Crawl Status + +2022-03-09, before the above load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 2919808 + html | wrong-scope | 1098998 + pdf | no-pdf-link | 481532 + pdf | redirect-loop | 429006 + html | success | 342501 + html | unknown-scope | 225390 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187762 + html | no-capture | 185418 + pdf | no-capture | 171273 + pdf | 
null-body | 129028 + html | null-body | 100296 + pdf | terminal-bad-status | 91551 + pdf | link-loop | 25447 + html | wrong-mimetype | 22640 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + pdf | wrong-mimetype | 7688 + xml | success | 6897 + html | petabox-error | 5529 + pdf | wayback-error | 2706 + xml | null-body | 2353 + pdf | | 2063 + pdf | wayback-content-error | 1349 + html | cdx-error | 1169 + pdf | cdx-error | 1130 + pdf | petabox-error | 679 + html | | 620 + pdf | empty-blob | 562 + html | blocked-cookie | 545 + (30 rows) + +After the above load: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3036457 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108132 + pdf | no-pdf-link | 485703 + pdf | redirect-loop | 436085 + html | success | 342594 + html | unknown-scope | 225412 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187999 + html | no-capture | 187310 + pdf | no-capture | 172033 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91799 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6897 + html | petabox-error | 5530 + pdf | wayback-error | 2707 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 771 + pdf | empty-blob | 562 + (30 rows) + +Dump ingest requests for crawling (or bulk ingest first?): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR 
ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json'; + => COPY 353819 + +Not that many! Guess the filters are important? 
+ + SELECT COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ); + => 3202164 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json + => 353k 0:00:16 [21.0k/s] + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Dump seeds again (for crawling): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE 
'%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json'; + # COPY 350661 + +And stats again: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3037059 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108476 + pdf | no-pdf-link | 485705 + pdf | redirect-loop | 436850 + html | success | 342762 + html | unknown-scope | 225412 + html | redirect-loop | 224683 + html | html-resource-no-capture | 188058 + html | no-capture | 185734 + pdf | no-capture | 170452 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91875 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19042 + html | terminal-bad-status | 13333 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6898 + html | petabox-error | 5535 + pdf | wayback-error | 2711 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 772 + html | blocked-cookie | 769 + (30 rows) + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json + +Create seedlist: + 
+ + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | jq -r .base_url \ + | sort -u -S 4G \ + > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt + +Sent off and added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will +re-ingest when that completes (a week or two?). + + +## Bulk Ingest + +After `TARGETED-ARTICLE-CRAWL-2022-03` wrap-up. + + # 2022-03-22 + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/notes/ingest/2022-03_oaipmh.md b/notes/ingest/2022-03_oaipmh.md new file mode 100644 index 0000000..d2a8d71 --- /dev/null +++ b/notes/ingest/2022-03_oaipmh.md @@ -0,0 +1,40 @@ + +Martin did a fresh scrape of many OAI-PMH endpoints, and we should ingest/crawl. + +Note that Martin excluded many Indonesian endpoints, will need to follow up on +those. + +## Prep + +Fetch metadata snapshot: + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01.ndj.zst + + wget https://archive.org/download/oai_pmh_partial_dump_2022_03_01/oai_pmh_partial_dump_2022_03_01_urls.txt.zst + +Pre-filter out a bunch of prefixes we won't crawl (out of scope, and large): + + zstdcat /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.ndj.zst \ + | rg -v 'oai:kb.dk:' \ + | rg -v 'oai:bdr.oai.bsb-muenchen.de:' \ + | rg -v 'oai:hispana.mcu.es:' \ + | rg -v 'oai:bnf.fr:' \ + | rg -v 'oai:ukm.si:' \ + | rg -v 'oai:biodiversitylibrary.org:' \ + | rg -v 'oai:hsp.org:' \ + | rg -v 'oai:repec:' \ + | rg -v 'oai:n/a:' \ + | rg -v 'oai:quod.lib.umich.edu:' \ + | rg -v 'oai:americanae.aecid.es:' \ + | rg -v 'oai:www.irgrid.ac.cn:' \ + | rg -v 'oai:espace.library.uq.edu:' \ + | rg -v 'oai:edoc.mpg.de:' \ + | rg -v 'oai:bibliotecadigital.jcyl.es:' \ + | rg -v 'oai:repository.erciyes.edu.tr:' \ + | rg -v 'oai:krm.or.kr:' \ + | ./scripts/oai2ingestrequest.py - \ + |
pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/oai-pmh/oai_pmh_partial_dump_2022_03_01.requests.json.gz + +These failed to transform in the expected way; a change in JSON schema from last time? diff --git a/notes/ingest/2022-04_targeted.md b/notes/ingest/2022-04_targeted.md new file mode 100644 index 0000000..23fd35f --- /dev/null +++ b/notes/ingest/2022-04_targeted.md @@ -0,0 +1,144 @@ + +Want to do a crawl similar to recent "patch" crawls, where we run heritrix +crawls to "fill in" missing (`no-capture`) and failed daily ingests (aka, +those requests coming from fatcat-changelog). + + export PATCHDATE=2022-04-20 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-04 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR
ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE '%doaj.org%' + AND ingest_file_result.terminal_url NOT LIKE '%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-04-20.rows.json'; + # COPY 
4842749 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v www.archive.org \ + | rg -v '://10\.' \ + | rg -v '://172\.' \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + # 4.75M 0:01:44 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 1515829 www.jstage.jst.go.jp + 1052953 doi.org + 241704 arxiv.org + 219543 www.sciencedirect.com + 178562 www.persee.fr + 84947 zenodo.org + 67397 www.mdpi.com + 65775 journals.lww.com + 58216 opg.optica.org + 50673 osf.io + 45776 www.degruyter.com + 36664 www.indianjournals.com + 35287 pubs.rsc.org + 33495 www.bmj.com + 33320 www.research-collection.ethz.ch + 29728 www.e-periodica.ch + 28338 iopscience.iop.org + 26364 www.cambridge.org + 23840 onlinelibrary.wiley.com + 23641 platform.almanhal.com + 22660 brill.com + 20288 www.osapublishing.org + 18561 cgscholar.com + 18539 doi.nrct.go.th + 15677 www.frontiersin.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + +TODO: starting with the "quarterly retry" script/query might make more sense? +TODO: are there any cases where we do a bulk ingest request, fail, and `terminal_url` is not set? 
+ +## Bulk Ingest Requests (post-crawl) + + cd /srv/sandcrawler/src/python + sudo su sandcrawler + pipenv run ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json + => 4.84M 0:03:14 [24.9k/s] + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => started 2022-05-11 diff --git a/notes/ingest/2022-04_unpaywall.md b/notes/ingest/2022-04_unpaywall.md new file mode 100644 index 0000000..bc78998 --- /dev/null +++ b/notes/ingest/2022-04_unpaywall.md @@ -0,0 +1,278 @@ + +New unpaywall snapshot from `2022-03-09`. + +This will probably be the last unpaywall crawl? Will switch to openalex in the +future, because we can automate that ingest process, and run it on our own +schedule. + + export SNAPSHOT=2022-03-09 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=UNPAYWALL-CRAWL-2022-04 + +## Download and Archive + + wget 'https://unpaywall-data-snapshots.s3.us-west-2.amazonaws.com/unpaywall_snapshot_2022-03-09T083001.jsonl.gz' + # 2022-04-09 22:31:43 (98.9 KB/s) - ‘unpaywall_snapshot_2022-03-09T083001.jsonl.gz’ saved [29470830470/29470830470] + + export SNAPSHOT=2022-03-09 + ia upload unpaywall_snapshot_$SNAPSHOT unpaywall_snapshot_$SNAPSHOT*.jsonl.gz -m title:"Unpaywall Metadata Snapshot ($SNAPSHOT)" -m collection:ia_biblio_metadata -m creator:creator -m date:$SNAPSHOT + + # if needed + scp unpaywall_snapshot_$SNAPSHOT*.jsonl.gz wbgrp-svc506.us.archive.org:/srv/sandcrawler/tasks + +## Transform and Load + + # in sandcrawler pipenv on sandcrawler1-vm (svc506) + cd /srv/sandcrawler/src/python + sudo su sandcrawler + pipenv shell + + zcat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT*.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > 
/srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json + # 34.9M 3:02:32 [3.19k/s] + + cat /srv/sandcrawler/tasks/unpaywall_snapshot_$SNAPSHOT.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + # 34.9M 5:23:15 [1.80k/s] + # Worker: Counter({'total': 34908779, 'insert-requests': 6129630, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 34908779, 'pushed': 34908779}) + +So about 6.1M new ingest request rows. + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- take "all time" instead of just this recent capture + -- AND date(ingest_request.created) > '2021-01-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2022-03-09.rows.json'; + => COPY 6025671 + + # transform + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json + # 6.03M 0:03:26 [29.1k/s] + + # enqueue for bulk processing + cat /srv/sandcrawler/tasks/unpaywall_noingest_$SNAPSHOT.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + +Only the recent bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2022-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3330232 + success | 2455102 + redirect-loop | 197117 + terminal-bad-status | 82618 + no-pdf-link | 33046 + blocked-cookie | 16078 + link-loop | 6745 + wrong-mimetype | 3416 + wayback-error | 1385 + empty-blob | 1142 + cdx-error | 820 + body-too-large | 292 + bad-gzip-encoding | 281 + wayback-content-error | 267 + | 253 + petabox-error | 215 + skip-url-blocklist | 185 + null-body | 179 + spn2-cdx-lookup-failure | 89 + gateway-timeout | 73 + (20 rows) + +After prior "TARGETED" crawl and bulk ingest finished: + + status | count + -------------------------+--------- + no-capture | 3330055 + success | 2455279 + redirect-loop | 197117 + terminal-bad-status | 82618 + no-pdf-link | 33046 + blocked-cookie | 16079 + link-loop | 6745 + wrong-mimetype | 3416 + wayback-error | 1385 + empty-blob | 1142 + cdx-error | 820 + body-too-large | 292 + bad-gzip-encoding | 281 + wayback-content-error | 267 + | 253 + petabox-error | 215 + skip-url-blocklist | 185 + null-body | 179 + spn2-cdx-lookup-failure | 89 + gateway-timeout | 73 + (20 rows) + +Almost no change, which makes sense because of the `ingest_request.created` +filter. 
+ + +## Dump Seedlist + +Dump rows for crawling: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + -- AND date(ingest_request.created) > '2022-04-01' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status LIKE 'spn2-%' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%.archive.org%' + AND ingest_request.base_url NOT LIKE '%://archive.org%' + AND ingest_request.base_url NOT LIKE '%://doi.org/10.48550/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%.archive.org%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org%' + ) t1 + ) TO 
'/srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json'; + => before ingest and arxiv.org DOI exclusion: COPY 3309091 + => COPY 3308914 + + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | pv -l > /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json + => 3.31M 0:02:22 [23.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt + + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.no_terminal_url.txt /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.terminal_url.txt | awk '{print "F+ " $1}' | shuf > /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule + + wc -l /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT* + 15 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.no_terminal_url.txt + 3308914 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.rows.json + 3028879 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.terminal_url.txt + 3038725 /srv/sandcrawler/tasks/unpaywall_seedlist_2022-03-09.url.txt + +Inject seedlist into crawler: + + scp /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/unpaywall_seedlist_$SNAPSHOT.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + +Top domains? 
+ + cat /srv/sandcrawler/tasks/unpaywall_seedlist_$SNAPSHOT.schedule | cut -f2 -d' ' | cut -f3 -d/ | sort -S 4G | uniq -c | sort -nr | head -n20 + 158497 www.scielo.br + 144732 onlinelibrary.wiley.com + 129349 www.researchsquare.com + 94923 hal.archives-ouvertes.fr + 69293 openresearchlibrary.org + 64584 www.cell.com + 60033 link.springer.com + 50528 www.degruyter.com + 49737 projecteuclid.org + 45841 www.jstage.jst.go.jp + 44819 www.mdpi.com + 44325 ieeexplore.ieee.org + 38091 dr.lib.iastate.edu + 31030 www.nature.com + 30300 discovery.ucl.ac.uk + 27692 ntrs.nasa.gov + 24215 orca.cardiff.ac.uk + 23653 www.frontiersin.org + 23474 pure.rug.nl + 22660 www.sciencedirect.com + + +## Post-Crawl bulk ingest + + # enqueue for bulk processing + cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_$SNAPSHOT.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # done: 2022-07-06 + +## Post-Crawl, Post-Ingest Stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2022-04-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 4784948 => +2,329,669 ~77% + redirect-loop | 485270 => + 288,153 ~10% + no-capture | 317598 => -3,012,457 + terminal-bad-status | 267853 => + 185,235 ~ 6% + no-pdf-link | 118303 => + 85,257 + blocked-cookie | 111373 => + 95,294 + skip-url-blocklist | 19368 + link-loop | 9091 + wrong-mimetype | 7163 + cdx-error | 2516 + empty-blob | 1961 + wayback-error | 1922 + body-too-large | 509 + petabox-error | 416 + wayback-content-error | 341 + bad-gzip-encoding | 281 + | 253 + null-body | 179 + spn2-cdx-lookup-failure | 89 
+ gateway-timeout | 73 + (20 rows) + +Groovy! diff --git a/notes/ingest/2022-07-15_ingest_fixes.md b/notes/ingest/2022-07-15_ingest_fixes.md new file mode 100644 index 0000000..ec31a7d --- /dev/null +++ b/notes/ingest/2022-07-15_ingest_fixes.md @@ -0,0 +1,831 @@ + +## HTML `html-resource-no-capture` Fixes + +Tracing down some `html-resource-no-capture` issues. Eg, `javascript:` resources causing errors. + +SQL query: + + select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' limit 100; + select * from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture' order by random() limit 100; + + select count(*) from ingest_file_result where ingest_type = 'html' and status = 'html-resource-no-capture'; + => 210,528 + +http://agroengineering.it/index.php/jae/article/view/568/609 +- old capture, from `20171017204935` +- missing .css file; seems like an actual case of missing content? +- TODO: re-crawl/re-ingest when CDX is old + +https://www.karger.com/Article/FullText/484130 +- missing: https://www.karger.com/WebMaterial/ShowThumbnail/895999?imgType=2 +- resource is live +- this was from DOI-LANDING crawl, no resources captured +- TODO: re-crawl + +https://www.mdpi.com/1996-1073/13/21/5563/htm +- missing: https://www.mdpi.com/1996-1073/13/21/5563/htm +- common crawl capture; no/few resources? 
+- TODO: re-crawl + +http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0100-736X2013000500011&lng=en&tlng=en +- missing: http://www.scielo.br/img/revistas/pvb/v33n5/a11tab01.jpg + not on live web +- old (2013) wide crawl +- TODO: re-crawl + +http://g3journal.org/lookup/doi/10.1534/g3.116.027730 +- missing: http://www.g3journal.org/sites/default/files/highwire/ggg/6/8/2553/embed/mml-math-4.gif +- old 2018 landing crawl (no resources) +- TODO: re-crawl + +https://www.frontiersin.org/articles/10.3389/fimmu.2020.576134/full +- "error_message": "revisit record missing URI and/or DT: warc:abc.net.au-news-20220328-130654/IA-FOC-abc.net.au-news-20220618135308-00003.warc.gz offset:768320762" +- specific URL: https://www.frontiersin.org/areas/articles/js/app?v=uC9Es8wJ9fbTy8Rj4KipiyIXvhx7XEVhCTHvIrM4ShA1 +- archiveteam crawl +- seems like a weird corner case. look at more 'frontiersin' articles, and re-crawl this page + +https://www.frontiersin.org/articles/10.3389/fonc.2020.01386/full +- WORKING + +https://doi.org/10.4000/trajectoires.2317 +- redirect: https://journals.openedition.org/trajectoires/2317 +- missing: "https://journals.openedition.org/trajectoires/Ce fichier n'existe pas" (note spaces) +- FIXED + +http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S1413-81232002000200008&lng=en&tlng=en +- WORKING + +https://f1000research.com/articles/9-571/v2 +- petabox-error on 'https://www.recaptcha.net/recaptcha/api.js' +- added recaptcha.net to blocklist +- still needs a re-crawl +- SPN capture, from 2020, but images were missing? 
+- re-capture has images (though JS still wonky) +- TODO: re-crawl with SPN2 + +http://bio.biologists.org/content/4/9/1163 +- DOI LANDING crawl, no sub-resources +- TODO: recrawl + +http://err.ersjournals.com/content/26/145/170039.full +- missing: http://err.ersjournals.com/sites/default/files/highwire/errev/26/145/170039/embed/graphic-5.gif + on live web +- 2017 targeted heritrix crawl +- TODO: recrawl + +http://www.dovepress.com/synthesis-characterization-and-antimicrobial-activity-of-an-ampicillin-peer-reviewed-article-IJN +- missing: https://www.dovepress.com/cr_data/article_fulltext/s61000/61143/img/IJN-61143-F02-Thumb.jpg +- recent archiveteam crawl +- TODO: recrawl + +http://journals.ed.ac.uk/lithicstudies/article/view/1444 +- missing: http://journals.ed.ac.uk/lithicstudies/article/download/1444/2078/6081 +- common crawl +- TODO: recrawl + +http://medisan.sld.cu/index.php/san/article/view/495 +- missing: http://ftp.scu.sld.cu/galen/medisan/logos/redib.jpg +- this single resource is legit missing + +seems like it probably isn't a bad idea to just re-crawl all of these with fresh SPNv2 requests + +request sources: +- fatcat-changelog (doi) +- fatcat-ingest (doi) +- doaj + + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'html' + AND ingest_file_result.status = 'html-resource-no-capture' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + ) + ) TO '/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json'; + => COPY 210749 + + ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json + +Try a sample of 300: + + shuf -n300
/srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Seeing a bunch of: + + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fphys.2020.00454/full","https://www.frontiersin.org/articles/10.3389/fphys.2020.00454/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"] + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fmicb.2019.02507/full","https://www.frontiersin.org/articles/10.3389/fmicb.2019.02507/full","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:937365431"] + ["doaj","wayback-content-error","https://www.mdpi.com/2218-1989/10/9/366","https://www.mdpi.com/2218-1989/10/9/366/htm","revisit record missing URI and/or DT: warc:foxnews.com-20220402-051934/IA-FOC-foxnews.com-20220712070651-00000.warc.gz offset:964129887"] + + "error_message": "revisit record missing URI and/or DT: warc:online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz offset:751923069", + + + ["doaj","wayback-content-error","https://www.frontiersin.org/article/10.3389/fnins.2020.00724/full","https://www.frontiersin.org/articles/10.3389/fnins.2020.00724/full","wayback payload sha1hex mismatch: 20220715222216 https://static.frontiersin.org/areas/articles/js/app?v=DfnFHSIgqDJBKQy2bbQ2S8vWyHe2dEMZ1Lg9o6vSS1g1"] + +These seem to be transfer encoding issues; fixed? 
+ + ["doaj","html-resource-no-capture","http://www.scielosp.org/scielo.php?script=sci_arttext&pid=S0021-25712013000400003&lng=en&tlng=en","https://scielosp.org/article/aiss/2013.v49n4/336-339/en/","HTML sub-resource not found: https://ssm.scielo.org/media/assets/css/scielo-print.css"] + +Full batch: + + # TODO: cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Not running the full batch for now, because these are almost all `wayback-content-error` issues. + + cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | wc -l + 114935 + + cat /srv/sandcrawler/tasks/retry_html_resourcenocapture.2022-07-15.json | rg -v frontiersin.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + + +## Redirect Loops + +Seems like there might have been a bug in how the ingest pipeline dealt with +multiple redirects (eg, 301 to 302 or vice-versa), due to how CDX lookups and +normalization were happening. + +This could be a really big deal because we have over 11 million such ingest +requests, and we may even have stopped crawling domains on the basis of redirect +looping. + + select * from ingest_file_result where ingest_type = 'pdf' and status = 'redirect-loop' limit 50; + +http://ieeexplore.ieee.org/iel7/7259950/7275573/07275755.pdf +- 'skip-url-blocklist' +- paywall on live web + +http://www.redjournal.org/article/S0360301616308276/pdf +- redirect to 'secure.jbs.elsevierhealth.com' +- ... but re-crawling with SPNv2 worked +- TODO: reingest this entire journal with SPNv2 + +http://www.jmirs.org/article/S1939865415001551/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL: success + +http://www.cell.com/article/S0006349510026147/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- TODO: try SPNv2?
+- RECRAWL: success + +http://infoscience.epfl.ch/record/256431/files/SPL_2018.pdf +- FIXED: success + +http://www.nature.com/articles/hdy1994143.pdf +- blocked-cookie (idp.nature.com / cookies_not_supported) +- RECRAWL: gateway-timeout + +http://www.thelancet.com/article/S0140673619327606/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL: success + +https://pure.mpg.de/pubman/item/item_2065970_2/component/file_2065971/Haase_2014.pdf +- FIXED: success + +http://hdl.handle.net/21.11116/0000-0001-B1A2-F +- FIXED: success + +http://repositorio.ufba.br/ri/bitstream/ri/6072/1/%2858%29v21n6a03.pdf +- FIXED: success + +http://www.jto.org/article/S1556086416329999/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL spn2: success + +http://www.jahonline.org/article/S1054139X16303020/pdf +- blocked-cookie (secure.jbs.elsevierhealth.com) +- RECRAWL spn2: success + +So, wow wow wow, a few things to do here: + +- just re-try all these redirect-loop attempts to update status +- re-ingest all these elsevierhealth blocked crawls with SPNv2. this could take a long time! + +Possibly the elsevierhealth stuff will require some deeper fiddling to crawl +correctly. 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'redirect-loop' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json'; + => COPY 6611342 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.rows.json > /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json + +Start with a sample: + + shuf -n200 /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Wow that is a lot of ingest! And a healthy fraction of 'success', almost all +via unpaywall (maybe should have done DOAJ/DOI only first). Let's do this full +batch: + + cat /srv/sandcrawler/tasks/retry_redirectloop.2022-07-15.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +TODO: repeat with broader query (eg, OAI-PMH, MAG, etc). + +## Other + +Revist resolution failed: \"Didn't get exact CDX url/datetime match. 
url:https://www.cairn.info/static/images//logo/logo-cairn-negatif.png dt:20220430145322 got:CdxRow(surt='info,cairn)/static/images/logo/logo-cairn-negatif.png', datetime='20220430145322', url='https://www.cairn.info/static/images/logo/logo-cairn-negatif.png', mimetype='image/png', status_code=200, sha1b32='Y3VQOPO2NFUR2EUWNXLYGYGNZPZLQYHU', sha1hex='c6eb073dda69691d12966dd78360cdcbf2b860f4', warc_csize=10875, warc_offset=2315284914, warc_path='archiveteam_archivebot_go_20220430212134_59230631/old.worldurbancampaign.org-inf-20220430-140628-acnq5-00000.warc.gz')\"" + + https://www.cairn.info/static/images//logo/logo-cairn-negatif.png 20220430145322 + https://www.cairn.info/static/images/logo/logo-cairn-negatif.png 20220430145322 + +Fixed! + + +## Broken WARC Record? + +cdx line: + + net,cloudfront,d1bxh8uas1mnw7)/assets/embed.js 20220716084026 https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js warc/revisit - U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB - - 660 751923069 online.wsj.com-home-page-20220324-211958/IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz + +download WARC and run: + + zcat IA-FOC-online.wsj.com-home-page-20220716075018-00001.warc.gz | rg d1bxh8uas1mnw7.cloudfront.net/assets/embed.js -a -C 20 + +the WARC record: + + WARC/1.0 + WARC-Type: revisit + WARC-Target-URI: https://d1bxh8uas1mnw7.cloudfront.net/assets/embed.js + WARC-Date: 2022-07-16T08:40:26Z + WARC-Payload-Digest: sha1:U5E5UA6DS5GGCHJ2IZSOIEGPN6P64JRB + WARC-IP-Address: 13.227.21.220 + WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest + WARC-Truncated: length + WARC-Record-ID: <urn:uuid:cc79139e-d43f-4b43-9b9e-f923610344d0> + Content-Type: application/http; msgtype=response + Content-Length: 493 + + HTTP/1.1 200 OK + Content-Type: application/javascript + Content-Length: 512 + Connection: close + Last-Modified: Fri, 22 Apr 2022 08:45:38 GMT + Accept-Ranges: bytes + Server: AmazonS3 + Date: Fri, 15 Jul 2022 16:36:08 GMT + ETag: 
"1c28db48d4012f0221b63224a3bb7137" + Vary: Accept-Encoding + X-Cache: Hit from cloudfront + Via: 1.1 5b475307685b5cecdd0df414286f5438.cloudfront.net (CloudFront) + X-Amz-Cf-Pop: SFO20-C1 + X-Amz-Cf-Id: SIRR_1LT8mkp3QVaiGYttPuomxyDfJ-vB6dh0Slg_qqyW0_WwnA1eg== + Age: 57859 + +where are the `WARC-Refers-To-Target-URI` and `WARC-Refers-To-Date` lines? + +## osf.io + + select status, terminal_status_code, count(*) from ingest_file_result where base_url LIKE 'https://doi.org/10.17605/osf.io/%' and ingest_type = 'pdf' group by status, terminal_status_code order by count(*) desc limit 30; + + status | terminal_status_code | count + -------------------------+----------------------+------- + terminal-bad-status | 404 | 92110 + no-pdf-link | 200 | 46932 + not-found | 200 | 20212 + no-capture | | 8599 + success | 200 | 7604 + redirect-loop | 301 | 2125 + terminal-bad-status | 503 | 1657 + cdx-error | | 1301 + wrong-mimetype | 200 | 901 + terminal-bad-status | 410 | 364 + read-timeout | | 167 + wayback-error | | 142 + gateway-timeout | | 139 + terminal-bad-status | 500 | 76 + spn2-error | | 63 + spn2-backoff | | 42 + petabox-error | | 39 + spn2-backoff | 200 | 27 + redirect-loop | 302 | 19 + terminal-bad-status | 400 | 15 + terminal-bad-status | 401 | 15 + remote-server-error | | 14 + timeout | | 11 + terminal-bad-status | | 11 + petabox-error | 200 | 10 + empty-blob | 200 | 8 + null-body | 200 | 6 + spn2-error:unknown | | 5 + redirect-loop | 308 | 4 + spn2-cdx-lookup-failure | | 4 + (30 rows) + +Many of these are now non-existant, or datasets/registrations not articles. +Hrm. 
+ + +## Large DOAJ no-pdf-link Domains + + SELECT + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain, + COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_request.base_url = ingest_file_result.base_url + WHERE + ingest_file_result.status = 'no-pdf-link' + AND ingest_request.link_source = 'doaj' + GROUP BY + domain + ORDER BY + COUNT(*) DESC + LIMIT 50; + + domain | count + -------------------------------------------------------+-------- + www.sciencedirect.com | 211090 + auth.openedition.org | 20741 + journal.frontiersin.org:80 | 11368 + journal.frontiersin.org | 6494 + ejde.math.txstate.edu | 4301 + www.arkat-usa.org | 4001 + www.scielo.br | 3736 + www.lcgdbzz.org | 2892 + revistas.uniandes.edu.co | 2715 + scielo.sld.cu | 2612 + www.egms.de | 2488 + journals.lww.com | 2415 + ter-arkhiv.ru | 2239 + www.kitlv-journals.nl | 2076 + www.degruyter.com | 2061 + jwcn-eurasipjournals.springeropen.com | 1929 + www.cjcnn.org | 1908 + www.aimspress.com | 1885 + vsp.spr-journal.ru | 1873 + dx.doi.org | 1648 + www.dlib.si | 1582 + aprendeenlinea.udea.edu.co | 1548 + www.math.u-szeged.hu | 1448 + dergipark.org.tr | 1444 + revistas.uexternado.edu.co | 1429 + learning-analytics.info | 1419 + drive.google.com | 1399 + www.scielo.cl | 1326 + www.economics-ejournal.org | 1267 + www.jssm.org | 1240 + html.rhhz.net | 1232 + journalofinequalitiesandapplications.springeropen.com | 1214 + revistamedicina.net | 1197 + filclass.ru | 1154 + ceramicayvidrio.revistas.csic.es | 1152 + gynecology.orscience.ru | 1126 + www.tobaccoinduceddiseases.org | 1090 + www.tandfonline.com | 1046 + www.querelles-net.de | 1038 + www.swjpcc.com | 1032 + microbiologyjournal.org | 1028 + revistas.usal.es | 1027 + www.medwave.cl | 1023 + ijtech.eng.ui.ac.id | 1023 + www.scielo.sa.cr | 1021 + vestnik.szd.si | 986 + www.biomedcentral.com:80 | 984 + scielo.isciii.es | 983 + bid.ub.edu | 970 + 
www.meirongtv.com | 959 + (50 rows) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ejde.math.txstate.edu%' limit 5; + http://ejde.math.txstate.edu/Volumes/2018/30/abstr.html + http://ejde.math.txstate.edu/Volumes/2012/137/abstr.html + http://ejde.math.txstate.edu/Volumes/2016/268/abstr.html + http://ejde.math.txstate.edu/Volumes/2015/194/abstr.html + http://ejde.math.txstate.edu/Volumes/2014/43/abstr.html + # plain HTML, not really parse-able + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.arkat-usa.org%' limit 5; + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0013.909 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0007.717 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.p008.158 + https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0014.216 + # fixed (embed PDF) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.scielo.br%' limit 5; + https://doi.org/10.5935/0034-7280.20200075 + https://doi.org/10.5935/0004-2749.20200071 + https://doi.org/10.5935/0034-7280.20200035 + http://www.scielo.br/scielo.php?script=sci_arttext&pid=S1516-44461999000400014 + https://doi.org/10.5935/0034-7280.20200047 + # need recrawls? 
+ # then success + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.lcgdbzz.org%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://revistas.uniandes.edu.co%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://scielo.sld.cu%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.egms.de%' limit 5; + https://doi.org/10.3205/16dgnc020 + http://nbn-resolving.de/urn:nbn:de:0183-19degam1126 + http://www.egms.de/en/meetings/dgpraec2019/19dgpraec032.shtml + http://www.egms.de/en/meetings/dkou2019/19dkou070.shtml + http://nbn-resolving.de/urn:nbn:de:0183-20nrwgu625 + # mostly abstracts, don't have PDF versions + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://ter-arkhiv.ru%' limit 5; + https://doi.org/10.26442/terarkh201890114-47 + https://doi.org/10.26442/00403660.2019.12.000206 + https://journals.eco-vector.com/0040-3660/article/download/32246/pdf + https://journals.eco-vector.com/0040-3660/article/download/33578/pdf + https://doi.org/10.26442/00403660.2019.12.000163 + # working, needed recrawls (some force re-crawls) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.kitlv-journals.nl%' limit 5; + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.cjcnn.org%' limit 5; + + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.dlib.si%' limit 5; + https://srl.si/ojs/srl/article/view/2910 + https://srl.si/ojs/srl/article/view/3640 + 
https://srl.si/ojs/srl/article/view/2746 + https://srl.si/ojs/srl/article/view/2557 + https://srl.si/ojs/srl/article/view/2583 + # fixed? (dlib.si) + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.jssm.org%' limit 5; + http://www.jssm.org/vol4/n4/8/v4n4-8text.php + http://www.jssm.org/vol7/n1/19/v7n1-19text.php + http://www.jssm.org/vol9/n3/10/v9n3-10text.php + http://www.jssm.org/abstresearcha.php?id=jssm-14-347.xml + http://www.jssm.org/vol7/n2/11/v7n2-11text.php + # works as an HTML document? otherwise hard to select on PDF link + + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://filclass.ru%' limit 5; + https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism + https://filclass.ru/en/archive/2015/42/training-as-an-effective-form-of-preparation-for-the-final-essay + https://filclass.ru/en/archive/2020/vol-25-3/didaktizatsiya-literatury-rossijskikh-nemtsev-zanyatie-po-poeme-viktora-klyajna-jungengesprach + https://filclass.ru/en/archive/2015/40/the-communicative-behaviour-of-the-russian-intelligentsia-and-its-reflection-in-reviews-as-a-genre-published-in-online-literary-journals-abroad + https://filclass.ru/en/archive/2016/46/discoursive-means-of-implication-of-instructive-components-within-the-anti-utopia-genre + # fixed + # TODO: XXX: re-crawl/ingest + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://microbiologyjournal.org%' limit 5; + https://microbiologyjournal.org/the-relationship-between-the-type-of-infection-and-antibiotic-resistance/ + https://microbiologyjournal.org/antimicrobial-resistant-shiga-toxin-producing-escherichia-coli-isolated-from-ready-to-eat-meat-products-and-fermented-milk-sold-in-the-formal-and-informal-sectors-in-harare-zimbabwe/ + 
https://microbiologyjournal.org/emerging-antibiotic-resistance-in-mycoplasma-microorganisms-designing-effective-and-novel-drugs-therapeutic-targets-current-knowledge-and-futuristic-prospects/ + https://microbiologyjournal.org/microbiological-and-physicochemicalpropertiesofraw-milkproduced-from-milking-to-delivery-to-milk-plant/ + https://microbiologyjournal.org/association-of-insulin-based-insulin-resistance-with-liver-biomarkers-in-type-2-diabetes-mellitus/ + # HTML article, no PDF + # ... but only sometimes + + select base_url from ingest_file_result where ingest_type = 'pdf' and status = 'no-pdf-link' and terminal_url like 'https://www.medwave.cl%' limit 5; + http://www.medwave.cl/link.cgi/Medwave/Perspectivas/Cartas/6878 + https://www.medwave.cl/link.cgi/Medwave/Revisiones/RevisionClinica/8037.act + http://dx.doi.org/10.5867/medwave.2012.03.5332 + https://www.medwave.cl/link.cgi/Medwave/Estudios/Casos/7683.act + http://www.medwave.cl/link.cgi/Medwave/Revisiones/CAT/5964 + # HTML article, no PDF + +Re-ingest HTML: + + https://fatcat.wiki/container/mafob4ewkzczviwipyul7knndu (DONE) + https://fatcat.wiki/container/6rgnsrp3rnexdoks3bxcmbleda (DONE) + +Re-ingest PDF: + + doi_prefix:10.5935 (DONE) + doi_prefix:10.26442 + +## More Scielo + +More scielo? `doi_prefix:10.5935 in_ia:false` + + http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873 + # OJS? 
fixed + + https://revistas.unicentro.br/index.php/repaa/article/view/2667/2240 + # working, but needed re-crawl + + http://www.rbcp.org.br/details/2804/piezoelectric-preservative-rhinoplasty--an-alternative-approach-for-treating-bifid-nose-in-tessier-no--0-facial-cleft + +A few others, mostly now working + +## Recent OA DOIs + + fatcat-cli search release 'is_oa:true (type:article-journal OR type:article OR type:paper-conference) !doi_prefix:10.5281 !doi_prefix:10.6084 !doi_prefix:10.48550 !doi_prefix:10.25446 !doi_prefix:10.25384 doi:* date:>2022-06-15 date:<2022-07-15 in_ia:false !publisher_type:big5' --index-json --limit 0 | pv -l > recent_missing_oa.json + + wc -l recent_missing_oa.json + 24433 + + cat recent_missing_oa.json | jq .doi_prefix -r | sort | uniq -c | sort -nr | head + 4968 10.3390 + 1261 10.1080 + 687 10.23668 + 663 10.1021 + 472 10.1088 + 468 10.4000 + 367 10.3917 + 357 10.1364 + 308 10.4230 + 303 10.17863 + + cat recent_missing_oa.json | jq .doi_registrar -r | sort | uniq -c | sort -nr + 19496 crossref + 4836 datacite + 101 null + + cat recent_missing_oa.json | jq .publisher_type -r | sort | uniq -c | sort -nr + 9575 longtail + 8419 null + 3861 society + 822 unipress + 449 oa + 448 scielo + 430 commercial + 400 repository + 22 other + 7 archive + + cat recent_missing_oa.json | jq .publisher -r | sort | uniq -c | sort -nr | head + 4871 MDPI AG + 1107 Informa UK (Taylor & Francis) + 665 EAG-Publikationen + 631 American Chemical Society + 451 IOP Publishing + 357 The Optical Society + 347 OpenEdition + 309 CAIRN + 308 Schloss Dagstuhl - Leibniz-Zentrum für Informatik + 303 Apollo - University of Cambridge Repository + + cat recent_missing_oa.json | jq .container_name -r | sort | uniq -c | sort -nr | head + 4908 null + 378 Sustainability + 327 ACS Omega + 289 Optics Express + 271 International Journal of Environmental Research and Public Health + 270 International Journal of Health Sciences + 238 Sensors + 223 International Journal of Molecular 
Sciences + 207 Molecules + 193 Proceedings of the National Academy of Sciences of the United States of America + + cat recent_missing_oa.json \ + | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \ + | wc -l + 16558 + + cat recent_missing_oa.json | rg -i mdpi | shuf -n10 | jq .doi -r + 10.3390/molecules27144419 + => was a 404 + => recrawl was successful + 10.3390/math10142398 + => was a 404 + 10.3390/smartcities5030039 + => was a 404 + +Huh, we need to re-try/re-crawl MDPI URLs every week or so? Or special-case this situation. +Could be just a fatcat script, or a sandcrawler query. + + cat recent_missing_oa.json \ + | rg -v "(MDPI|Informa UK|American Chemical Society|IOP Publishing|CAIRN|OpenEdition)" \ + | shuf -n10 | jq .doi -r + + https://doi.org/10.18452/24860 + => success (just needed quarterly retry?) + => b8c6c86aebd6cd2d85515441bbce052bcff033f2 (not in fatcat.wiki) + => current status is "bad-redirect" + https://doi.org/10.26181/20099540.v1 + => success + => 3f9b1ff2a09f3ea9051dbbef277579e8a0b4df30 + => this is figshare, and versioned. PDF was already attached to another DOI: https://doi.org/10.26181/20099540 + https://doi.org/10.4230/lipics.sea.2022.22 + => there is a bug resulting in trailing slash in `citation_pdf_url` + => fixed as a quirks mode + => emailed to report + https://doi.org/10.3897/aca.5.e89679 + => success + => e6fd1e066c8a323dc56246631748202d5fb48808 + => current status is 'bad-redirect' + https://doi.org/10.1103/physrevd.105.115035 + => was 404 + => success after force-recrawl of the terminal URL (not base URL) + https://doi.org/10.1155/2022/4649660 + => was 404 + => success after force-recrawl (of base_url) + https://doi.org/10.1090/spmj/1719 + => paywall (not actually OA) + => https://fatcat.wiki/container/x6jfhegb3fbv3bcbqn2i3espiu is on Szczepanski list, but isn't all OA? 
+ https://doi.org/10.1139/as-2022-0011 + => was no-pdf-link + => fixed fulltext URL extraction + => still needed to re-crawl terminal PDF link? hrm + https://doi.org/10.31703/grr.2022(vii-ii).02 + => was no-pdf-link + => fixed! success + https://doi.org/10.1128/spectrum.00154-22 + => was 404 + => now repeatably 503, via SPN + https://doi.org/10.51601/ijersc.v3i3.393 + => 503 server error + https://doi.org/10.25416/ntr.20137379.v1 + => is figshare + => docx (not PDF) + https://doi.org/10.25394/pgs.20263698.v1 + => figshare + => embargo'd + https://doi.org/10.24850/j-tyca-14-4-7 + => was no-pdf-link + => docs.google.com/viewer (!) + => now handle this (success) + https://doi.org/10.26267/unipi_dione/1832 + => was bad-redirect + => success + https://doi.org/10.25560/98019 + => body-too-large + => also, PDF metadata fails to parse + => is actually like 388 MByte + https://doi.org/10.14738/abr.106.12511 + => max-hops-exceeded + => bumped max-hops from 6 to 8 + => then success (via google drive) + https://doi.org/10.24350/cirm.v.19933803 + => video, not PDF + https://doi.org/10.2140/pjm.2022.317.67 + => link-loop + => not actually OA + https://doi.org/10.26265/polynoe-2306 + => was bad-redirect + => now success + https://doi.org/10.3389/fpls.2022.826875 + => frontiers + => was terminal-bad-status (403) + => success on retry (not sure why) + => maybe this is also a date-of-publication thing? + => not sure all these should be retried though + https://doi.org/10.14198/medcom.22240 + => was terminal-bad-status (404) + => force-recrawl resulted in an actual landing page, but still no-pdf-link + => but actual PDF is a real 404, it seems. oh well + https://doi.org/10.31729/jnma.7579 + => no-capture + https://doi.org/10.25373/ctsnet.20146931.v2 + => figshare + => video, not document or PDF + https://doi.org/10.1007/s42600-022-00224-0 + => not yet crawled/attempted (!) 
+ => springer + => not actually OA + https://doi.org/10.37391/ijeer.100207 + => some upstream issue (server not found) + https://doi.org/10.1063/5.0093946 + => aip.scitation.org, is actually OA (can download in browser) + => cookie trap? + => redirect-loop (seems like a true redirect loop) + => retrying the terminal PDF URL seems to have worked + https://doi.org/10.18502/jchr.v11i2.9998 + => no actual fulltext on publisher site + https://doi.org/10.1128/spectrum.01144-22 + => this is a 503 error, even after retrying. weird! + +DONE: check `publisher_type` in chocula for: +- "MDPI AG" +- "Informa UK (Taylor & Francis)" + + cat recent_missing_oa.json | jq '[.publisher, .publisher_type]' -c | sort | uniq -c | sort -nr | head -n40 + 4819 ["MDPI AG","longtail"] + 924 ["Informa UK (Taylor & Francis)",null] + 665 ["EAG-Publikationen",null] + 631 ["American Chemical Society","society"] + 449 ["IOP Publishing","society"] + 357 ["The Optical Society","society"] + 336 ["OpenEdition","oa"] + 309 ["CAIRN","repository"] + 308 ["Schloss Dagstuhl - Leibniz-Zentrum für Informatik",null] + 303 ["Apollo - University of Cambridge Repository",null] + 292 ["Springer (Biomed Central Ltd.)",null] + 275 ["Purdue University Graduate School",null] + 270 ["Suryasa and Sons","longtail"] + 257 ["La Trobe",null] + 216 ["Frontiers Media SA","longtail"] + 193 ["Proceedings of the National Academy of Sciences","society"] + 182 ["Informa UK (Taylor & Francis)","longtail"] + 176 ["American Physical Society","society"] + 168 ["Institution of Electrical Engineers","society"] + 166 ["Oxford University Press","unipress"] + 153 ["Loughborough University",null] + + chocula mostly seems to set these correctly. is the issue that the chocula + computed values aren't coming through or getting updated? probably. both + the release (from container) metadata update; and chocula importer not + doing updates based on this field; and some old/incorrect values. 
+ + did some cleanups of specific containers, and next chocula update should + result in a bunch more `publisher_type` getting populated on older + containers + + +TODO: verify URLs are actually URLs... somewhere? in the ingest pipeline + +TODO: fatcat: don't ingest figshare "work" DOIs, only the "versioned" ones (?) + doi_prefix:10.26181 + +WIP: sandcrawler: regularly (weekly?) re-try 404 errors (the terminal URL, not the base url?) (or, some kind of delay?) + doi_prefix:10.3390 (MDPI) + doi_prefix:10.1103 + doi_prefix:10.1155 + +DONE: simply re-ingest all: + doi_prefix:10.4230 + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf query 'doi_prefix:10.4230' + # Counter({'ingest_request': 2096, 'elasticsearch_release': 2096, 'estimate': 2096, 'kafka': 2096}) + container_65lzi3vohrat5nnymk3dqpoycy + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 65lzi3vohrat5nnymk3dqpoycy + # Counter({'ingest_request': 187, 'elasticsearch_release': 187, 'estimate': 187, 'kafka': 187}) + container_5vp2bio65jdc3blx6rfhp3chde + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --ingest-type pdf container --container-id 5vp2bio65jdc3blx6rfhp3chde + # Counter({'ingest_request': 83, 'elasticsearch_release': 83, 'estimate': 83, 'kafka': 83}) + +DONE: verify and maybe re-ingest all: + is_oa:true publisher:"Canadian Science Publishing" in_ia:false + + ./fatcat_ingest.py --env prod --enqueue-kafka --kafka-hosts 
wbgrp-svc280.us.archive.org,wbgrp-svc284.us.archive.org,wbgrp-svc350.us.archive.org --kafka-request-topic sandcrawler-prod.ingest-file-requests-daily --allow-non-oa --ingest-type pdf --force-recrawl query 'year:>2010 is_oa:true publisher:"Canadian Science Publishing" in_ia:false !journal:print' + # Counter({'ingest_request': 1041, 'elasticsearch_release': 1041, 'estimate': 1041, 'kafka': 1041}) + + +## Re-Ingest bad-redirect, max-hops-exceeded, and google drive + +Similar to `redirect-loop`: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'bad-redirect' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json'; + # COPY 100011 + # after first run: COPY 5611 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.status = 'max-hops-exceeded' + -- AND ingest_request.ingest_type = 'pdf' + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json'; + # COPY 3546 + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.hit is false + AND ingest_file_result.terminal_url like 'https://docs.google.com/viewer%' + AND ( + 
ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'unpaywall' + ) + ) TO '/srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json'; + # COPY 1082 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.rows.json > /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json + + cat /srv/sandcrawler/tasks/retry_badredirect.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + cat /srv/sandcrawler/tasks/retry_maxhops.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + cat /srv/sandcrawler/tasks/retry_googledocs.2022-07-20.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + # DONE diff --git a/notes/ingest/2022-07-19_dblp.md b/notes/ingest/2022-07-19_dblp.md new file mode 100644 index 0000000..74aeb8d --- /dev/null +++ b/notes/ingest/2022-07-19_dblp.md @@ -0,0 +1,50 @@ + +Cross-posting from fatcat bulk metadata update/ingest. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +## Post-Crawl Stats + +This is after bulk ingest, crawl, and a bit of "live" re-ingest. 
Query run +2022-09-06: + + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'dblp' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-----------------------+-------- + pdf | success | 305142 + pdf | no-pdf-link | 192683 + pdf | no-capture | 42634 + pdf | terminal-bad-status | 38041 + pdf | skip-url-blocklist | 31055 + pdf | link-loop | 9263 + pdf | wrong-mimetype | 4545 + pdf | redirect-loop | 3952 + pdf | empty-blob | 2705 + pdf | wayback-content-error | 834 + pdf | wayback-error | 294 + pdf | petabox-error | 202 + pdf | blocked-cookie | 155 + pdf | cdx-error | 115 + pdf | body-too-large | 66 + pdf | bad-redirect | 19 + pdf | timeout | 7 + pdf | bad-gzip-encoding | 4 + (18 rows) + +That is quite a lot of `no-pdf-link`, might be worth doing a random sample +and/or re-ingest. And a chunk of `no-capture` to retry. diff --git a/notes/ingest/2022-07_doaj.md b/notes/ingest/2022-07_doaj.md new file mode 100644 index 0000000..7e55633 --- /dev/null +++ b/notes/ingest/2022-07_doaj.md @@ -0,0 +1,199 @@ + +This is just a load and bulk ingest; will do a separate 'TARGETED' crawl for +heritrix bulk crawling, along with JALC and DOAJ URLs. 
+ + export SNAPSHOT=2022-07-20 + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget "https://archive.org/download/doaj_data_${SNAPSHOT}/doaj_article_data_${SNAPSHOT}_all.json.gz" + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz + # 9.72M 0:36:28 [4.44k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_${SNAPSHOT}_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # 9.72M 0:17:04 [9.49k/s] + # Worker: Counter({'total': 9721097, 'insert-requests': 809681, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9721097, 'pushed': 9721097}) + +Stats after this load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3165539 + pdf | | 2078874 + html | | 1547698 + html | wrong-scope | 1114332 + pdf | no-pdf-link | 517261 + html | success | 388376 + html | unknown-scope | 242044 + pdf | no-capture | 179030 + pdf | terminal-bad-status | 174741 + html | no-capture | 155323 + pdf | null-body | 129267 + pdf | redirect-loop | 127136 + html | html-resource-no-capture | 117275 + html | null-body | 100296 + pdf | blocked-cookie | 71093 + html | redirect-loop | 65519 + html | terminal-bad-status | 64856 + html | blocked-cookie | 64095 + html | spn2-backoff | 55173 + pdf | link-loop | 27440 + html | 
wrong-mimetype | 26016 + html | wayback-content-error | 20109 + xml | | 13624 + pdf | wrong-mimetype | 8411 + xml | success | 6899 + html | petabox-error | 6199 + html | wayback-error | 5269 + html | spn2-cdx-lookup-failure | 4635 + html | spn2-recent-capture | 4527 + xml | null-body | 2353 + (30 rows) + +## Bulk Ingest + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + -- AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + -- AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + -- AND ingest_file_result.terminal_url NOT LIKE 
'%://archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + -- AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-07-20.rows.json'; + # COPY 3962331 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json + # 3.96M 0:01:47 [36.7k/s] + +Top domains: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n20 + 789988 www.mdpi.com + 318142 www.frontiersin.org + 226316 link.springer.com + 204429 www.scielo.br + 201175 www.sciencedirect.com + 72852 ieeexplore.ieee.org + 68983 dx.doi.org + 33286 www.dovepress.com + 26020 elifesciences.org + 23838 www.cetjournal.it + 21102 mab-online.nl + 20242 www.revistas.usp.br + 16564 periodicos.uem.br + 15710 journals.openedition.org + 14514 dergipark.org.tr + 14072 apcz.umk.pl + 13924 ojs.minions.amsterdam + 13717 bmgn-lchr.nl + 13512 ojstest.minions.amsterdam + 10440 journals.asm.org + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_${SNAPSHOT}.requests.json | rg -v "dx.doi.org" | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # Done + +## Stats Again + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- ORDER BY ingest_request.ingest_type, COUNT DESC + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 4704006 + html | wrong-scope | 1761227 + html | success | 778165 + pdf | no-pdf-link | 759805 + html | no-capture | 382080 + html | unknown-scope | 313391 + html | html-resource-no-capture | 292953 + pdf | no-capture | 290311 + pdf | terminal-bad-status | 271776 + pdf | null-body | 129267 + pdf | blocked-cookie | 108491 + html | terminal-bad-status | 103014 + html | null-body | 100296 + html | blocked-cookie | 88533 + pdf | | 81517 + pdf | skip-url-blocklist | 76443 + html | spn2-backoff | 50615 + pdf | link-loop | 45516 + html | wrong-mimetype | 33525 + html | wayback-content-error | 25535 + pdf | empty-blob | 21431 + pdf | redirect-loop | 19795 + html | petabox-error | 18291 + html | empty-blob | 14391 + pdf | wrong-mimetype | 14084 + html | redirect-loop | 12856 + xml | success | 10381 + xml | no-capture | 10008 + html | skip-url-blocklist | 3294 + html | cdx-error | 3275 + (30 rows) + +Pretty good success rate for PDFs. That is a lot of `no-capture`! And why 81k +PDFs with no attempt at all? Maybe a filter, or bogus URLs. + +Over 1.5M new PDF success over this crawl iteration period, nice. 
diff --git a/notes/ingest/2022-07_targeted.md b/notes/ingest/2022-07_targeted.md new file mode 100644 index 0000000..415f23b --- /dev/null +++ b/notes/ingest/2022-07_targeted.md @@ -0,0 +1,140 @@ + +Heritrix follow-up crawl for recent bulk ingest of DOAJ, JALC, and DBLP URLs. + + export PATCHDATE=2022-07-29 + export CRAWLVM=wbgrp-svc279.us.archive.org + export CRAWLNAME=TARGETED-ARTICLE-CRAWL-2022-07 + +## Seedlist Query + +Terminal URLs dump: + + COPY ( + SELECT row_to_json(t) FROM ( + SELECT ingest_file_result.terminal_url, ingest_request.* + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ( + ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + ) + -- AND ingest_file_result.updated >= '2022-01-12' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status LIKE 'spn2-%' + OR ingest_file_result.status = 'gateway-timeout' + OR ( + ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + ) + ) + ) + AND ( + ingest_request.link_source = 'doi' + OR ingest_request.link_source = 'doaj' + OR ingest_request.link_source = 'dblp' + OR ingest_request.link_source = 'arxiv' + OR ingest_request.link_source = 'pmc' + -- OR ingest_request.link_source = 'unpaywall' + -- OR ingest_request.link_source = 'oai' + ) + + AND ingest_file_result.terminal_url NOT LIKE '%mdz-nbn-resolving.de%' + AND ingest_file_result.terminal_url NOT LIKE '%edoc.mpg.de%' + AND ingest_file_result.terminal_url NOT LIKE 
'%orcid.org%' + AND ingest_file_result.terminal_url NOT LIKE '%gateway.isiknowledge.com%' + AND ingest_file_result.terminal_url NOT LIKE '%europmc.org%' + AND ingest_file_result.terminal_url NOT LIKE '%arxiv.org%' + -- AND ingest_file_result.terminal_url NOT LIKE 'https://doi.org/10.%' + + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%researchgate.net%' + AND ingest_file_result.terminal_url NOT LIKE '%muse.jhu.edu%' + AND ingest_file_result.terminal_url NOT LIKE '%omicsonline.org%' + AND ingest_file_result.terminal_url NOT LIKE '%link.springer.com%' + AND ingest_file_result.terminal_url NOT LIKE '%ieeexplore.ieee.org%' + + -- AND ingest_file_result.terminal_url NOT LIKE '%zenodo.org%' + AND ingest_file_result.terminal_url NOT LIKE '%t2r2.star.titech.ac.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%www.google.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%figshare.com%' + -- AND ingest_file_result.terminal_url NOT LIKE '%springernature.figshare.com%' + AND ingest_file_result.terminal_url NOT LIKE '%www.archive.org%' + ) t + ) TO '/srv/sandcrawler/tasks/patch_ingest_request_2022-07-29.rows.json'; + => COPY 3524573 + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json \ + | rg -v "\\\\" \ + | jq -r .terminal_url \ + | rg '://' \ + | rg -i '^http' \ + | rg -v '://10\.' \ + | rg -v '://172\.' 
\ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt + => 3.11M 0:01:08 [45.4k/s] + + # check top domains + cut -f3 -d/ /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | sort | uniq -c | sort -nr | head -n25 + 624948 doi.org + 382492 www.jstage.jst.go.jp + 275087 www.mdpi.com + 157134 www.persee.fr + 108979 www.sciencedirect.com + 94375 www.scielo.br + 50834 onlinelibrary.wiley.com + 49991 journals.lww.com + 30354 www.frontiersin.org + 27963 doaj.org + 27058 www.e-periodica.ch + 24147 dl.acm.org + 23389 aclanthology.org + 22086 www.research-collection.ethz.ch + 21589 medien.die-bonn.de + 18866 www.ingentaconnect.com + 18583 doi.nrct.go.th + 18271 repositories.lib.utexas.edu + 17634 hdl.handle.net + 16366 archives.datapages.com + 15146 cgscholar.com + 13987 dl.gi.de + 13188 www.degruyter.com + 12503 ethos.bl.uk + 12304 preprints.jmir.org + + cat /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.txt | awk '{print "F+ " $1}' > /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule + => done + + scp /srv/sandcrawler/tasks/patch_terminal_url.$PATCHDATE.schedule $CRAWLVM:/tmp + ssh $CRAWLVM sudo -u heritrix cp /tmp/patch_terminal_url.$PATCHDATE.schedule /0/ia-jobs/journal-crawls/$CRAWLNAME/action/ + + +## Re-Ingest + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json + => 3.52M 0:01:37 [36.2k/s] + +Ingest: + + cat /srv/sandcrawler/tasks/patch_ingest_request_$PATCHDATE.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md new file mode 100644 index 0000000..ac7c68f --- /dev/null +++ b/notes/ingest/2022-09_oaipmh.md @@ -0,0 +1,397 @@ + +Martin did another OAI-PMH bulk crawl, this time with the old JSON format: <https://archive.org/download/oai_harvest_20220921> + +I updated the transform script to block some additional domains. + + +## Prep + +Fetch the snapshot: + + cd /srv/sandcrawler/tasks/ + wget https://archive.org/download/oai_harvest_20220921/2022-09-21-oai-pmh-metadata-compat.jsonl.zst + +Transform to ingest requests: + + cd /srv/sandcrawler/src/python + git log | head -n1 + # commit dfd4605d84712eccb95a63e50b0bcb343642b433 + + pipenv shell + zstdcat /srv/sandcrawler/tasks/2022-09-21-oai-pmh-metadata-compat.jsonl.zst \ + | ./scripts/oai2ingestrequest.py - \ + | pv -l \ + | gzip \ + > /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz + # 16.1M 1:01:02 [4.38k/s] + +Curious about types, though this would probably be handled at fatcat ingest +time: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.types[]' -r | sort | uniq -c | sort -nr > oai_type_counts.txt + + head oai_type_counts.txt -n30 + 5623867 info:eu-repo/semantics/article + 5334928 info:eu-repo/semantics/publishedVersion + 3870359 text + 1240225 Text + 829169 Article + 769849 NonPeerReviewed + 665700 PeerReviewed + 648740 Peer-reviewed Article + 547857 article + 482906 info:eu-repo/semantics/bachelorThesis + 353814 Thesis + 329269 Student thesis + 262650 info:eu-repo/semantics/conferenceObject + 185354 Journal articles + 162021 info:eu-repo/semantics/doctoralThesis + 152079 Journal Article + 150226 Research Article + 130217 Conference papers + 127255 Artículo revisado por pares + 124243 Newspaper + 123908 ##rt.metadata.pkp.peerReviewed## + 123309 Photograph + 122981 info:eu-repo/semantics/masterThesis + 116719 Book + 
108946 Image + 108216 Report + 107946 Other + 103562 masterThesis + 103038 info:eu-repo/semantics/other + 101404 StillImage + [...] + +And formats: + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | jq '.formats[]' -r | sort | uniq -c | sort -nr > oai_format_counts.txt + + head -n 20 oai_format_counts.txt + 11151928 application/pdf + 677413 text + 561656 text/html + 498518 image/jpeg + 231219 Text + 193638 text/xml + 147214 Image + 117073 image/jpg + 110872 pdf + 91323 image/tiff + 76948 bib + 75393 application/xml + 70244 Digitized from 35 mm. microfilm. + 68206 mods + 59227 PDF + 57677 application/epub+zip + 57602 application/octet-stream + 52072 text/plain + 51620 application/msword + 47227 audio/mpeg + +Also, just overall size (number of records): + + zstdcat 2022-09-21-oai-pmh-metadata-compat.jsonl.zst | wc -l + # 20,840,301 + +Next, load into sandcrawler DB: + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz | pv -l | ./persist_tool.py ingest-request - + + Traceback (most recent call last): + File "./persist_tool.py", line 311, in <module> + main() + File "./persist_tool.py", line 307, in main + args.func(args) + File "./persist_tool.py", line 119, in run_ingest_request + pusher.run() + File "/1/srv/sandcrawler/src/python/sandcrawler/workers.py", line 397, in run + self.worker.push_batch(batch) + File "/1/srv/sandcrawler/src/python/sandcrawler/persist.py", line 342, in push_batch + resp = self.db.insert_ingest_request(self.cur, irequests) + File "/1/srv/sandcrawler/src/python/sandcrawler/db.py", line 459, in insert_ingest_request + resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True) + File "/1/srv/sandcrawler/src/python/.venv/lib/python3.8/site-packages/psycopg2/extras.py", line 1270, in execute_values + cur.execute(b''.join(parts)) + psycopg2.errors.ProgramLimitExceeded: index row size 3400 exceeds btree version 4 maximum 2704 for index "ingest_request_base_url_idx" + DETAIL: Index row references 
tuple (6893121,3) in relation "ingest_request". + HINT: Values larger than 1/3 of a buffer page cannot be indexed. + Consider a function index of an MD5 hash of the value, or use full text indexing. + 15.7M 0:41:48 [6.27k/s] + +Darn, this means we won't get reasonable stats about how many rows were +inserted/updated. + +Patched the persist tool to skip very long URLs, and ran again (backwards, just +URLs which didn't get inserted already): + + zcat /srv/sandcrawler/tasks/2022-09-21_oaipmh_ingestrequests.json.gz \ + | tac \ + | head -n1000000 \ + | pv -l \ + | ./persist_tool.py ingest-request - + # 1.00M 0:03:04 [5.41k/s] + # Worker: Counter({'total': 1000000, 'insert-requests': 124701, 'skip-url-too-long': 1, 'update-requests': 0}) + +Status of just the new lines: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+--------- + | 6398455 + success | 540219 + no-pdf-link | 41316 + link-loop | 23871 + no-capture | 11350 + redirect-loop | 8315 + wrong-mimetype | 2394 + terminal-bad-status | 1540 + null-body | 1038 + cdx-error | 272 + empty-blob | 237 + petabox-error | 213 + wayback-error | 186 + blocked-cookie | 107 + timeout | 47 + wayback-content-error | 26 + spn2-cdx-lookup-failure | 21 + skip-url-blocklist | 16 + spn2-backoff | 15 + body-too-large | 13 + (20 rows) + + +## Bulk Ingest + +Should already have filtered domains/prefixes in transform script, so not +including filters here. 
+ + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ingest_file_result.status IS NULL + ) TO '/srv/sandcrawler/tasks/oai_noingest_20220921.rows.json'; + # COPY 6398455 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_noingest_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json + # 6.40M 0:02:18 [46.2k/s] + + cat /srv/sandcrawler/tasks/oai_noingest_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # DONE + +Expect this ingest to take a week or so. + +Then, run stats again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + no-capture | 3617175 + success | 2775036 + no-pdf-link | 449298 + link-loop | 74260 + terminal-bad-status | 47819 + wrong-mimetype | 20195 + redirect-loop | 18197 + empty-blob | 12127 + cdx-error | 3038 + skip-url-blocklist | 2630 + wayback-error | 2599 + petabox-error | 2354 + wayback-content-error | 1617 + blocked-cookie | 1293 + null-body | 1038 + body-too-large | 670 + | 143 + bad-gzip-encoding | 64 + timeout | 47 + spn2-cdx-lookup-failure | 20 + (20 rows) + + +## Crawl Seedlist + + COPY ( + SELECT row_to_json(ingest_request.*) 
+ FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + AND ( + ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'redirect-loop' + OR ingest_file_result.status = 'terminal-bad-status' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'timeout' + OR ingest_file_result.status = 'wayback-content-error' + ) + ) TO '/srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json'; + => COPY 3692846 + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | pv -l \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json + => 3.69M 0:01:19 [46.6k/s] + +This will be used for re-ingest later. For now, extract URLs: + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | jq .base_url -r \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + => 3.66M 0:00:59 [61.8k/s] + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.rows.json \ + | rg '"terminal_url"' \ + | jq -r .result.terminal_url \ + | rg -v ^null$ \ + | sort -u -S 4G \ + | pv -l \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + => 0.00 0:00:05 [0.00 /s] (no matches: the COPY above exports only ingest_request columns, so the rows carry no "terminal_url" field) + + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | awk '{print "F+ " $1}' \ + | shuf \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + +What domains are we crawling? 
+ + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt \ + | sort -u -S 4G \ + | cut -d/ -f3 \ + | sort \ + | uniq -c \ + | sort -nr \ + > /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + + head -n20 /srv/sandcrawler/tasks/oai_nocapture_20220921.domains.txt + 91899 raco.cat + 70116 islandora.wrlc.org + 68708 urn.kb.se + 63726 citeseerx.ist.psu.edu + 50370 publications.rwth-aachen.de + 44885 urn.nsk.hr + 38429 server15795.contentdm.oclc.org + 33041 periodicos.ufpb.br + 32519 nbn-resolving.org + 31990 www.ajol.info + 24745 hal.archives-ouvertes.fr + 22569 id.nii.ac.jp + 17239 tilburguniversity.on.worldcat.org + 15873 dspace.nbuv.gov.ua + 15436 digitalcommons.wustl.edu + 14885 www.iiste.org + 14623 www.manchester.ac.uk + 14033 nbn-resolving.de + 13999 opus4.kobv.de + 13689 www.redalyc.org + +Sizes: + + wc -l /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.base_url.txt + 0 /srv/sandcrawler/tasks/oai_nocapture_20220921.terminal_url.txt + 3662864 /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule + + +Copy seedlist to crawler: + + # as regular user + scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp + +## Post-Crawl Bulk Ingest + + # ran 2022-11-16, after crawl cleanup + cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \ + | rg -v "\\\\" \ + | jq . 
-c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2022-09-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -----------------------+--------- + success | 4721164 +1,946,128 + no-pdf-link | 1116290 + no-capture | 673939 + terminal-bad-status | 232217 + link-loop | 148544 + wrong-mimetype | 68841 + redirect-loop | 26262 + empty-blob | 17759 + cdx-error | 6570 + blocked-cookie | 4026 + blocked-wall | 3054 + skip-url-blocklist | 2924 + body-too-large | 2404 + bad-redirect | 1565 + wayback-error | 1320 + petabox-error | 1083 + null-body | 1038 + wayback-content-error | 264 + bad-gzip-encoding | 150 + | 143 + (20 rows) + |