diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-03 16:36:35 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-03 16:36:35 -0700 |
commit | 62252a6179953ccc79a6cb60c40a756fa0a034e1 (patch) | |
tree | 0659bb9988590d67bb11f2772c5245a93a37bfe2 /notes/ingest | |
parent | 07754188a6003a8f4cce1c6012b2a05df49269f8 (diff) | |
download | sandcrawler-62252a6179953ccc79a6cb60c40a756fa0a034e1.tar.gz sandcrawler-62252a6179953ccc79a6cb60c40a756fa0a034e1.zip |
OAI-PMH ingest notes
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2020-05_oai_pmh.md | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md index 2f20415..de9bfba 100644 --- a/notes/ingest/2020-05_oai_pmh.md +++ b/notes/ingest/2020-05_oai_pmh.md @@ -181,3 +181,235 @@ Dump again for crawling: AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error') ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json'; +Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains: + + 4876135 www.kb.dk REMOVE: too large and generic + 3110009 kb-images.kb.dk REMOVE: dead? + 1274638 mdz-nbn-resolving.de REMOVE: maybe broken + 982312 aggr.ukm.um.si REMOVE: maybe broken + +And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so +expecting at least 11,052,439 `no-capture` ingest results (and should probably +filter for these or even delete from the ingest request table). + +## Post-ingest stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + no-capture | 16804277 + no-pdf-link | 14895249 + success | 13898603 + redirect-loop | 2709730 + cdx-error | 827024 + terminal-bad-status | 740037 + wrong-mimetype | 604242 + link-loop | 532553 + null-body | 95721 + wayback-error | 41864 + petabox-error | 19204 + | 15287 + gateway-timeout | 510 + bad-redirect | 318 + skip-url-blocklist | 184 + bad-gzip-encoding | 114 + timeout | 78 + spn2-cdx-lookup-failure | 59 + invalid-host-resolution | 19 + blocked-cookie | 6 + (20 rows) + +Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth +dumping the full kafka result topic, filter to OAI requests, and extracting the +missing URLs. 
+ +Top counts by OAI prefix: + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 25; + + oai_prefix | success | total + --------------------------+---------+--------- + kb.dk | 0 | 7989412 (excluded) + repec | 1118591 | 2783448 + bnf.fr | 0 | 2187277 + hispana.mcu.es | 19404 | 1492639 + bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?) + hal | 564700 | 1049607 + ukm.si | 0 | 982468 (excluded) + hsp.org | 0 | 810281 + www.irgrid.ac.cn | 17578 | 748828 + cds.cern.ch | 72811 | 688091 + americanae.aecid.es | 69678 | 572792 + biodiversitylibrary.org | 2121 | 566154 + juser.fz-juelich.de | 22777 | 518551 + espace.library.uq.edu.au | 6494 | 508960 + igi.indrastra.com | 58689 | 478577 + archive.ugent.be | 63654 | 424014 + hrcak.srce.hr | 395031 | 414897 + zir.nsk.hr | 153889 | 397200 + renati.sunedu.gob.pe | 78399 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7963 | 354529 + generic.eprints.org | 261221 | 340470 + invenio.nusl.cz | 6184 | 325867 + evastar-karlsruhe.de | 62044 | 317952 + quod.lib.umich.edu | 5 | 309135 + (25 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND 
ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 30; + + + oai_prefix | status | count + --------------------------+---------------+--------- + kb.dk | no-capture | 7955231 (excluded) + bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?) + repec | success | 1118591 + hispana.mcu.es | no-pdf-link | 1118092 + bnf.fr | no-capture | 1100591 + ukm.si | no-capture | 976004 (excluded) + hsp.org | no-pdf-link | 773496 + repec | no-pdf-link | 625629 + bnf.fr | no-pdf-link | 607813 + hal | success | 564700 + biodiversitylibrary.org | no-pdf-link | 531409 + cds.cern.ch | no-capture | 529842 + repec | redirect-loop | 504393 + juser.fz-juelich.de | no-pdf-link | 468813 + bnf.fr | redirect-loop | 436087 + americanae.aecid.es | no-pdf-link | 409954 + hrcak.srce.hr | success | 395031 + www.irgrid.ac.cn | no-pdf-link | 362087 + hal | no-pdf-link | 352111 + www.irgrid.ac.cn | no-capture | 346963 + espace.library.uq.edu.au | no-pdf-link | 315302 + igi.indrastra.com | no-pdf-link | 312087 + repec | no-capture | 309882 + invenio.nusl.cz | no-pdf-link | 302657 + hypotheses.org | no-pdf-link | 298750 + rour.neicon.ru | redirect-loop | 291922 + renati.sunedu.gob.pe | no-capture | 276388 + t2r2.star.titech.ac.jp | no-pdf-link | 264109 + generic.eprints.org | success | 261221 + quod.lib.umich.edu | no-pdf-link | 253937 + (30 rows) + +If we remove excluded prefixes, and some large/generic prefixes (bnf.fr, +hispana.mcu.es, hsp.org), then the aggregate counts are: + + no-capture | 16,804,277 -> 5,502,242 + no-pdf-link | 14,895,249 -> 12,395,848 + +Top status by terminal domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN 
ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + WHERE t1.domain != '' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ----------------------------------+---------------+-------- + hispana.mcu.es | no-pdf-link | 709701 (national scope) + gallica.bnf.fr | no-pdf-link | 601193 (national scope) + discover.hsp.org | no-pdf-link | 524212 (historical) + www.biodiversitylibrary.org | no-pdf-link | 479288 + gallica.bnf.fr | redirect-loop | 435981 (national scope) + hrcak.srce.hr | success | 389673 + hemerotecadigital.bne.es | no-pdf-link | 359243 + juser.fz-juelich.de | no-pdf-link | 345112 + espace.library.uq.edu.au | no-pdf-link | 304299 + invenio.nusl.cz | no-pdf-link | 302586 + igi.indrastra.com | no-pdf-link | 292006 + openrepository.ru | redirect-loop | 291555 + hal.archives-ouvertes.fr | success | 278134 + t2r2.star.titech.ac.jp | no-pdf-link | 263971 + bib-pubdb1.desy.de | no-pdf-link | 254879 + quod.lib.umich.edu | no-pdf-link | 250382 + encounters.hsp.org | no-pdf-link | 248132 + americanae.aecid.es | no-pdf-link | 245295 + www.irgrid.ac.cn | no-pdf-link | 242496 + publikationen.bibliothek.kit.edu | no-pdf-link | 222041 + www.sciencedirect.com | no-pdf-link | 211756 + dialnet.unirioja.es | redirect-loop | 203615 + edoc.mpg.de | no-pdf-link | 195526 + bibliotecadigital.jcyl.es | no-pdf-link | 184671 + hal.archives-ouvertes.fr | no-pdf-link | 183809 + www.sciencedirect.com | redirect-loop | 173439 + lup.lub.lu.se | no-pdf-link | 165788 + orbi.uliege.be | no-pdf-link | 158313 + www.erudit.org | success | 155986 + lib.dr.iastate.edu | success | 153384 + (30 rows) + +Follow-ups are TBD but could include: +- crawling the ~5m no-capture links directly (eg, not `base_url`) from the + ingest result JSON, while retaining the ingest request for 
later re-ingest +- investigating and iterating on PDF link extraction, both for large platforms + and for a random sample of the long tail +- classifying OAI prefixes by type (subject repository, institutional + repository, journal, national library, historical docs, grey literature, law, etc.) +- running pdftrio over some/all of this corpus |