diff options
Diffstat (limited to 'notes/ingest/2020-05_oai_pmh.md')
-rw-r--r-- | notes/ingest/2020-05_oai_pmh.md | 428 |
1 files changed, 428 insertions, 0 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md new file mode 100644 index 0000000..fe22c75 --- /dev/null +++ b/notes/ingest/2020-05_oai_pmh.md @@ -0,0 +1,428 @@ + +Primary Goal: start large crawl of OAI landing pages that we haven't seen + +Fields of interest for ingest: +- oai identifer +- doi +- formats +- urls (maybe also "relations") +- types (type+stage) + +## Other Tasks + +About 150 million total lines. + +Types coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt + +Dump all ISSNs, with counts, quick check how many are in chocula/fatcat + + zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt + +Language coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt + +Format coverage + + zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt + => 150M 0:56:14 [44.7k/s] + +Have a DOI? + + zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l + => 16,013,503 + + zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt + => 11,940,950 + +## Transform, Load, Bulk Ingest + + zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz + => 80M 6:36:55 [3.36k/s] + + time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request - + => 80M 4:00:21 [5.55k/s] + => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0}) + => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963}) + + => real 240m21.207s + => user 85m12.576s + => sys 3m29.580s + + select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai'; + => 51,185,088 + +Why so many (30 million) skipped? Not unique? + + zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l + => 51,185,088 + + zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt + wc -l request_url.txt + => 50,002,674 request_url.txt + + zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt + wc -l requires_oai.txt + => 34,622,083 requires_oai.txt + +Yup, tons of duplication. And remember this is exact URL, not SURT or similar. + +How many of these are URLs we have seen and ingested already? + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + | 49491452 + success | 1469113 + no-capture | 134611 + redirect-loop | 59666 + no-pdf-link | 8947 + cdx-error | 7561 + terminal-bad-status | 6704 + null-body | 5042 + wrong-mimetype | 879 + wayback-error | 722 + petabox-error | 198 + gateway-timeout | 86 + link-loop | 51 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + spn2-error | 4 + bad-gzip-encoding | 4 + spn2-error:job-failed | 2 + (18 rows) + +Dump ingest requests: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND ingest_file_result.status IS NULL + ) TO '/grande/snapshots/oai_noingest_20200506.rows.json'; + => COPY 49491452 + + WARNING: should have transformed from rows to requests here + + cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +## Crawl and re-ingest + +Updated stats after ingest (NOTE: ingest requests not really formed correctly, +but doesn't matter because fatcat wasn't importing these anyways): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + no-capture | 42565875 + success | 5227609 + no-pdf-link | 2156341 + redirect-loop | 559721 + cdx-error | 260446 + wrong-mimetype | 148871 + terminal-bad-status | 109725 + link-loop | 92792 + null-body | 30688 + | 15287 + petabox-error | 11109 + wayback-error | 6261 + skip-url-blocklist | 184 + gateway-timeout | 86 + bad-gzip-encoding | 25 + invalid-host-resolution | 24 + spn2-cdx-lookup-failure | 22 + bad-redirect | 15 + spn2-error | 4 + spn2-error:job-failed | 2 + (20 rows) + +Dump again for crawling: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND date(ingest_request.created) > '2020-05-01' + AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error') + ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json'; + +Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains: + + 4876135 www.kb.dk REMOVE: too large and generic + 3110009 kb-images.kb.dk REMOVE: dead? + 1274638 mdz-nbn-resolving.de REMOVE: maybe broken + 982312 aggr.ukm.um.si REMOVE: maybe broken + +And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so +expecting at least 11,052,439 `no-capture` ingest results (and should probably +filter for these or even delete from the ingest request table). + +Ingest progress: + + 2020-08-05 14:02: 32,571,018 + 2020-08-06 13:49: 31,195,169 + 2020-08-07 10:11: 29,986,169 + 2020-08-10 10:43: 26,497,196 + 2020-08-12 11:02: 23,811,845 + 2020-08-17 13:34: 19,460,502 + 2020-08-20 09:49: 15,069,507 + 2020-08-25 09:56: 9,397,035 + 2020-09-02 15:02: 305,889 (72k longest queue) + 2020-09-03 14:30: done + +## Post-ingest stats + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + no-capture | 16804277 + no-pdf-link | 14895249 + success | 13898603 + redirect-loop | 2709730 + cdx-error | 827024 + terminal-bad-status | 740037 + wrong-mimetype | 604242 + link-loop | 532553 + null-body | 95721 + wayback-error | 41864 + petabox-error | 19204 + | 15287 + gateway-timeout | 510 + bad-redirect | 318 + skip-url-blocklist | 184 + bad-gzip-encoding | 114 + timeout | 78 + spn2-cdx-lookup-failure | 59 + invalid-host-resolution | 19 + blocked-cookie | 6 + (20 rows) + +Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth +dumping the full kafka result topic, filter to OAI requests, and extracting the +missing URLs. + +Top counts by OAI prefix: + + SELECT + oai_prefix, + COUNT(CASE WHEN status = 'success' THEN 1 END) as success, + COUNT(*) as total + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix + ORDER BY total DESC + LIMIT 25; + + oai_prefix | success | total + --------------------------+---------+--------- + kb.dk | 0 | 7989412 (excluded) + repec | 1118591 | 2783448 + bnf.fr | 0 | 2187277 + hispana.mcu.es | 19404 | 1492639 + bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?) + hal | 564700 | 1049607 + ukm.si | 0 | 982468 (excluded) + hsp.org | 0 | 810281 + www.irgrid.ac.cn | 17578 | 748828 + cds.cern.ch | 72811 | 688091 + americanae.aecid.es | 69678 | 572792 + biodiversitylibrary.org | 2121 | 566154 + juser.fz-juelich.de | 22777 | 518551 + espace.library.uq.edu.au | 6494 | 508960 + igi.indrastra.com | 58689 | 478577 + archive.ugent.be | 63654 | 424014 + hrcak.srce.hr | 395031 | 414897 + zir.nsk.hr | 153889 | 397200 + renati.sunedu.gob.pe | 78399 | 388355 + hypotheses.org | 3 | 374296 + rour.neicon.ru | 7963 | 354529 + generic.eprints.org | 261221 | 340470 + invenio.nusl.cz | 6184 | 325867 + evastar-karlsruhe.de | 62044 | 317952 + quod.lib.umich.edu | 5 | 309135 + (25 rows) + +Top counts by OAI prefix and status: + + SELECT + oai_prefix, + status, + COUNT((oai_prefix,status)) + FROM ( + SELECT + ingest_file_result.status as status, + -- eg "oai:cwi.nl:4881" + substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + GROUP BY oai_prefix, status + ORDER BY COUNT DESC + LIMIT 30; + + + oai_prefix | status | count + --------------------------+---------------+--------- + kb.dk | no-capture | 7955231 (excluded) + bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?) + repec | success | 1118591 + hispana.mcu.es | no-pdf-link | 1118092 + bnf.fr | no-capture | 1100591 + ukm.si | no-capture | 976004 (excluded) + hsp.org | no-pdf-link | 773496 + repec | no-pdf-link | 625629 + bnf.fr | no-pdf-link | 607813 + hal | success | 564700 + biodiversitylibrary.org | no-pdf-link | 531409 + cds.cern.ch | no-capture | 529842 + repec | redirect-loop | 504393 + juser.fz-juelich.de | no-pdf-link | 468813 + bnf.fr | redirect-loop | 436087 + americanae.aecid.es | no-pdf-link | 409954 + hrcak.srce.hr | success | 395031 + www.irgrid.ac.cn | no-pdf-link | 362087 + hal | no-pdf-link | 352111 + www.irgrid.ac.cn | no-capture | 346963 + espace.library.uq.edu.au | no-pdf-link | 315302 + igi.indrastra.com | no-pdf-link | 312087 + repec | no-capture | 309882 + invenio.nusl.cz | no-pdf-link | 302657 + hypotheses.org | no-pdf-link | 298750 + rour.neicon.ru | redirect-loop | 291922 + renati.sunedu.gob.pe | no-capture | 276388 + t2r2.star.titech.ac.jp | no-pdf-link | 264109 + generic.eprints.org | success | 261221 + quod.lib.umich.edu | no-pdf-link | 253937 + (30 rows) + +If we remove excluded prefixes, and some large/generic prefixes (bnf.fr, +hispana.mcu.es, hsp.org), then the aggregate counts are: + + no-capture | 16,804,277 -> 5,502,242 + no-pdf-link | 14,895,249 -> 12,395,848 + +Top status by terminal domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + ) t1 + WHERE t1.domain != '' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ----------------------------------+---------------+-------- + hispana.mcu.es | no-pdf-link | 709701 (national scope) + gallica.bnf.fr | no-pdf-link | 601193 (national scope) + discover.hsp.org | no-pdf-link | 524212 (historical) + www.biodiversitylibrary.org | no-pdf-link | 479288 + gallica.bnf.fr | redirect-loop | 435981 (national scope) + hrcak.srce.hr | success | 389673 + hemerotecadigital.bne.es | no-pdf-link | 359243 + juser.fz-juelich.de | no-pdf-link | 345112 + espace.library.uq.edu.au | no-pdf-link | 304299 + invenio.nusl.cz | no-pdf-link | 302586 + igi.indrastra.com | no-pdf-link | 292006 + openrepository.ru | redirect-loop | 291555 + hal.archives-ouvertes.fr | success | 278134 + t2r2.star.titech.ac.jp | no-pdf-link | 263971 + bib-pubdb1.desy.de | no-pdf-link | 254879 + quod.lib.umich.edu | no-pdf-link | 250382 + encounters.hsp.org | no-pdf-link | 248132 + americanae.aecid.es | no-pdf-link | 245295 + www.irgrid.ac.cn | no-pdf-link | 242496 + publikationen.bibliothek.kit.edu | no-pdf-link | 222041 + www.sciencedirect.com | no-pdf-link | 211756 + dialnet.unirioja.es | redirect-loop | 203615 + edoc.mpg.de | no-pdf-link | 195526 + bibliotecadigital.jcyl.es | no-pdf-link | 184671 + hal.archives-ouvertes.fr | no-pdf-link | 183809 + www.sciencedirect.com | redirect-loop | 173439 + lup.lub.lu.se | no-pdf-link | 165788 + orbi.uliege.be | no-pdf-link | 158313 + www.erudit.org | success | 155986 + lib.dr.iastate.edu | success | 153384 + (30 rows) + +Follow-ups are TBD but could include: +- crawling the ~5m no-capture links directly (eg, not `base_url`) from the + ingest result JSON, while retaining the ingest request for later re-ingest +- investigating and iterating on PDF link extraction, both for large platforms + and randomly sampled from long tail +- classifying OAI prefixes by type (subject repository, institutional + repository, journal, national-library, historical docs, greylit, law, etc) +- running pdftrio over some/all of this corpus |