aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2020-05_oai_pmh.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2020-05_oai_pmh.md')
-rw-r--r--notes/ingest/2020-05_oai_pmh.md428
1 files changed, 428 insertions, 0 deletions
diff --git a/notes/ingest/2020-05_oai_pmh.md b/notes/ingest/2020-05_oai_pmh.md
new file mode 100644
index 0000000..fe22c75
--- /dev/null
+++ b/notes/ingest/2020-05_oai_pmh.md
@@ -0,0 +1,428 @@
+
+Primary Goal: start large crawl of OAI landing pages that we haven't seen
+
+Fields of interest for ingest:
+- oai identifer
+- doi
+- formats
+- urls (maybe also "relations")
+- types (type+stage)
+
+## Other Tasks
+
+About 150 million total lines.
+
+Types coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.types != null) | .types[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > types_counts.txt
+
+Dump all ISSNs, with counts, quick check how many are in chocula/fatcat
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.issn != null) | .issn[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > issn_counts.txt
+
+Language coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.languages != null) | .languages[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > languages_counts.txt
+
+Format coverage
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.formats != null) | .formats[]" -r | sort -S 5G | uniq -c | sort -nr -S 1G > formats_counts.txt
+ => 150M 0:56:14 [44.7k/s]
+
+Have a DOI?
+
+ zstdcat oai.ndjson.zst | pv -l | rg '"doi":' | rg '"10.' | wc -l
+ => 16,013,503
+
+ zstdcat oai.ndjson.zst | pv -l | jq "select(.doi != null) | .doi[]" -r | sort -u -S 5G > doi_raw.txt
+ => 11,940,950
+
+## Transform, Load, Bulk Ingest
+
+ zstdcat oai.ndjson.zst | ./oai2ingestrequest.py - | pv -l | gzip > oai.202002.requests.json.gz
+ => 80M 6:36:55 [3.36k/s]
+
+ time zcat /schnell/oai-pmh/oai.202002.requests.json.gz | pv -l | ./persist_tool.py ingest-request -
+ => 80M 4:00:21 [5.55k/s]
+ => Worker: Counter({'total': 80013963, 'insert-requests': 51169081, 'update-requests': 0})
+ => JSON lines pushed: Counter({'pushed': 80013963, 'total': 80013963})
+
+ => real 240m21.207s
+ => user 85m12.576s
+ => sys 3m29.580s
+
+ select count(*) from ingest_request where ingest_type = 'pdf' and link_source = 'oai';
+ => 51,185,088
+
+Why so many (30 million) skipped? Not unique?
+
+ zcat oai.202002.requests.json.gz | jq '[.link_source_id, .base_url]' -c | sort -u -S 4G | wc -l
+ => 51,185,088
+
+ zcat oai.202002.requests.json.gz | jq .base_url -r | pv -l | sort -u -S 4G > request_url.txt
+ wc -l request_url.txt
+ => 50,002,674 request_url.txt
+
+ zcat oai.202002.requests.json.gz | jq .link_source_id -r | pv -l | sort -u -S 4G > requires_oai.txt
+ wc -l requires_oai.txt
+ => 34,622,083 requires_oai.txt
+
+Yup, tons of duplication. And remember this is exact URL, not SURT or similar.
+
+How many of these are URLs we have seen and ingested already?
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ | 49491452
+ success | 1469113
+ no-capture | 134611
+ redirect-loop | 59666
+ no-pdf-link | 8947
+ cdx-error | 7561
+ terminal-bad-status | 6704
+ null-body | 5042
+ wrong-mimetype | 879
+ wayback-error | 722
+ petabox-error | 198
+ gateway-timeout | 86
+ link-loop | 51
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ spn2-error | 4
+ bad-gzip-encoding | 4
+ spn2-error:job-failed | 2
+ (18 rows)
+
+Dump ingest requests:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND ingest_file_result.status IS NULL
+ ) TO '/grande/snapshots/oai_noingest_20200506.rows.json';
+ => COPY 49491452
+
+ WARNING: should have transformed from rows to requests here
+
+ cat /grande/snapshots/oai_noingest_20200506.rows.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+## Crawl and re-ingest
+
+Updated stats after ingest (NOTE: ingest requests not really formed correctly,
+but doesn't matter because fatcat wasn't importing these anyways):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 42565875
+ success | 5227609
+ no-pdf-link | 2156341
+ redirect-loop | 559721
+ cdx-error | 260446
+ wrong-mimetype | 148871
+ terminal-bad-status | 109725
+ link-loop | 92792
+ null-body | 30688
+ | 15287
+ petabox-error | 11109
+ wayback-error | 6261
+ skip-url-blocklist | 184
+ gateway-timeout | 86
+ bad-gzip-encoding | 25
+ invalid-host-resolution | 24
+ spn2-cdx-lookup-failure | 22
+ bad-redirect | 15
+ spn2-error | 4
+ spn2-error:job-failed | 2
+ (20 rows)
+
+Dump again for crawling:
+
+ COPY (
+ SELECT row_to_json(ingest_request.*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2020-05-01'
+ AND (ingest_file_result.status = 'no-capture' or ingest_file_result.status = 'cdx-error')
+ ) TO '/grande/snapshots/oai_tocrawl_20200526.rows.json';
+
+Notes about crawl setup are in `journal-crawls` repo. Excluded the following domains:
+
+ 4876135 www.kb.dk REMOVE: too large and generic
+ 3110009 kb-images.kb.dk REMOVE: dead?
+ 1274638 mdz-nbn-resolving.de REMOVE: maybe broken
+ 982312 aggr.ukm.um.si REMOVE: maybe broken
+
+And went from about 42,826,313 rows to 31,773,874 unique URLs to crawl, so
+expecting at least 11,052,439 `no-capture` ingest results (and should probably
+filter for these or even delete from the ingest request table).
+
+Ingest progress:
+
+ 2020-08-05 14:02: 32,571,018
+ 2020-08-06 13:49: 31,195,169
+ 2020-08-07 10:11: 29,986,169
+ 2020-08-10 10:43: 26,497,196
+ 2020-08-12 11:02: 23,811,845
+ 2020-08-17 13:34: 19,460,502
+ 2020-08-20 09:49: 15,069,507
+ 2020-08-25 09:56: 9,397,035
+ 2020-09-02 15:02: 305,889 (72k longest queue)
+ 2020-09-03 14:30: done
+
+## Post-ingest stats
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------+----------
+ no-capture | 16804277
+ no-pdf-link | 14895249
+ success | 13898603
+ redirect-loop | 2709730
+ cdx-error | 827024
+ terminal-bad-status | 740037
+ wrong-mimetype | 604242
+ link-loop | 532553
+ null-body | 95721
+ wayback-error | 41864
+ petabox-error | 19204
+ | 15287
+ gateway-timeout | 510
+ bad-redirect | 318
+ skip-url-blocklist | 184
+ bad-gzip-encoding | 114
+ timeout | 78
+ spn2-cdx-lookup-failure | 59
+ invalid-host-resolution | 19
+ blocked-cookie | 6
+ (20 rows)
+
+Hrm, +8 million or so 'success', but that is a lot of no-capture. May be worth
+dumping the full kafka result topic, filter to OAI requests, and extracting the
+missing URLs.
+
+Top counts by OAI prefix:
+
+ SELECT
+ oai_prefix,
+ COUNT(CASE WHEN status = 'success' THEN 1 END) as success,
+ COUNT(*) as total
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix
+ ORDER BY total DESC
+ LIMIT 25;
+
+ oai_prefix | success | total
+ --------------------------+---------+---------
+ kb.dk | 0 | 7989412 (excluded)
+ repec | 1118591 | 2783448
+ bnf.fr | 0 | 2187277
+ hispana.mcu.es | 19404 | 1492639
+ bdr.oai.bsb-muenchen.de | 73 | 1319882 (excluded?)
+ hal | 564700 | 1049607
+ ukm.si | 0 | 982468 (excluded)
+ hsp.org | 0 | 810281
+ www.irgrid.ac.cn | 17578 | 748828
+ cds.cern.ch | 72811 | 688091
+ americanae.aecid.es | 69678 | 572792
+ biodiversitylibrary.org | 2121 | 566154
+ juser.fz-juelich.de | 22777 | 518551
+ espace.library.uq.edu.au | 6494 | 508960
+ igi.indrastra.com | 58689 | 478577
+ archive.ugent.be | 63654 | 424014
+ hrcak.srce.hr | 395031 | 414897
+ zir.nsk.hr | 153889 | 397200
+ renati.sunedu.gob.pe | 78399 | 388355
+ hypotheses.org | 3 | 374296
+ rour.neicon.ru | 7963 | 354529
+ generic.eprints.org | 261221 | 340470
+ invenio.nusl.cz | 6184 | 325867
+ evastar-karlsruhe.de | 62044 | 317952
+ quod.lib.umich.edu | 5 | 309135
+ (25 rows)
+
+Top counts by OAI prefix and status:
+
+ SELECT
+ oai_prefix,
+ status,
+ COUNT((oai_prefix,status))
+ FROM (
+ SELECT
+ ingest_file_result.status as status,
+ -- eg "oai:cwi.nl:4881"
+ substring(ingest_request.link_source_id FROM 'oai:([^:]+):.*') AS oai_prefix
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ GROUP BY oai_prefix, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ oai_prefix | status | count
+ --------------------------+---------------+---------
+ kb.dk | no-capture | 7955231 (excluded)
+ bdr.oai.bsb-muenchen.de | no-capture | 1270209 (excluded?)
+ repec | success | 1118591
+ hispana.mcu.es | no-pdf-link | 1118092
+ bnf.fr | no-capture | 1100591
+ ukm.si | no-capture | 976004 (excluded)
+ hsp.org | no-pdf-link | 773496
+ repec | no-pdf-link | 625629
+ bnf.fr | no-pdf-link | 607813
+ hal | success | 564700
+ biodiversitylibrary.org | no-pdf-link | 531409
+ cds.cern.ch | no-capture | 529842
+ repec | redirect-loop | 504393
+ juser.fz-juelich.de | no-pdf-link | 468813
+ bnf.fr | redirect-loop | 436087
+ americanae.aecid.es | no-pdf-link | 409954
+ hrcak.srce.hr | success | 395031
+ www.irgrid.ac.cn | no-pdf-link | 362087
+ hal | no-pdf-link | 352111
+ www.irgrid.ac.cn | no-capture | 346963
+ espace.library.uq.edu.au | no-pdf-link | 315302
+ igi.indrastra.com | no-pdf-link | 312087
+ repec | no-capture | 309882
+ invenio.nusl.cz | no-pdf-link | 302657
+ hypotheses.org | no-pdf-link | 298750
+ rour.neicon.ru | redirect-loop | 291922
+ renati.sunedu.gob.pe | no-capture | 276388
+ t2r2.star.titech.ac.jp | no-pdf-link | 264109
+ generic.eprints.org | success | 261221
+ quod.lib.umich.edu | no-pdf-link | 253937
+ (30 rows)
+
+If we remove excluded prefixes, and some large/generic prefixes (bnf.fr,
+hispana.mcu.es, hsp.org), then the aggregate counts are:
+
+ no-capture | 16,804,277 -> 5,502,242
+ no-pdf-link | 14,895,249 -> 12,395,848
+
+Top status by terminal domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ ) t1
+ WHERE t1.domain != ''
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ----------------------------------+---------------+--------
+ hispana.mcu.es | no-pdf-link | 709701 (national scope)
+ gallica.bnf.fr | no-pdf-link | 601193 (national scope)
+ discover.hsp.org | no-pdf-link | 524212 (historical)
+ www.biodiversitylibrary.org | no-pdf-link | 479288
+ gallica.bnf.fr | redirect-loop | 435981 (national scope)
+ hrcak.srce.hr | success | 389673
+ hemerotecadigital.bne.es | no-pdf-link | 359243
+ juser.fz-juelich.de | no-pdf-link | 345112
+ espace.library.uq.edu.au | no-pdf-link | 304299
+ invenio.nusl.cz | no-pdf-link | 302586
+ igi.indrastra.com | no-pdf-link | 292006
+ openrepository.ru | redirect-loop | 291555
+ hal.archives-ouvertes.fr | success | 278134
+ t2r2.star.titech.ac.jp | no-pdf-link | 263971
+ bib-pubdb1.desy.de | no-pdf-link | 254879
+ quod.lib.umich.edu | no-pdf-link | 250382
+ encounters.hsp.org | no-pdf-link | 248132
+ americanae.aecid.es | no-pdf-link | 245295
+ www.irgrid.ac.cn | no-pdf-link | 242496
+ publikationen.bibliothek.kit.edu | no-pdf-link | 222041
+ www.sciencedirect.com | no-pdf-link | 211756
+ dialnet.unirioja.es | redirect-loop | 203615
+ edoc.mpg.de | no-pdf-link | 195526
+ bibliotecadigital.jcyl.es | no-pdf-link | 184671
+ hal.archives-ouvertes.fr | no-pdf-link | 183809
+ www.sciencedirect.com | redirect-loop | 173439
+ lup.lub.lu.se | no-pdf-link | 165788
+ orbi.uliege.be | no-pdf-link | 158313
+ www.erudit.org | success | 155986
+ lib.dr.iastate.edu | success | 153384
+ (30 rows)
+
+Follow-ups are TBD but could include:
+- crawling the ~5m no-capture links directly (eg, not `base_url`) from the
+ ingest result JSON, while retaining the ingest request for later re-ingest
+- investigating and iterating on PDF link extraction, both for large platforms
+ and randomly sampled from long tail
+- classifying OAI prefixes by type (subject repository, institutional
+ repository, journal, national-library, historical docs, greylit, law, etc)
+- running pdftrio over some/all of this corpus