From 5b1d9a31e5f0bfd0e2d544d87953043ac8f8f8b9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 23 Nov 2022 16:49:42 -0800
Subject: 2022 OAI-PMH crawl notes update

---
 notes/ingest/2022-09_oaipmh.md | 48 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

(limited to 'notes')

diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
index 0aa4487..ac7c68f 100644
--- a/notes/ingest/2022-09_oaipmh.md
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -347,3 +347,51 @@ Copy seedlist to crawler:
     # as regular user
     scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
 
+## Post-Crawl Bulk Ingest
+
+    # ran 2022-11-16, after crawl cleanup
+    cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+    | rg -v "\\\\" \
+    | jq . -c \
+    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+    => DONE
+
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'oai'
+        AND date(ingest_request.created) > '2022-09-01'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+
+             status         |  count
+    -----------------------+---------
+     success               | 4721164    +1,946,128
+     no-pdf-link           | 1116290
+     no-capture            |  673939
+     terminal-bad-status   |  232217
+     link-loop             |  148544
+     wrong-mimetype        |   68841
+     redirect-loop         |   26262
+     empty-blob            |   17759
+     cdx-error             |    6570
+     blocked-cookie        |    4026
+     blocked-wall          |    3054
+     skip-url-blocklist    |    2924
+     body-too-large        |    2404
+     bad-redirect          |    1565
+     wayback-error         |    1320
+     petabox-error         |    1083
+     null-body             |    1038
+     wayback-content-error |     264
+     bad-gzip-encoding     |     150
+                           |     143
+    (20 rows)
+
-- 
cgit v1.2.3