about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- notes/ingest/2022-09_oaipmh.md 48
1 files changed, 48 insertions, 0 deletions
diff --git a/notes/ingest/2022-09_oaipmh.md b/notes/ingest/2022-09_oaipmh.md
index 0aa4487..ac7c68f 100644
--- a/notes/ingest/2022-09_oaipmh.md
+++ b/notes/ingest/2022-09_oaipmh.md
@@ -347,3 +347,51 @@ Copy seedlist to crawler:
# as regular user
scp /srv/sandcrawler/tasks/oai_nocapture_20220921.schedule wbgrp-svc206.us.archive.org:/tmp
+## Post-Crawl Bulk Ingest
+
+ # ran 2022-11-16, after crawl cleanup
+ cat /srv/sandcrawler/tasks/oai_nocapture_20220921.ingest_request.json \
+ | rg -v "\\\\" \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ => DONE
+
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'oai'
+ AND date(ingest_request.created) > '2022-09-01'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -----------------------+---------
+ success | 4721164 (+1,946,128)
+ no-pdf-link | 1116290
+ no-capture | 673939
+ terminal-bad-status | 232217
+ link-loop | 148544
+ wrong-mimetype | 68841
+ redirect-loop | 26262
+ empty-blob | 17759
+ cdx-error | 6570
+ blocked-cookie | 4026
+ blocked-wall | 3054
+ skip-url-blocklist | 2924
+ body-too-large | 2404
+ bad-redirect | 1565
+ wayback-error | 1320
+ petabox-error | 1083
+ null-body | 1038
+ wayback-content-error | 264
+ bad-gzip-encoding | 150
+ | 143
+ (20 rows)
+