aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest/2020-03-04_mag.md
diff options
context:
space:
mode:
Diffstat (limited to 'notes/ingest/2020-03-04_mag.md')
-rw-r--r-- notes/ingest/2020-03-04_mag.md 97
1 files changed, 97 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
index 9b000a3..428ce05 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03-04_mag.md
@@ -477,3 +477,100 @@ Ok, enqueue!
cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+## Final Stats
+
+Status counts for this round of ingest (all `pdf` ingest requests with `link_source = 'mag'`):
+
+ -- Per-status tally of MAG PDF ingest requests (top 20 statuses by volume).
+ -- The LEFT JOIN keeps requests that have no result row; they show up as a NULL status.
+ SELECT
+     res.status,
+     COUNT(*) AS count
+ FROM ingest_request AS req
+ LEFT JOIN ingest_file_result AS res
+     ON res.ingest_type = req.ingest_type
+     AND res.base_url = req.base_url
+ WHERE req.ingest_type = 'pdf'
+     AND req.link_source = 'mag'
+ GROUP BY res.status
+ ORDER BY count DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+----------
+ success | 18712849
+ redirect-loop | 2008110
+ no-pdf-link | 1337012
+ link-loop | 1326761
+ no-capture | 1030693
+ terminal-bad-status | 637143
+ gateway-timeout | 193194
+ cdx-error | 125907
+ spn2-cdx-lookup-failure | 77842
+ wrong-mimetype | 50882
+ wayback-error | 40278
+ invalid-host-resolution | 35201
+ petabox-error | 11254
+ null-body | 6485
+ spn2-error | 1643
+ spn2-error:job-failed | 747
+ spn2-error:invalid-url-syntax | 325
+ spn2-error:soft-time-limit-exceeded | 190
+ bad-redirect | 77
+ | 47
+ (20 rows)
+
+Failures by domain (non-success results, grouped by the domain of the terminal URL and by status):
+
+ -- Top 30 (domain, status) pairs among non-success MAG PDF ingest results.
+ -- Results with an empty or NULL terminal-URL domain are excluded by the outer WHERE.
+ SELECT
+     failures.domain,
+     failures.status,
+     COUNT(*) AS count
+ FROM (
+     SELECT
+         res.status,
+         -- Captures the text between "scheme://" and the next "/" of the terminal URL.
+         substring(res.terminal_url FROM '[^/]+://([^/]*)') AS domain
+     FROM ingest_file_result AS res
+     -- INNER JOIN on purpose: the link_source filter below rejects any result
+     -- without a matching request, so an outer join would add nothing.
+     INNER JOIN ingest_request AS req
+         ON res.ingest_type = req.ingest_type
+         AND res.base_url = req.base_url
+     WHERE res.ingest_type = 'pdf'
+         AND req.link_source = 'mag'
+ ) AS failures
+ WHERE failures.domain != ''
+     AND failures.status != 'success'
+ GROUP BY failures.domain, failures.status
+ ORDER BY count DESC
+ LIMIT 30;
+
+
+ domain | status | count
+ ---------------------------------+---------------------+--------
+ ieeexplore.ieee.org | redirect-loop | 677712
+ cyberleninka.ru | link-loop | 308390
+ papers.ssrn.com | link-loop | 281804
+ ieeexplore.ieee.org | link-loop | 273559
+ dialnet.unirioja.es | redirect-loop | 240504
+ dialnet.unirioja.es | terminal-bad-status | 232481
+ onlinelibrary.wiley.com | no-pdf-link | 220932
+ iopscience.iop.org | terminal-bad-status | 172480
+ validate.perfdrive.com | no-pdf-link | 172312
+ link.springer.com | redirect-loop | 130398
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382
+ iopscience.iop.org | redirect-loop | 105234
+ www.bmj.com | link-loop | 100354
+ www.researchgate.net | redirect-loop | 84366
+ www.cambridge.org | link-loop | 83171
+ jamanetwork.com | no-pdf-link | 75053
+ febs.onlinelibrary.wiley.com | no-pdf-link | 74872
+ www.jstor.org | redirect-loop | 72059
+ journals.sagepub.com | no-pdf-link | 63028
+ science.sciencemag.org | redirect-loop | 62927
+ profile.thieme.de | no-pdf-link | 62406
+ cyberleninka.ru | redirect-loop | 56733
+ link.springer.com | link-loop | 47608
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 30180
+ science.sciencemag.org | link-loop | 29908
+ papers.ssrn.com | redirect-loop | 27255
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789
+ www.computer.org | no-pdf-link | 26444
+ watermark.silverchair.com | terminal-bad-status | 25934
+ www.nature.com | redirect-loop | 25306
+ (30 rows)