aboutsummaryrefslogtreecommitdiffstats
path: root/notes/ingest
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-05 12:54:52 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-05 12:54:52 -0700
commit285a8ad33285ece138b4cd420cbf7c854b712f43 (patch)
tree7da49073b2819010bfeaba939b07a90ea7e17aca /notes/ingest
parentba6d3270a8c2bc9b919ca616da919a54ecb168b7 (diff)
downloadsandcrawler-285a8ad33285ece138b4cd420cbf7c854b712f43.tar.gz
sandcrawler-285a8ad33285ece138b4cd420cbf7c854b712f43.zip
summarize datacite and MAG 2020 crawls
Diffstat (limited to 'notes/ingest')
-rw-r--r--notes/ingest/2020-03-04_mag.md97
-rw-r--r--notes/ingest/2020-04-07_datacite.md103
2 files changed, 200 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md
index 9b000a3..428ce05 100644
--- a/notes/ingest/2020-03-04_mag.md
+++ b/notes/ingest/2020-03-04_mag.md
@@ -477,3 +477,100 @@ Ok, enqueue!
cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+## Final Stats
+
+... for this round of ingest:
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+
+ status | count
+ -------------------------------------+----------
+ success | 18712849
+ redirect-loop | 2008110
+ no-pdf-link | 1337012
+ link-loop | 1326761
+ no-capture | 1030693
+ terminal-bad-status | 637143
+ gateway-timeout | 193194
+ cdx-error | 125907
+ spn2-cdx-lookup-failure | 77842
+ wrong-mimetype | 50882
+ wayback-error | 40278
+ invalid-host-resolution | 35201
+ petabox-error | 11254
+ null-body | 6485
+ spn2-error | 1643
+ spn2-error:job-failed | 747
+ spn2-error:invalid-url-syntax | 325
+ spn2-error:soft-time-limit-exceeded | 190
+ bad-redirect | 77
+ | 47
+ (20 rows)
+
+Failures by domain:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_file_result.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'mag'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ domain | status | count
+ ---------------------------------+---------------------+--------
+ ieeexplore.ieee.org | redirect-loop | 677712
+ cyberleninka.ru | link-loop | 308390
+ papers.ssrn.com | link-loop | 281804
+ ieeexplore.ieee.org | link-loop | 273559
+ dialnet.unirioja.es | redirect-loop | 240504
+ dialnet.unirioja.es | terminal-bad-status | 232481
+ onlinelibrary.wiley.com | no-pdf-link | 220932
+ iopscience.iop.org | terminal-bad-status | 172480
+ validate.perfdrive.com | no-pdf-link | 172312
+ link.springer.com | redirect-loop | 130398
+ agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382
+ iopscience.iop.org | redirect-loop | 105234
+ www.bmj.com | link-loop | 100354
+ www.researchgate.net | redirect-loop | 84366
+ www.cambridge.org | link-loop | 83171
+ jamanetwork.com | no-pdf-link | 75053
+ febs.onlinelibrary.wiley.com | no-pdf-link | 74872
+ www.jstor.org | redirect-loop | 72059
+ journals.sagepub.com | no-pdf-link | 63028
+ science.sciencemag.org | redirect-loop | 62927
+ profile.thieme.de | no-pdf-link | 62406
+ cyberleninka.ru | redirect-loop | 56733
+ link.springer.com | link-loop | 47608
+ physoc.onlinelibrary.wiley.com | no-pdf-link | 30180
+ science.sciencemag.org | link-loop | 29908
+ papers.ssrn.com | redirect-loop | 27255
+ obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789
+ www.computer.org | no-pdf-link | 26444
+ watermark.silverchair.com | terminal-bad-status | 25934
+ www.nature.com | redirect-loop | 25306
+ (30 rows)
diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04-07_datacite.md
index b0217f0..0fc7e67 100644
--- a/notes/ingest/2020-04-07_datacite.md
+++ b/notes/ingest/2020-04-07_datacite.md
@@ -16,3 +16,106 @@ ingest request file using `fatcat_ingest.py` and set that up for bulk crawling.
cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+## Ingest Stats
+
+Note that this will have a small fraction of non-datacite results mixed in (eg,
+from COVID-19 targeted crawls):
+
+ SELECT ingest_file_result.status, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ GROUP BY status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+ status | count
+ -------------------------------------+---------
+ no-pdf-link | 4646767
+ redirect-loop | 1447229
+ no-capture | 860235
+ success | 849501
+ terminal-bad-status | 174869
+ cdx-error | 159805
+ wayback-error | 18076
+ wrong-mimetype | 11169
+ link-loop | 8410
+ gateway-timeout | 4034
+ spn2-cdx-lookup-failure | 510
+ petabox-error | 339
+ null-body | 251
+ spn2-error | 19
+ spn2-error:job-failed | 14
+ bad-gzip-encoding | 13
+ timeout | 5
+ spn2-error:soft-time-limit-exceeded | 4
+ invalid-host-resolution | 2
+ spn2-error:pending | 1
+ (20 rows)
+
+Top domains/statuses (including success):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE
+ ingest_request.ingest_type = 'pdf'
+ AND ingest_request.link_source = 'doi'
+ AND ingest_request.ingest_type = 'pdf'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ AND created >= '2020-04-07'
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ domain | status | count
+ ---------------------------------------+---------------------+--------
+ ssl.fao.org | no-pdf-link | 862277
+ www.e-periodica.ch | no-pdf-link | 746781
+ www.researchgate.net | redirect-loop | 664524
+ dlc.library.columbia.edu | no-pdf-link | 493111
+ www.die-bonn.de | redirect-loop | 352903
+ figshare.com | no-pdf-link | 319709
+ statisticaldatasets.data-planet.com | no-pdf-link | 309584
+ catalog.paradisec.org.au | redirect-loop | 225396
+ zenodo.org | no-capture | 193201
+ digi.ub.uni-heidelberg.de | no-pdf-link | 184974
+ open.library.ubc.ca | no-pdf-link | 167841
+ zenodo.org | no-pdf-link | 130617
+ www.google.com | no-pdf-link | 111312
+ www.e-manuscripta.ch | no-pdf-link | 79192
+ ds.iris.edu | no-pdf-link | 77649
+ data.inra.fr | no-pdf-link | 69440
+ www.tib.eu | no-pdf-link | 63872
+ www.egms.de | redirect-loop | 53877
+ archaeologydataservice.ac.uk | redirect-loop | 52838
+ d.lib.msu.edu | no-pdf-link | 45297
+ www.e-rara.ch | no-pdf-link | 45163
+ springernature.figshare.com | no-pdf-link | 42527
+ boris.unibe.ch | no-pdf-link | 40816
+ www.research-collection.ethz.ch | no-capture | 40350
+ spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059
+ repository.dri.ie | terminal-bad-status | 32760
+ othes.univie.ac.at | no-pdf-link | 32558
+ repositories.lib.utexas.edu | no-capture | 31526
+ posterng.netkey.at | no-pdf-link | 30315
+ zenodo.org | terminal-bad-status | 29614
+ (30 rows)
+