From 285a8ad33285ece138b4cd420cbf7c854b712f43 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 5 May 2020 12:54:52 -0700 Subject: summarize datacite and MAG 2020 crawls --- notes/ingest/2020-03-04_mag.md | 97 +++++++++++++++++++++++++++++++++ notes/ingest/2020-04-07_datacite.md | 103 ++++++++++++++++++++++++++++++++++++ 2 files changed, 200 insertions(+) (limited to 'notes') diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md index 9b000a3..428ce05 100644 --- a/notes/ingest/2020-03-04_mag.md +++ b/notes/ingest/2020-03-04_mag.md @@ -477,3 +477,100 @@ Ok, enqueue! cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 +## Final Stats + +... for this round of ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------------------+---------- + success | 18712849 + redirect-loop | 2008110 + no-pdf-link | 1337012 + link-loop | 1326761 + no-capture | 1030693 + terminal-bad-status | 637143 + gateway-timeout | 193194 + cdx-error | 125907 + spn2-cdx-lookup-failure | 77842 + wrong-mimetype | 50882 + wayback-error | 40278 + invalid-host-resolution | 35201 + petabox-error | 11254 + null-body | 6485 + spn2-error | 1643 + spn2-error:job-failed | 747 + spn2-error:invalid-url-syntax | 325 + spn2-error:soft-time-limit-exceeded | 190 + bad-redirect | 77 + | 47 + (20 rows) + +Failures by domain: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain 
+ FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + domain | status | count + ---------------------------------+---------------------+-------- + ieeexplore.ieee.org | redirect-loop | 677712 + cyberleninka.ru | link-loop | 308390 + papers.ssrn.com | link-loop | 281804 + ieeexplore.ieee.org | link-loop | 273559 + dialnet.unirioja.es | redirect-loop | 240504 + dialnet.unirioja.es | terminal-bad-status | 232481 + onlinelibrary.wiley.com | no-pdf-link | 220932 + iopscience.iop.org | terminal-bad-status | 172480 + validate.perfdrive.com | no-pdf-link | 172312 + link.springer.com | redirect-loop | 130398 + agupubs.onlinelibrary.wiley.com | no-pdf-link | 113382 + iopscience.iop.org | redirect-loop | 105234 + www.bmj.com | link-loop | 100354 + www.researchgate.net | redirect-loop | 84366 + www.cambridge.org | link-loop | 83171 + jamanetwork.com | no-pdf-link | 75053 + febs.onlinelibrary.wiley.com | no-pdf-link | 74872 + www.jstor.org | redirect-loop | 72059 + journals.sagepub.com | no-pdf-link | 63028 + science.sciencemag.org | redirect-loop | 62927 + profile.thieme.de | no-pdf-link | 62406 + cyberleninka.ru | redirect-loop | 56733 + link.springer.com | link-loop | 47608 + physoc.onlinelibrary.wiley.com | no-pdf-link | 30180 + science.sciencemag.org | link-loop | 29908 + papers.ssrn.com | redirect-loop | 27255 + obgyn.onlinelibrary.wiley.com | no-pdf-link | 26789 + www.computer.org | no-pdf-link | 26444 + watermark.silverchair.com | terminal-bad-status | 25934 + www.nature.com | redirect-loop | 25306 + (30 rows) diff --git a/notes/ingest/2020-04-07_datacite.md b/notes/ingest/2020-04-07_datacite.md index b0217f0..0fc7e67 100644 --- 
a/notes/ingest/2020-04-07_datacite.md +++ b/notes/ingest/2020-04-07_datacite.md @@ -16,3 +16,106 @@ ingest request file using `fatcat_ingest.py` and set that up for bulk crawling. cat /srv/fatcat/snapshots/datacite_papers_20200407.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +## Ingest Stats + +Note that this will have a small fraction of non-datacite results mixed in (eg, +from COVID-19 targeted crawls): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+--------- + no-pdf-link | 4646767 + redirect-loop | 1447229 + no-capture | 860235 + success | 849501 + terminal-bad-status | 174869 + cdx-error | 159805 + wayback-error | 18076 + wrong-mimetype | 11169 + link-loop | 8410 + gateway-timeout | 4034 + spn2-cdx-lookup-failure | 510 + petabox-error | 339 + null-body | 251 + spn2-error | 19 + spn2-error:job-failed | 14 + bad-gzip-encoding | 13 + timeout | 5 + spn2-error:soft-time-limit-exceeded | 4 + invalid-host-resolution | 2 + spn2-error:pending | 1 + (20 rows) + +Top domains/statuses (excluding success): + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = 
ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + AND created >= '2020-04-07' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 30; + + domain | status | count + ---------------------------------------+---------------------+-------- + ssl.fao.org | no-pdf-link | 862277 + www.e-periodica.ch | no-pdf-link | 746781 + www.researchgate.net | redirect-loop | 664524 + dlc.library.columbia.edu | no-pdf-link | 493111 + www.die-bonn.de | redirect-loop | 352903 + figshare.com | no-pdf-link | 319709 + statisticaldatasets.data-planet.com | no-pdf-link | 309584 + catalog.paradisec.org.au | redirect-loop | 225396 + zenodo.org | no-capture | 193201 + digi.ub.uni-heidelberg.de | no-pdf-link | 184974 + open.library.ubc.ca | no-pdf-link | 167841 + zenodo.org | no-pdf-link | 130617 + www.google.com | no-pdf-link | 111312 + www.e-manuscripta.ch | no-pdf-link | 79192 + ds.iris.edu | no-pdf-link | 77649 + data.inra.fr | no-pdf-link | 69440 + www.tib.eu | no-pdf-link | 63872 + www.egms.de | redirect-loop | 53877 + archaeologydataservice.ac.uk | redirect-loop | 52838 + d.lib.msu.edu | no-pdf-link | 45297 + www.e-rara.ch | no-pdf-link | 45163 + springernature.figshare.com | no-pdf-link | 42527 + boris.unibe.ch | no-pdf-link | 40816 + www.research-collection.ethz.ch | no-capture | 40350 + spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 33059 + repository.dri.ie | terminal-bad-status | 32760 + othes.univie.ac.at | no-pdf-link | 32558 + repositories.lib.utexas.edu | no-capture | 31526 + posterng.netkey.at | no-pdf-link | 30315 + zenodo.org | terminal-bad-status | 29614 + (30 rows) + -- cgit v1.2.3