From 1b74e8a4dee21bd260040dad8072e4fb48456b3c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Feb 2020 21:01:25 -0800 Subject: recent sandcrawler-db / ingest stats (interesting) --- sql/stats/2020-02-24_stats.txt | 482 +++++++++++++++++++++++++++++++++++++++++ sql/stats/README.md | 6 + 2 files changed, 488 insertions(+) create mode 100644 sql/stats/2020-02-24_stats.txt diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt new file mode 100644 index 0000000..e7a00e8 --- /dev/null +++ b/sql/stats/2020-02-24_stats.txt @@ -0,0 +1,482 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + Size: 271.83G + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB + "public"."grobid" | 47 GB | 7213 MB | 54 GB + "public"."file_meta" | 26 GB | 12 GB | 38 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB + "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."pdftrio" | 78 MB | 64 MB | 142 MB + (10 rows) + + +## File Metadata + +(skipping, no update) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 15,632,810 | 76,555,791 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 70656028 + 500 | 5896836 + -4 | 2295 + 503 | 111 + (4 rows) + + What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 56001631 + | 14654496 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2,868,825 | 2,887,834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591633 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + pdf | pubmed | 2 + (6 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | | 2943896 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | | 629719 + pdf | doi | fatcat-changelog | 129932 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | arxiv | fatcat-ingest | 998 + pdf | spn | | 64 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | spn | savepapernow-web | 18 + pdf | pubmed | | 2 + pdf | doi | savepapernow-web | 1 + (14 rows) + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + => UPDATE 2943896 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + => UPDATE 64 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + => UPDATE 629719 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + => UPDATE 1454 + + SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed'; + DELETE FROM ingest_request WHERE link_source = 'pubmed'; + => DELETE 2 + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591637 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + (5 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | fatcat-changelog | 3073828 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630717 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | fatcat-ingest | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | spn | savepapernow-web | 82 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 1 + (10 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + none? + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+---------+---------- + pdf | doi | 6591637 | 1622702 | 0.246 + pdf | pmc | 2030279 | 1241836 | 0.612 + pdf | arxiv | 630743 | 500620 | 0.794 + pdf | unpaywall | 1400 | 851 | 0.608 + pdf | spn | 82 | 62 | 0.756 + (5 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------------+--------- + pdf | success | 3366189 + pdf | no-pdf-link | 2902620 + pdf | no-capture | 1672025 + pdf | redirect-loop | 388844 + pdf | cdx-error | 272780 + pdf | terminal-bad-status | 171878 + pdf | spn-remote-error | 163843 + pdf | spn-error | 108070 + pdf | null-body | 66778 + pdf | link-loop | 43403 + pdf | skip-url-blocklist | 34705 + pdf | wrong-mimetype | 31343 + pdf | wayback-error | 13012 + pdf | spn2-cdx-lookup-failure | 6100 + pdf | gateway-timeout | 5633 + pdf | other-mimetype | 5114 + pdf | spn2-error:proxy-error | 538 + pdf | spn2-error:job-failed | 470 + pdf | petabox-error | 415 + pdf | spn2-error:browser-running-error | 136 + pdf | spn2-error | 127 + pdf | spn2-error:soft-time-limit-exceeded | 71 + pdf | bad-redirect | 39 + pdf | spn2-error:unknown | 30 + pdf | spn2-error:browsing-timeout | 25 + pdf | pending | 3 + pdf | invalid-host-resolution | 1 + (27 rows) + + +## Fatcat Files + +(skipping, no update) + +## Recent Success/Failure of Ingest by Domain + +NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a +small fraction of this is from changelog. + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + domain | status | count + -------------------------+----------------+------- + linkinghub.elsevier.com | no-capture | 2579 + www.mdpi.com | wrong-mimetype | 1313 + onlinelibrary.wiley.com | no-pdf-link | 785 + americanarchivist.org | no-pdf-link | 756 + journals.sagepub.com | redirect-loop | 503 + link.springer.com | redirect-loop | 432 + iopscience.iop.org | no-capture | 392 + www.tandfonline.com | no-pdf-link | 389 + pubs.rsc.org | no-capture | 361 + www.persee.fr | no-capture | 344 + (10 rows) + + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+------- + pdf | success | 40578 + pdf | cdx-error | 14982 + pdf | no-capture | 7747 + pdf | no-pdf-link | 7111 + pdf | redirect-loop | 3265 + pdf | wrong-mimetype | 1629 + pdf | spn2-cdx-lookup-failure | 657 + pdf | link-loop | 538 + pdf | null-body | 517 + pdf | terminal-bad-status | 400 + pdf | wayback-error | 79 + pdf | spn2-error:job-failed | 53 + pdf | gateway-timeout | 38 + pdf | spn2-error:soft-time-limit-exceeded | 7 + pdf | spn2-error | 6 + pdf | petabox-error | 5 + pdf | spn2-error:browsing-timeout | 4 + pdf | spn2-error:unknown | 2 + pdf | bad-redirect | 1 + pdf | pending | 1 + (20 rows) + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + + ingest_type | date | total | success + -------------+------------+---------+--------- + pdf | 2020-02-25 | 32660 | 14322 + pdf | 2020-02-24 | 44967 | 26263 + pdf | 2020-02-23 | 58795 | 18874 + pdf | 2020-02-22 | 844249 | 272606 + pdf | 2020-02-21 | 1287378 | 433487 + pdf | 2020-02-20 | 1455943 | 492408 + pdf | 2020-02-19 | 21453 | 7529 + pdf | 2020-02-18 | 5863 | 2926 + pdf | 2020-02-17 | 3737 | 970 + pdf | 2020-02-16 | 13779 | 4862 + pdf | 2020-02-15 | 1021020 | 623020 + pdf | 2020-02-14 | 1036036 | 632830 + pdf | 2020-02-13 | 13503 | 5824 + pdf | 2020-02-12 | 20078 | 11422 + pdf | 2020-02-11 | 13499 | 6781 + pdf | 2020-02-10 | 2275 | 961 + pdf | 2020-02-09 | 3231 | 1494 + pdf | 2020-02-08 | 8967 | 4400 + pdf | 2020-02-07 | 7022 | 2430 + pdf | 2020-02-06 | 1291 | 516 + pdf | 2020-02-05 | 8586 | 6596 + pdf | 2020-02-04 | 3681 | 3593 + pdf | 2020-02-03 | 284 | 284 + pdf | 2020-02-02 | 480 | 480 + pdf | 2020-02-01 | 489 | 336 + pdf | 2020-01-31 | 1187 | 1130 + pdf | 2020-01-30 | 1613 | 1288 + pdf | 2020-01-29 | 947 | 279 + pdf | 2020-01-28 | 667 | 323 + (29 rows) + +Top "no-capture" domains (will need to re-ingest using live tool): + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + --------------------------+------------+-------- + linkinghub.elsevier.com | no-capture | 320065 + iopscience.iop.org | no-capture | 46858 + pubs.rsc.org | no-capture | 43331 + www.persee.fr | no-capture | 38971 + www.doiserbia.nb.rs | no-capture | 27112 + academic.oup.com | no-capture | 18877 + www.osapublishing.org | no-capture | 17113 + osf.io | no-capture | 16978 + scripts.iucr.org | no-capture | 14844 + www.degruyter.com | no-capture | 8093 + mab-online.nl | no-capture | 6603 + insights.ovid.com | no-capture | 6457 + ir.lib.uth.gr | no-capture | 3625 + www.sciencedirect.com | no-capture | 3244 + www.tandfonline.com | no-capture | 3201 + www.ccsenet.org | no-capture | 2849 + www.intechopen.com | no-capture | 2813 + primary-hospital-care.ch | no-capture | 2774 + www.nature.com | no-capture | 2484 + www.indianjournals.com | no-capture | 2432 + journals.aps.org | no-capture | 2197 + journals.sagepub.com | no-capture | 2064 + www.episodes.org | no-capture | 1805 + periodicos.uninove.br | no-capture | 1692 + escholarship.org | no-capture | 1666 + (25 rows) + +Top "no-pdf-link" domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-pdf-link' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -----------------------------+-------------+-------- + plutof.ut.ee | no-pdf-link | 685315 + www.gbif.org | no-pdf-link | 670647 + doi.pangaea.de | no-pdf-link | 301984 + www.plate-archive.org | no-pdf-link | 209218 + onlinelibrary.wiley.com | no-pdf-link | 84890 + figshare.com | no-pdf-link | 72892 + zenodo.org | no-pdf-link | 45768 + www.tandfonline.com | no-pdf-link | 43848 + data.mendeley.com | no-pdf-link | 42367 + springernature.figshare.com | no-pdf-link | 35941 + dhz.uni-passau.de | no-pdf-link | 29187 + www.frontiersin.org | no-pdf-link | 17925 + digital.ucd.ie | no-pdf-link | 16769 + mr.crossref.org | no-pdf-link | 14999 + journals.lww.com | no-pdf-link | 12122 + musewide.aip.de | no-pdf-link | 10854 + datadryad.org | no-pdf-link | 10686 + www.jstor.org | no-pdf-link | 9159 + koreascience.or.kr | no-pdf-link | 9067 + easy.dans.knaw.nl | no-pdf-link | 8264 + scielo.conicyt.cl | no-pdf-link | 8069 + www.degruyter.com | no-pdf-link | 7989 + www.kci.go.kr | no-pdf-link | 6990 + www.m-hikari.com | no-pdf-link | 6941 + cshprotocols.cshlp.org | no-pdf-link | 6553 + (25 rows) + +Top block-ish domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status') + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + ---------------------------------+---------------------+------- + journals.openedition.org | redirect-loop | 30395 + ieeexplore.ieee.org | redirect-loop | 28926 + www.degruyter.com | redirect-loop | 18891 + www.cairn.info | link-loop | 8919 + www.frontiersin.org | terminal-bad-status | 6786 + projecteuclid.org | link-loop | 6098 + www.mdpi.com | terminal-bad-status | 5189 + medicalforum.ch | terminal-bad-status | 4596 + jrnl.nau.edu.ua | link-loop | 4238 + www.revistas.unam.mx | link-loop | 3926 + journals.aps.org | redirect-loop | 3696 + www.ijcseonline.org | redirect-loop | 3567 + www.researchsquare.com | terminal-bad-status | 3453 + www.persee.fr | terminal-bad-status | 3221 + www.baltistica.lt | link-loop | 2098 + osf.io | redirect-loop | 2004 + seer.ufrgs.br | terminal-bad-status | 2002 + jtd.amegroups.com | link-loop | 1738 + www.hindawi.com | terminal-bad-status | 1613 + linkinghub.elsevier.com | redirect-loop | 1612 + www.scienceopen.com | terminal-bad-status | 1580 + atm.amegroups.com | link-loop | 1571 + scielo.conicyt.cl | terminal-bad-status | 1491 + repozytorium.ur.edu.pl | redirect-loop | 1279 + agupubs.onlinelibrary.wiley.com | link-loop | 1182 + (25 rows) + diff --git a/sql/stats/README.md b/sql/stats/README.md index ea61fa0..52642f6 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -41,6 +41,10 @@ Total and unique-by-sha1 counts: SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; + Processed or not: # TODO: @@ -71,6 +75,8 @@ Requests by source: SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + Uncrawled requests by source: # TODO: verify this? -- cgit v1.2.3