diff options
Diffstat (limited to 'sql/stats')
-rw-r--r-- | sql/stats/2020-01-13_stats.txt | 190 | ||||
-rw-r--r-- | sql/stats/2020-01-31_supplement.txt | 42 | ||||
-rw-r--r-- | sql/stats/2020-02-24_stats.txt | 482 | ||||
-rw-r--r-- | sql/stats/2020-05-03_stats.txt | 418 | ||||
-rw-r--r-- | sql/stats/2020-07-23_stats.txt | 347 | ||||
-rw-r--r-- | sql/stats/2020-09-14_stats.txt | 340 | ||||
-rw-r--r-- | sql/stats/2021-04-07_stats.txt | 430 | ||||
-rw-r--r-- | sql/stats/2021-04-08_table_sizes.txt | 40 | ||||
-rw-r--r-- | sql/stats/README.md | 120 |
9 files changed, 2409 insertions, 0 deletions
diff --git a/sql/stats/2020-01-13_stats.txt b/sql/stats/2020-01-13_stats.txt new file mode 100644 index 0000000..444e448 --- /dev/null +++ b/sql/stats/2020-01-13_stats.txt @@ -0,0 +1,190 @@ + +## SQL Table Sizes + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid" | 38 GB | 7076 MB | 45 GB + "public"."file_meta" | 23 GB | 11 GB | 34 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."ingest_request" | 363 MB | 625 MB | 988 MB + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + + total_count | total_size + -------------+----------------- + 118823340 | 140917467253923 + (1 row) + + # 118,823,340 => 118 million + # 140,917,467,253,923 => ~141 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 117185567 + | 1509149 + application/octet-stream | 87783 + text/html | 9901 + application/postscript | 3781 + application/vnd.ms-powerpoint | 1421 + text/plain | 1151 + application/xml | 427 + application/gzip | 414 + application/msword | 314 + (10 rows) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96141851 | 110030179 + (1 row) + + # 96,141,851 + # 110,030,179 + +Top mimetypes (not unique by sha1): + + mimetype | count + ------------------------+---------- + application/pdf | 84582642 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 13675190 | 59919772 + + # 13,675,190 + # 59,919,772 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 57382904 + 500 | 2536862 + 503 | 6 + (3 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 41699385 + | 15683279 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + # 2,868,825 + # 2,887,834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 2816171 + pdf | arxiv | 154448 + pdf | spn | 55 + pdf | pubmed | 2 + (4 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+------- + (0 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+--------+---------- + pdf | doi | 2816171 | 289199 | 0.103 + pdf | arxiv | 154448 | 41105 | 0.266 + pdf | spn | 55 | 46 | 0.836 + pdf | pubmed | 2 | 0 | 0.000 + (4 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+---------------------+--------- + pdf | no-pdf-link | 2213720 + pdf | success | 330492 + pdf | spn-remote-error | 182157 + pdf | spn-error | 141222 + pdf | cdx-error | 83131 + pdf | link-loop | 11350 + pdf | other-mimetype | 6089 + pdf | null-body | 1980 + pdf | terminal-bad-status | 583 + pdf | wayback-error | 381 + (10 rows) + diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt new file mode 100644 index 0000000..6bd43ea --- /dev/null +++ b/sql/stats/2020-01-31_supplement.txt @@ -0,0 +1,42 @@ + +How many file_meta still missing core metadata? + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + => 1,130,915 + +Great! Not many. + +And are in petabox? + + SELECT COUNT(*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL; + => 1,149,194 + +Almost all; maybe just some CDX fetch failures or something in there. So, +should run these on, eg, grobid2-vm. + + COPY ( + SELECT row_to_json(petabox.*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL + ) TO '/grande/snapshots/dump_grobid_petabox_todo.json'; + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a +couple million files): + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | count + -------------+--------- + 5072452 | 4130405 + diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt new file mode 100644 index 0000000..e7a00e8 --- /dev/null +++ b/sql/stats/2020-02-24_stats.txt @@ -0,0 +1,482 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + Size: 271.83G + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB + "public"."grobid" | 47 GB | 7213 MB | 54 GB + "public"."file_meta" | 26 GB | 12 GB | 38 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB + "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."pdftrio" | 78 MB | 64 MB | 142 MB + (10 rows) + + +## File Metadata + +(skipping, no update) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 15,632,810 | 76,555,791 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 70656028 + 500 | 5896836 + -4 | 2295 + 503 | 111 + (4 rows) + + What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 56001631 + | 14654496 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2,868,825 | 2,887,834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591633 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + pdf | pubmed | 2 + (6 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | | 2943896 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | | 629719 + pdf | doi | fatcat-changelog | 129932 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | arxiv | fatcat-ingest | 998 + pdf | spn | | 64 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | spn | savepapernow-web | 18 + pdf | pubmed | | 2 + pdf | doi | savepapernow-web | 1 + (14 rows) + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + => UPDATE 2943896 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + => UPDATE 64 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + => UPDATE 629719 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + => UPDATE 1454 + + SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed'; + DELETE FROM ingest_request WHERE link_source = 'pubmed'; + => DELETE 2 + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591637 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + (5 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | fatcat-changelog | 3073828 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630717 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | fatcat-ingest | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | spn | savepapernow-web | 82 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 1 + (10 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + none? + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+---------+---------- + pdf | doi | 6591637 | 1622702 | 0.246 + pdf | pmc | 2030279 | 1241836 | 0.612 + pdf | arxiv | 630743 | 500620 | 0.794 + pdf | unpaywall | 1400 | 851 | 0.608 + pdf | spn | 82 | 62 | 0.756 + (5 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------------+--------- + pdf | success | 3366189 + pdf | no-pdf-link | 2902620 + pdf | no-capture | 1672025 + pdf | redirect-loop | 388844 + pdf | cdx-error | 272780 + pdf | terminal-bad-status | 171878 + pdf | spn-remote-error | 163843 + pdf | spn-error | 108070 + pdf | null-body | 66778 + pdf | link-loop | 43403 + pdf | skip-url-blocklist | 34705 + pdf | wrong-mimetype | 31343 + pdf | wayback-error | 13012 + pdf | spn2-cdx-lookup-failure | 6100 + pdf | gateway-timeout | 5633 + pdf | other-mimetype | 5114 + pdf | spn2-error:proxy-error | 538 + pdf | spn2-error:job-failed | 470 + pdf | petabox-error | 415 + pdf | spn2-error:browser-running-error | 136 + pdf | spn2-error | 127 + pdf | spn2-error:soft-time-limit-exceeded | 71 + pdf | bad-redirect | 39 + pdf | spn2-error:unknown | 30 + pdf | spn2-error:browsing-timeout | 25 + pdf | pending | 3 + pdf | invalid-host-resolution | 1 + (27 rows) + + +## Fatcat Files + +(skipping, no update) + +## Recent Success/Failure of Ingest by Domain + +NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a +small fraction of this is from changelog. + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + domain | status | count + -------------------------+----------------+------- + linkinghub.elsevier.com | no-capture | 2579 + www.mdpi.com | wrong-mimetype | 1313 + onlinelibrary.wiley.com | no-pdf-link | 785 + americanarchivist.org | no-pdf-link | 756 + journals.sagepub.com | redirect-loop | 503 + link.springer.com | redirect-loop | 432 + iopscience.iop.org | no-capture | 392 + www.tandfonline.com | no-pdf-link | 389 + pubs.rsc.org | no-capture | 361 + www.persee.fr | no-capture | 344 + (10 rows) + + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+------- + pdf | success | 40578 + pdf | cdx-error | 14982 + pdf | no-capture | 7747 + pdf | no-pdf-link | 7111 + pdf | redirect-loop | 3265 + pdf | wrong-mimetype | 1629 + pdf | spn2-cdx-lookup-failure | 657 + pdf | link-loop | 538 + pdf | null-body | 517 + pdf | terminal-bad-status | 400 + pdf | wayback-error | 79 + pdf | spn2-error:job-failed | 53 + pdf | gateway-timeout | 38 + pdf | spn2-error:soft-time-limit-exceeded | 7 + pdf | spn2-error | 6 + pdf | petabox-error | 5 + pdf | spn2-error:browsing-timeout | 4 + pdf | spn2-error:unknown | 2 + pdf | bad-redirect | 1 + pdf | pending | 1 + (20 rows) + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + + ingest_type | date | total | success + -------------+------------+---------+--------- + pdf | 2020-02-25 | 32660 | 14322 + pdf | 2020-02-24 | 44967 | 26263 + pdf | 2020-02-23 | 58795 | 18874 + pdf | 2020-02-22 | 844249 | 272606 + pdf | 2020-02-21 | 1287378 | 433487 + pdf | 2020-02-20 | 1455943 | 492408 + pdf | 2020-02-19 | 21453 | 7529 + pdf | 2020-02-18 | 5863 | 2926 + pdf | 2020-02-17 | 3737 | 970 + pdf | 2020-02-16 | 13779 | 4862 + pdf | 2020-02-15 | 1021020 | 623020 + pdf | 2020-02-14 | 1036036 | 632830 + pdf | 2020-02-13 | 13503 | 5824 + pdf | 2020-02-12 | 20078 | 11422 + pdf | 2020-02-11 | 13499 | 6781 + pdf | 2020-02-10 | 2275 | 961 + pdf | 2020-02-09 | 3231 | 1494 + pdf | 2020-02-08 | 8967 | 4400 + pdf | 2020-02-07 | 7022 | 2430 + pdf | 2020-02-06 | 1291 | 516 + pdf | 2020-02-05 | 8586 | 6596 + pdf | 2020-02-04 | 3681 | 3593 + pdf | 2020-02-03 | 284 | 284 + pdf | 2020-02-02 | 480 | 480 + pdf | 2020-02-01 | 489 | 336 + pdf | 2020-01-31 | 1187 | 1130 + pdf | 2020-01-30 | 1613 | 1288 + pdf | 2020-01-29 | 947 | 279 + pdf | 2020-01-28 | 667 | 323 + (29 rows) + +Top "no-capture" domains (will need to re-ingest using live tool): + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + --------------------------+------------+-------- + linkinghub.elsevier.com | no-capture | 320065 + iopscience.iop.org | no-capture | 46858 + pubs.rsc.org | no-capture | 43331 + www.persee.fr | no-capture | 38971 + www.doiserbia.nb.rs | no-capture | 27112 + academic.oup.com | no-capture | 18877 + www.osapublishing.org | no-capture | 17113 + osf.io | no-capture | 16978 + scripts.iucr.org | no-capture | 14844 + www.degruyter.com | no-capture | 8093 + mab-online.nl | no-capture | 6603 + insights.ovid.com | no-capture | 6457 + ir.lib.uth.gr | no-capture | 3625 + www.sciencedirect.com | no-capture | 3244 + www.tandfonline.com | no-capture | 3201 + www.ccsenet.org | no-capture | 2849 + www.intechopen.com | no-capture | 2813 + primary-hospital-care.ch | no-capture | 2774 + www.nature.com | no-capture | 2484 + www.indianjournals.com | no-capture | 2432 + journals.aps.org | no-capture | 2197 + journals.sagepub.com | no-capture | 2064 + www.episodes.org | no-capture | 1805 + periodicos.uninove.br | no-capture | 1692 + escholarship.org | no-capture | 1666 + (25 rows) + +Top "no-pdf-link" domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-pdf-link' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -----------------------------+-------------+-------- + plutof.ut.ee | no-pdf-link | 685315 + www.gbif.org | no-pdf-link | 670647 + doi.pangaea.de | no-pdf-link | 301984 + www.plate-archive.org | no-pdf-link | 209218 + onlinelibrary.wiley.com | no-pdf-link | 84890 + figshare.com | no-pdf-link | 72892 + zenodo.org | no-pdf-link | 45768 + www.tandfonline.com | no-pdf-link | 43848 + data.mendeley.com | no-pdf-link | 42367 + springernature.figshare.com | no-pdf-link | 35941 + dhz.uni-passau.de | no-pdf-link | 29187 + www.frontiersin.org | no-pdf-link | 17925 + digital.ucd.ie | no-pdf-link | 16769 + mr.crossref.org | no-pdf-link | 14999 + journals.lww.com | no-pdf-link | 12122 + musewide.aip.de | no-pdf-link | 10854 + datadryad.org | no-pdf-link | 10686 + www.jstor.org | no-pdf-link | 9159 + koreascience.or.kr | no-pdf-link | 9067 + easy.dans.knaw.nl | no-pdf-link | 8264 + scielo.conicyt.cl | no-pdf-link | 8069 + www.degruyter.com | no-pdf-link | 7989 + www.kci.go.kr | no-pdf-link | 6990 + www.m-hikari.com | no-pdf-link | 6941 + cshprotocols.cshlp.org | no-pdf-link | 6553 + (25 rows) + +Top block-ish domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status') + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + ---------------------------------+---------------------+------- + journals.openedition.org | redirect-loop | 30395 + ieeexplore.ieee.org | redirect-loop | 28926 + www.degruyter.com | redirect-loop | 18891 + www.cairn.info | link-loop | 8919 + www.frontiersin.org | terminal-bad-status | 6786 + projecteuclid.org | link-loop | 6098 + www.mdpi.com | terminal-bad-status | 5189 + medicalforum.ch | terminal-bad-status | 4596 + jrnl.nau.edu.ua | link-loop | 4238 + www.revistas.unam.mx | link-loop | 3926 + journals.aps.org | redirect-loop | 3696 + www.ijcseonline.org | redirect-loop | 3567 + www.researchsquare.com | terminal-bad-status | 3453 + www.persee.fr | terminal-bad-status | 3221 + www.baltistica.lt | link-loop | 2098 + osf.io | redirect-loop | 2004 + seer.ufrgs.br | terminal-bad-status | 2002 + jtd.amegroups.com | link-loop | 1738 + www.hindawi.com | terminal-bad-status | 1613 + linkinghub.elsevier.com | redirect-loop | 1612 + www.scienceopen.com | terminal-bad-status | 1580 + atm.amegroups.com | link-loop | 1571 + scielo.conicyt.cl | terminal-bad-status | 1491 + repozytorium.ur.edu.pl | redirect-loop | 1279 + agupubs.onlinelibrary.wiley.com | link-loop | 1182 + (25 rows) + diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt new file mode 100644 index 0000000..55f0c1e --- /dev/null +++ b/sql/stats/2020-05-03_stats.txt @@ -0,0 +1,418 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 41 GB | 82 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 59 GB | 7604 MB | 66 GB + "public"."file_meta" | 31 GB | 28 GB | 59 GB + "public"."ingest_request" | 19 GB | 20 GB | 39 GB + "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (10 rows) + + Size: 383.93G + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 158059828 | 197346217653010 + (1 row) + + => 158 million, 197 terabytes + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 157805029 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6040 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1027125 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + + unique_sha1 | total + -------------+----------- + 92936564 | 111022039 + (1 row) + + => 110 million rows, 92.9 million files + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------------------------------------+----------- + application/pdf | 104178718 + warc/revisit | 5274410 + text/xml | 519042 + text/html | 295523 + application/octet-stream | 259681 + unk | 138930 + application/postscript | 81065 + application/save | 80765 + binary/octet-stream | 59804 + application/x-download | 27083 + text/plain | 26938 + application/download | 25125 + image/pdf | 16095 + application/force-download | 9004 + application/x-msdownload | 3711 + application | 2934 + application/x-octetstream | 2926 + multipart/form-data | 2741 + application/x-pdf | 2444 + .pdf | 2368 + application/binary | 1268 + application/pdf' | 1192 + pdf | 1113 + file/unknown | 1086 + application/unknown | 761 + file | 753 + application/blob | 670 + application/octetstream | 657 + text/pdf | 549 + 0 | 417 + ('application/pdf', | 349 + application/http;msgtype=response | 251 + application/doc | 180 + [...] (wasn't LIMIT 25) + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + + unique_releases | total + -----------------+---------- + 17455441 | 92707544 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057023 + | 14638425 + (2 rows) + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057074 + | 3 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | unpaywall | 26244088 + pdf | mag | 25596658 + pdf | doi | 15652966 + pdf | pmc | 2043646 + pdf | arxiv | 721902 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 103 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | unpaywall | unpaywall | 26244088 + pdf | mag | mag-corpus | 25596658 + pdf | doi | fatcat-ingest | 8267308 + pdf | doi | fatcat-changelog | 3869772 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630719 + pdf | arxiv | fatcat-changelog | 91157 + pdf | pmc | fatcat-ingest | 10195 + pdf | pmc | fatcat-changelog | 4626 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 103 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 15 + (15 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+------- + pdf | mag | 47 + pdf | unpaywall | 1 + (2 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | unpaywall | 26244088 | 19968092 | 0.761 + pdf | mag | 25596658 | 18712912 | 0.731 + pdf | doi | 15653166 | 2878833 | 0.184 + pdf | pmc | 2043646 | 1279529 | 0.626 + pdf | arxiv | 721902 | 592394 | 0.821 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 103 | 82 | 0.796 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 37449502 + pdf | no-pdf-link | 10908442 + pdf | no-capture | 5643670 + pdf | redirect-loop | 4823502 + pdf | terminal-bad-status | 1715056 + pdf | link-loop | 1425072 + pdf | cdx-error | 535365 + pdf | gateway-timeout | 267654 + pdf | skip-url-blocklist | 220433 + pdf | wrong-mimetype | 189804 + pdf | spn2-cdx-lookup-failure | 103926 + pdf | spn-error | 101777 + pdf | wayback-error | 93517 + pdf | null-body | 87279 + pdf | invalid-host-resolution | 35305 + pdf | spn-remote-error | 28888 + pdf | petabox-error | 12406 + pdf | spn2-error | 2905 + pdf | spn2-error:job-failed | 2307 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 745 + pdf | spn2-error:proxy-error | 438 + pdf | spn2-error:invalid-url-syntax | 406 + pdf | spn2-error:soft-time-limit-exceeded | 405 + pdf | spn2-error:browser-running-error | 274 + (25 rows) + +Failures by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | domain | status | count + -------------+---------------------------------------+---------------------+-------- + pdf | ssl.fao.org | no-pdf-link | 862277 + pdf | www.researchgate.net | redirect-loop | 749094 + pdf | www.e-periodica.ch | no-pdf-link | 747370 + pdf | ieeexplore.ieee.org | redirect-loop | 707482 + pdf | plutof.ut.ee | no-pdf-link | 685341 + pdf | www.gbif.org | no-pdf-link | 670905 + pdf | dlc.library.columbia.edu | no-pdf-link | 508281 + pdf | figshare.com | no-pdf-link | 400501 + pdf | onlinelibrary.wiley.com | no-pdf-link | 399187 + pdf | watermark.silverchair.com | terminal-bad-status | 357188 + pdf | www.die-bonn.de | redirect-loop | 352903 + pdf | academic.oup.com | no-pdf-link | 346828 + pdf | iopscience.iop.org | terminal-bad-status | 345147 + pdf | linkinghub.elsevier.com | no-capture | 328434 + pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206 + pdf | cyberleninka.ru | link-loop | 309525 + pdf | www.tandfonline.com | no-pdf-link | 309146 + pdf | dialnet.unirioja.es | terminal-bad-status | 307572 + pdf | doi.pangaea.de | no-pdf-link | 304924 + pdf | journals.sagepub.com | no-pdf-link | 285774 + pdf | papers.ssrn.com | link-loop | 282415 + pdf | dialnet.unirioja.es | redirect-loop | 274476 + pdf | ieeexplore.ieee.org | link-loop | 273607 + pdf | catalog.paradisec.org.au | redirect-loop | 234653 + pdf | www.plate-archive.org | no-pdf-link | 209217 + pdf | zenodo.org | no-pdf-link | 200078 + pdf | zenodo.org | no-capture | 199025 + pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084 + pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039 + pdf | validate.perfdrive.com | no-pdf-link | 180191 + (30 rows) + +Success by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | domain | status | count + -------------+----------------------------+---------+--------- + pdf | www.jstage.jst.go.jp | success | 2244620 + pdf | europepmc.org | success | 1284770 + pdf | link.springer.com | success | 1017998 + pdf | www.scielo.br | success | 799577 + pdf | arxiv.org | success | 592622 + pdf | downloads.hindawi.com | success | 527278 + pdf | res.mdpi.com | success | 501093 + pdf | hal.archives-ouvertes.fr | success | 447877 + pdf | digital.library.unt.edu | success | 404460 + pdf | www.cambridge.org | success | 394666 + pdf | dergipark.org.tr | success | 373706 + pdf | journals.plos.org | success | 296994 + pdf | watermark.silverchair.com | success | 275562 + pdf | www.nature.com | success | 263836 + pdf | cds.cern.ch | success | 223057 + pdf | www.pnas.org | success | 220488 + pdf | s3-eu-west-1.amazonaws.com | success | 214558 + pdf | www.jbc.org | success | 205277 + pdf | www.redalyc.org | success | 193591 + pdf | iopscience.iop.org | success | 175796 + pdf | apps.dtic.mil | success | 170589 + pdf | zenodo.org | success | 167812 + pdf | peerj.com | success | 155620 + pdf | www.biorxiv.org | success | 149337 + pdf | 210.101.116.28 | success | 145706 + pdf | www.teses.usp.br | success | 145438 + pdf | absimage.aps.org | success | 144400 + pdf | hrcak.srce.hr | success | 134669 + pdf | www.erudit.org | success | 131771 + pdf | babel.hathitrust.org | success | 130645 + (30 rows) + + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + => NOT RUN, fatcat_file table is way out of date + diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt new file mode 100644 index 0000000..d1993fc --- /dev/null +++ b/sql/stats/2020-07-23_stats.txt @@ -0,0 +1,347 @@ + +Summary: + +- very many more PDFs have been grobid-ed vs. pdf_meta-ed +- about 1 million file_meta still have partial metadata (eg, no sha256) +- database size still under 0.5 TByte +- there are about a million CDX error ingest requests, and hundreds of + thousands of SPN errors which could be re-run + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 42 GB | 84 GB + "public"."ingest_request" | 34 GB | 39 GB | 73 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 61 GB | 7742 MB | 69 GB + "public"."file_meta" | 32 GB | 29 GB | 61 GB + "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + Size: 466.91G + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 161944425 | 204,402,677,360,189 + (1 row) + + # 161.9 mil; 204 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 161691608 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1015337 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96537611 | 116281981 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------+----------- + application/pdf | 108706978 + warc/revisit | 5912013 + text/xml | 519042 + application/octet-stream | 307782 + text/html | 295634 + unk | 156937 + application/postscript | 81079 + application/save | 80871 + binary/octet-stream | 61263 + text/plain | 31495 + application/x-download | 30511 + application/download | 26716 + image/pdf | 26357 + application/force-download | 10541 + multipart/form-data | 5551 + application/x-msdownload | 3724 + application/x-octetstream | 3216 + application | 3171 + .pdf | 2728 + application/x-pdf | 2563 + application/binary | 1306 + application/pdf' | 1192 + pdf | 1180 + [...] + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + + total_files | unique_releases + -------------+----------------- + 95557413 | 18020570 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + + status_code | count + -------------+---------- + 200 | 88450610 + 500 | 7101098 + -4 | 4133 + 503 | 110 + + SELECT status, COUNT(*) FROM grobid GROUP BY ORDER BY COUNT DESC LIMIT 10; + + status | count + ----------------+---------- + success | 73814297 + | 14638412 + error | 7101308 + error-timeout | 4133 + bad-grobid-xml | 6 + (5 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 73813427 + | 14638425 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 16589669 + pdf | pmc | 2231113 + pdf | arxiv | 794693 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 148 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8320832 + pdf | doi | fatcat-changelog | 4752956 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630750 + pdf | pmc | fatcat-ingest | 194781 + pdf | arxiv | fatcat-changelog | 163924 + pdf | pmc | fatcat-changelog | 7507 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 148 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 19 + pdf | arxiv | savepapernow-web | 2 + +Uncrawled requests by source: + + # TODO: verify this? seems wrong + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | mag | 4097008 + pdf | oai | 15287 + pdf | unpaywall | 1 + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 5346057 | 0.104 + pdf | mag | 35015357 | 22199271 | 0.634 + pdf | unpaywall | 27653003 | 22067338 | 0.798 + pdf | doi | 16589700 | 3207661 | 0.193 + pdf | pmc | 2231113 | 1696976 | 0.761 + pdf | arxiv | 794727 | 645607 | 0.812 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 148 | 114 | 0.770 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 46465271 + pdf | no-capture | 46115869 + pdf | no-pdf-link | 13877460 + pdf | redirect-loop | 5943956 + pdf | terminal-bad-status | 1962754 + pdf | link-loop | 1630078 + pdf | cdx-error | 1014409 + pdf | gateway-timeout | 459340 + pdf | wrong-mimetype | 321774 + pdf | skip-url-blocklist | 220629 + pdf | wayback-error | 220453 + pdf | spn2-cdx-lookup-failure | 143963 + pdf | null-body | 113384 + pdf | spn-error | 101773 + pdf | invalid-host-resolution | 37367 + pdf | spn-remote-error | 28886 + pdf | petabox-error | 22997 + pdf | spn2-error | 16342 + pdf | spn2-error:job-failed | 5017 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 746 + pdf | spn2-error:soft-time-limit-exceeded | 632 + pdf | spn2-error:proxy-error | 437 + pdf | spn2-error:invalid-url-syntax | 417 + pdf | timeout | 417 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + + total_count | release_count + -------------+--------------- + 5862666 | 4728824 + (1 row) + +## PDF Meta + +Total rows: + + SELECT COUNT(*) as total_count FROM pdf_meta; + + + total_count + ------------- + 21961874 + +By status: + + SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC; + + status | count + ----------------+---------- + success | 21788507 + parse-error | 78196 + text-too-large | 60595 + not-pdf | 31679 + error-wayback | 2639 + bad-unicode | 251 + bad-pdf | 6 + empty-blob | 1 + (8 rows) + diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt new file mode 100644 index 0000000..3bc27b0 --- /dev/null +++ b/sql/stats/2020-09-14_stats.txt @@ -0,0 +1,340 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 44 GB | 45 GB | 89 GB + "public"."grobid" | 66 GB | 8127 MB | 74 GB + "public"."ingest_request" | 34 GB | 40 GB | 73 GB + "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."file_meta" | 33 GB | 30 GB | 63 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 167021210 | 221982345333674 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 166765214 + application/octet-stream | 155517 + application/xml | 42170 + text/html | 18708 + text/plain | 15990 + application/gzip | 6491 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62960 + (1 row) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 102123051 | 126550160 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ----------------------------+----------- + application/pdf | 116885565 + warc/revisit | 7951816 + text/xml | 519042 + application/octet-stream | 327639 + text/html | 295725 + unk | 172491 + application/postscript | 81095 + application/save | 80900 + binary/octet-stream | 61783 + text/plain | 33684 + image/pdf | 32856 + application/x-download | 32418 + application/download | 27672 + application/force-download | 10892 + multipart/form-data | 5750 + application/x-msdownload | 3832 + application/x-octetstream | 3516 + application | 3499 + .pdf | 3038 + application/x-pdf | 2701 + application/binary | 1322 + pdf | 1232 + file/unknown | 1199 + application/pdf' | 1192 + file | 979 + (25 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 101494314 | 18919012 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 93730358 + 500 | 7759103 + -4 | 4683 + 503 | 150 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 80838234 + | 12892145 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; +unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 17362763 + pdf | pmc | 2248854 + pdf | arxiv | 835400 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 197 + (9 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8399261 + pdf | doi | fatcat-changelog | 5449349 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 634665 + pdf | pmc | fatcat-ingest | 210453 + pdf | arxiv | fatcat-changelog | 200707 + pdf | pmc | fatcat-changelog | 9582 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 197 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 21 + pdf | arxiv | savepapernow-web | 2 + (17 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 170304 + pdf | oai | 15287 + pdf | unpaywall | 1 + (3 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14144314 | 0.276 + pdf | mag | 35015357 | 24811947 | 0.709 + pdf | unpaywall | 27653003 | 22302629 | 0.807 + pdf | doi | 17363369 | 3533568 | 0.204 + pdf | pmc | 2248860 | 1713197 | 0.762 + pdf | arxiv | 835400 | 685219 | 0.820 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 197 | 138 | 0.701 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 58265365 + pdf | no-pdf-link | 27216435 + pdf | no-capture | 21982611 + pdf | redirect-loop | 8457469 + pdf | terminal-bad-status | 2695023 + pdf | link-loop | 2209672 + pdf | wrong-mimetype | 767508 + pdf | gateway-timeout | 548870 + pdf | cdx-error | 391611 + pdf | skip-url-blocklist | 220661 + pdf | null-body | 182215 + pdf | wayback-error | 146869 + pdf | spn2-cdx-lookup-failure | 107229 + pdf | spn-error | 85128 + pdf | invalid-host-resolution | 37352 + pdf | petabox-error | 32490 + pdf | spn2-error | 29212 + pdf | spn-remote-error | 27927 + pdf | other-mimetype | 2305 + pdf | bad-redirect | 1524 + pdf | spn2-error:job-failed | 1521 + pdf | timeout | 842 + pdf | spn2-error:soft-time-limit-exceeded | 793 + pdf | redirects-exceeded | 748 + pdf | spn2-error:invalid-url-syntax | 417 + (25 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) + FROM ingest_file_result + WHERE hit = false + GROUP BY ingest_type, terminal_status_code + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 34064937 + pdf | | 20514531 + pdf | 301 | 7271700 + pdf | 302 | 720632 + pdf | 503 | 712697 + pdf | 400 | 444209 + pdf | 404 | 331495 + pdf | 403 | 323030 + pdf | 401 | 259327 + pdf | 500 | 236122 + pdf | 303 | 101609 + pdf | 429 | 47738 + pdf | 502 | 36183 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14790 + pdf | 999 | 8996 + pdf | 307 | 3769 + pdf | 308 | 3422 + pdf | 202 | 3228 + pdf | 520 | 2058 + pdf | 410 | 1734 + pdf | 521 | 1033 + pdf | 504 | 868 + pdf | 505 | 424 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 6600758 | 5213294 + (1 row) + diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt new file mode 100644 index 0000000..fca76b9 --- /dev/null +++ b/sql/stats/2021-04-07_stats.txt @@ -0,0 +1,430 @@ + +## SQL Table Sizes + + Size: 551.34G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 50 GB | 100 GB + "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB + "public"."ingest_request" | 39 GB | 45 GB | 83 GB + "public"."grobid" | 70 GB | 8613 MB | 78 GB + "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB + "public"."file_meta" | 35 GB | 31 GB | 66 GB + "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB + "public"."shadow" | 9517 MB | 10 GB | 20 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."html_meta" | 1172 MB | 10 MB | 1182 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (12 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 174200807 | 234313766162033 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 173816433 + application/octet-stream | 155534 + text/html | 115821 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 721 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 + application/x-compress | 272 + application/zip | 131 + application/CDFV2-unknown | 99 + image/png | 88 + application/mac-binhex40 | 79 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + application/vnd.ms-excel | 28 + message/rfc822 | 26 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62271 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 113880640 | 141793694 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 131346703 + warc/revisit | 8394443 + text/xml | 525481 + application/octet-stream | 502400 + text/html | 417579 + unk | 186703 + application/postscript | 81095 + application/save | 80915 + binary/octet-stream | 66698 + application/x-download | 35771 + text/plain | 35606 + image/pdf | 33904 + application/download | 29701 + application/force-download | 16726 + multipart/form-data | 6878 + application/x-msdownload | 3843 + application | 3724 + application/x-octetstream | 3550 + .pdf | 3138 + application/x-pdf | 2780 + application/binary | 1332 + pdf | 1247 + file/unknown | 1200 + application/pdf' | 1192 + file | 1108 + application/unknown | 978 + application/octetstream | 856 + application/blob | 673 + text/pdf | 672 + 0 | 546 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 105594307 | 19594878 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+---------- + 200 | 97714631 + 500 | 7875192 + -4 | 4772 + 503 | 520 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 84822508 + | 12892147 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 31772942 + pdf | doi | 23528817 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2277417 + pdf | arxiv | 2143549 + xml | doaj | 9442 + html | doi | 3022 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 469 + html | spn | 9 + (14 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 31772942 + pdf | doi | fatcat-changelog | 11010764 + pdf | doi | fatcat-ingest | 9002119 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767703 + pdf | arxiv | fatcat-changelog | 375818 + pdf | pmc | fatcat-ingest | 211264 + pdf | pmc | fatcat-changelog | 37328 + xml | doaj | doaj | 9442 + html | doi | fatcat-ingest | 3018 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 469 + pdf | doi | savepapernow-web | 74 + pdf | arxiv | fatcat-ingest-container | 26 + html | spn | savepapernow-web | 9 + html | doi | savepapernow-web | 4 + pdf | arxiv | savepapernow-web | 2 + (23 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 168462 + pdf | oai | 15286 + pdf | doaj | 2068 + html | doaj | 620 + pdf | unpaywall | 13 + (5 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14163500 | 0.277 + pdf | mag | 35015357 | 24818176 | 0.709 + pdf | unpaywall | 31772942 | 25018501 | 0.787 + pdf | doi | 23529041 | 5773728 | 0.245 + pdf | doaj | 4264610 | 2851328 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2277417 | 1736491 | 0.762 + pdf | arxiv | 2143549 | 2011378 | 0.938 + xml | doaj | 9442 | 6897 | 0.730 + html | doi | 3022 | 957 | 0.317 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 469 | 328 | 0.699 + html | spn | 9 | 2 | 0.222 + (14 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+--------------------------------+---------- + pdf | success | 66487928 + pdf | no-pdf-link | 29279677 + pdf | no-capture | 22765431 + pdf | redirect-loop | 9155767 + pdf | terminal-bad-status | 3549665 + pdf | link-loop | 2592983 + html | wrong-scope | 1088793 + pdf | wrong-mimetype | 792563 + pdf | gateway-timeout | 478181 + html | no-capture | 423917 + pdf | wayback-content-error | 355828 + pdf | cdx-error | 343862 + pdf | null-body | 328774 + pdf | forbidden | 286647 + pdf | spn2-cdx-lookup-failure | 276769 + pdf | spn2-wayback-error | 276080 + pdf | skip-url-blocklist | 265473 + html | redirect-loop | 212916 + pdf | not-found | 204367 + html | unknown-scope | 204112 + html | html-resource-no-capture | 166034 + pdf | blocked-cookie | 160336 + pdf | too-many-redirects | 152984 + html | success | 123896 + pdf | wayback-error | 114388 + html | null-body | 100296 + pdf | spn2-error:too-many-redirects | 58336 + html | wayback-content-error | 53926 + pdf | invalid-host-resolution | 37226 + pdf | petabox-error | 37177 + pdf | remote-server-error | 36439 + pdf | spn2-error | 27556 + pdf | spn2-error:proxy-error | 25486 + pdf | read-timeout | 20745 + html | wrong-mimetype | 18928 + html | terminal-bad-status | 14059 + html | petabox-error | 13533 + pdf | bad-redirect | 7535 + xml | success | 6897 + html | cdx-error | 6823 + pdf | spn2-error:bad-request | 4664 + pdf | spn2-error:unauthorized | 4391 + pdf | spn-remote-error | 4206 + pdf | spn2-error:service-unavailable | 2614 + pdf | spn2-error:job-failed | 2562 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1905 + html | spn2-cdx-lookup-failure | 1018 + pdf | redirects-exceeded | 1015 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36515867 + pdf | | 22909334 + pdf | 301 | 7969702 + html | 200 | 1653303 + pdf | 503 | 928507 + pdf | 403 | 823755 + pdf | 302 | 792842 + pdf | 400 | 462108 + html | | 426474 + pdf | 404 | 422163 + pdf | 401 | 270611 + pdf | 500 | 248675 + html | 301 | 211713 + pdf | 303 | 109686 + pdf | 410 | 50648 + pdf | 502 | 37663 + pdf | 429 | 31982 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14835 + html | 404 | 9573 + pdf | 999 | 9296 + pdf | 307 | 3972 + pdf | 308 | 3914 + html | 500 | 3625 + pdf | 202 | 3515 + xml | 200 | 2537 + pdf | 520 | 2072 + pdf | 206 | 1665 + pdf | 521 | 1075 + html | 302 | 1072 + pdf | 504 | 1000 + pdf | 412 | 476 + pdf | 300 | 434 + pdf | 505 | 429 + pdf | 406 | 393 + html | 403 | 382 + html | 503 | 378 + pdf | 421 | 298 + html | 303 | 268 + pdf | 508 | 195 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 408 | 50 + pdf | 204 | 34 + pdf | 416 | 29 + pdf | 501 | 29 + pdf | 530 | 27 + pdf | 507 | 21 + (50 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 8514315 | 6401104 + (1 row) diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt new file mode 100644 index 0000000..a8a9cd5 --- /dev/null +++ b/sql/stats/2021-04-08_table_sizes.txt @@ -0,0 +1,40 @@ + +## SQL Table Sizes + + Size: 467.23G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 26 GB | 76 GB + "public"."grobid" | 69 GB | 6834 MB | 75 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."ingest_request" | 39 GB | 32 GB | 70 GB + "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB + "public"."file_meta" | 32 GB | 21 GB | 53 GB + "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + (12 rows) + diff --git a/sql/stats/README.md b/sql/stats/README.md new file mode 100644 index 0000000..62e213c --- /dev/null +++ b/sql/stats/README.md @@ -0,0 +1,120 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + |