aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats
diff options
context:
space:
mode:
Diffstat (limited to 'sql/stats')
-rw-r--r--sql/stats/2020-01-13_stats.txt190
-rw-r--r--sql/stats/2020-01-31_supplement.txt42
-rw-r--r--sql/stats/2020-02-24_stats.txt482
-rw-r--r--sql/stats/2020-05-03_stats.txt418
-rw-r--r--sql/stats/2020-07-23_stats.txt347
-rw-r--r--sql/stats/2020-09-14_stats.txt340
-rw-r--r--sql/stats/2021-04-07_stats.txt430
-rw-r--r--sql/stats/2021-04-08_table_sizes.txt40
-rw-r--r--sql/stats/README.md120
9 files changed, 2409 insertions, 0 deletions
diff --git a/sql/stats/2020-01-13_stats.txt b/sql/stats/2020-01-13_stats.txt
new file mode 100644
index 0000000..444e448
--- /dev/null
+++ b/sql/stats/2020-01-13_stats.txt
@@ -0,0 +1,190 @@
+
+## SQL Table Sizes
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid" | 38 GB | 7076 MB | 45 GB
+ "public"."file_meta" | 23 GB | 11 GB | 34 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."ingest_request" | 363 MB | 625 MB | 988 MB
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+
+ total_count | total_size
+ -------------+-----------------
+ 118823340 | 140917467253923
+ (1 row)
+
+ # 118,823,340 => 118 million
+ # 140,917,467,253,923 => ~141 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 117185567
+ | 1509149
+ application/octet-stream | 87783
+ text/html | 9901
+ application/postscript | 3781
+ application/vnd.ms-powerpoint | 1421
+ text/plain | 1151
+ application/xml | 427
+ application/gzip | 414
+ application/msword | 314
+ (10 rows)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 96141851 | 110030179
+ (1 row)
+
+ # 96,141,851
+ # 110,030,179
+
+Top mimetypes (not unique by sha1):
+
+ mimetype | count
+ ------------------------+----------
+ application/pdf | 84582642
+ text/html | 24841846
+ text/xml | 524682
+ application/postscript | 81009
+ (4 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 13675190 | 59919772
+
+ # 13,675,190
+ # 59,919,772
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 57382904
+ 500 | 2536862
+ 503 | 6
+ (3 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 41699385
+ | 15683279
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+ # 2,868,825
+ # 2,887,834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 2816171
+ pdf | arxiv | 154448
+ pdf | spn | 55
+ pdf | pubmed | 2
+ (4 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ (0 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+--------+----------
+ pdf | doi | 2816171 | 289199 | 0.103
+ pdf | arxiv | 154448 | 41105 | 0.266
+ pdf | spn | 55 | 46 | 0.836
+ pdf | pubmed | 2 | 0 | 0.000
+ (4 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+---------------------+---------
+ pdf | no-pdf-link | 2213720
+ pdf | success | 330492
+ pdf | spn-remote-error | 182157
+ pdf | spn-error | 141222
+ pdf | cdx-error | 83131
+ pdf | link-loop | 11350
+ pdf | other-mimetype | 6089
+ pdf | null-body | 1980
+ pdf | terminal-bad-status | 583
+ pdf | wayback-error | 381
+ (10 rows)
+
diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt
new file mode 100644
index 0000000..6bd43ea
--- /dev/null
+++ b/sql/stats/2020-01-31_supplement.txt
@@ -0,0 +1,42 @@
+
+How many file_meta still missing core metadata?
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+ => 1,130,915
+
+Great! Not many.
+
+And are in petabox?
+
+ SELECT COUNT(*)
+ FROM file_meta
+ LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex
+ WHERE file_meta.sha256hex IS NULL
+ AND file_meta.sha1hex IS NOT NULL;
+ => 1,149,194
+
+Almost all; maybe just some CDX fetch failures or something in there. So,
+should run these on, eg, grobid2-vm.
+
+ COPY (
+ SELECT row_to_json(petabox.*)
+ FROM file_meta
+ LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex
+ WHERE file_meta.sha256hex IS NULL
+ AND file_meta.sha1hex IS NOT NULL
+ ) TO '/grande/snapshots/dump_grobid_petabox_todo.json';
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a
+couple million files):
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | count
+ -------------+---------
+ 5072452 | 4130405
+
diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt
new file mode 100644
index 0000000..e7a00e8
--- /dev/null
+++ b/sql/stats/2020-02-24_stats.txt
@@ -0,0 +1,482 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ Size: 271.83G
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 36 GB | 78 GB
+ "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB
+ "public"."grobid" | 47 GB | 7213 MB | 54 GB
+ "public"."file_meta" | 26 GB | 12 GB | 38 GB
+ "public"."shadow" | 8303 MB | 9216 MB | 17 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB
+ "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ "public"."pdftrio" | 78 MB | 64 MB | 142 MB
+ (10 rows)
+
+
+## File Metadata
+
+(skipping, no update)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+ unique_releases | total
+ -----------------+----------
+ 15,632,810 | 76,555,791
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 70656028
+ 500 | 5896836
+ -4 | 2295
+ 503 | 111
+ (4 rows)
+
+ What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 56001631
+ | 14654496
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2,868,825 | 2,887,834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591633
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ pdf | pubmed | 2
+ (6 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | | 2943896
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | | 629719
+ pdf | doi | fatcat-changelog | 129932
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | arxiv | fatcat-ingest | 998
+ pdf | spn | | 64
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | spn | savepapernow-web | 18
+ pdf | pubmed | | 2
+ pdf | doi | savepapernow-web | 1
+ (14 rows)
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL;
+ => UPDATE 2943896
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL;
+ => UPDATE 64
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL;
+ => UPDATE 629719
+
+ SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL;
+ => UPDATE 1454
+
+ SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed';
+ DELETE FROM ingest_request WHERE link_source = 'pubmed';
+ => DELETE 2
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doi | 6591637
+ pdf | pmc | 2030279
+ pdf | arxiv | 630743
+ pdf | unpaywall | 1400
+ pdf | spn | 82
+ (5 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-------------+-------------------------+---------
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | doi | fatcat-changelog | 3073828
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630717
+ pdf | doi | fatcat-ingest | 1935
+ pdf | pmc | fatcat-ingest | 1454
+ pdf | unpaywall | unpaywall | 1400
+ pdf | spn | savepapernow-web | 82
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 1
+ (10 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ none?
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-------------+----------+---------+----------
+ pdf | doi | 6591637 | 1622702 | 0.246
+ pdf | pmc | 2030279 | 1241836 | 0.612
+ pdf | arxiv | 630743 | 500620 | 0.794
+ pdf | unpaywall | 1400 | 851 | 0.608
+ pdf | spn | 82 | 62 | 0.756
+ (5 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+---------
+ pdf | success | 3366189
+ pdf | no-pdf-link | 2902620
+ pdf | no-capture | 1672025
+ pdf | redirect-loop | 388844
+ pdf | cdx-error | 272780
+ pdf | terminal-bad-status | 171878
+ pdf | spn-remote-error | 163843
+ pdf | spn-error | 108070
+ pdf | null-body | 66778
+ pdf | link-loop | 43403
+ pdf | skip-url-blocklist | 34705
+ pdf | wrong-mimetype | 31343
+ pdf | wayback-error | 13012
+ pdf | spn2-cdx-lookup-failure | 6100
+ pdf | gateway-timeout | 5633
+ pdf | other-mimetype | 5114
+ pdf | spn2-error:proxy-error | 538
+ pdf | spn2-error:job-failed | 470
+ pdf | petabox-error | 415
+ pdf | spn2-error:browser-running-error | 136
+ pdf | spn2-error | 127
+ pdf | spn2-error:soft-time-limit-exceeded | 71
+ pdf | bad-redirect | 39
+ pdf | spn2-error:unknown | 30
+ pdf | spn2-error:browsing-timeout | 25
+ pdf | pending | 3
+ pdf | invalid-host-resolution | 1
+ (27 rows)
+
+
+## Fatcat Files
+
+(skipping, no update)
+
+## Recent Success/Failure of Ingest by Domain
+
+NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a
+small fraction of this is from changelog.
+
+ # "problem domains" and statuses
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ AND t1.updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 10;
+
+ domain | status | count
+ -------------------------+----------------+-------
+ linkinghub.elsevier.com | no-capture | 2579
+ www.mdpi.com | wrong-mimetype | 1313
+ onlinelibrary.wiley.com | no-pdf-link | 785
+ americanarchivist.org | no-pdf-link | 756
+ journals.sagepub.com | redirect-loop | 503
+ link.springer.com | redirect-loop | 432
+ iopscience.iop.org | no-capture | 392
+ www.tandfonline.com | no-pdf-link | 389
+ pubs.rsc.org | no-capture | 361
+ www.persee.fr | no-capture | 344
+ (10 rows)
+
+
+ # "what type of errors"
+ SELECT ingest_type, status, COUNT(*)
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 day'::INTERVAL
+ GROUP BY ingest_type, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+-------
+ pdf | success | 40578
+ pdf | cdx-error | 14982
+ pdf | no-capture | 7747
+ pdf | no-pdf-link | 7111
+ pdf | redirect-loop | 3265
+ pdf | wrong-mimetype | 1629
+ pdf | spn2-cdx-lookup-failure | 657
+ pdf | link-loop | 538
+ pdf | null-body | 517
+ pdf | terminal-bad-status | 400
+ pdf | wayback-error | 79
+ pdf | spn2-error:job-failed | 53
+ pdf | gateway-timeout | 38
+ pdf | spn2-error:soft-time-limit-exceeded | 7
+ pdf | spn2-error | 6
+ pdf | petabox-error | 5
+ pdf | spn2-error:browsing-timeout | 4
+ pdf | spn2-error:unknown | 2
+ pdf | bad-redirect | 1
+ pdf | pending | 1
+ (20 rows)
+
+ # "throughput per day for last N days"
+ SELECT ingest_type,
+ date(updated),
+ COUNT(*) as total,
+ COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success
+ FROM ingest_file_result
+ WHERE updated >= NOW() - '1 month'::INTERVAL
+ GROUP BY ingest_type, date(updated)
+ ORDER BY date(updated) DESC;
+
+ ingest_type | date | total | success
+ -------------+------------+---------+---------
+ pdf | 2020-02-25 | 32660 | 14322
+ pdf | 2020-02-24 | 44967 | 26263
+ pdf | 2020-02-23 | 58795 | 18874
+ pdf | 2020-02-22 | 844249 | 272606
+ pdf | 2020-02-21 | 1287378 | 433487
+ pdf | 2020-02-20 | 1455943 | 492408
+ pdf | 2020-02-19 | 21453 | 7529
+ pdf | 2020-02-18 | 5863 | 2926
+ pdf | 2020-02-17 | 3737 | 970
+ pdf | 2020-02-16 | 13779 | 4862
+ pdf | 2020-02-15 | 1021020 | 623020
+ pdf | 2020-02-14 | 1036036 | 632830
+ pdf | 2020-02-13 | 13503 | 5824
+ pdf | 2020-02-12 | 20078 | 11422
+ pdf | 2020-02-11 | 13499 | 6781
+ pdf | 2020-02-10 | 2275 | 961
+ pdf | 2020-02-09 | 3231 | 1494
+ pdf | 2020-02-08 | 8967 | 4400
+ pdf | 2020-02-07 | 7022 | 2430
+ pdf | 2020-02-06 | 1291 | 516
+ pdf | 2020-02-05 | 8586 | 6596
+ pdf | 2020-02-04 | 3681 | 3593
+ pdf | 2020-02-03 | 284 | 284
+ pdf | 2020-02-02 | 480 | 480
+ pdf | 2020-02-01 | 489 | 336
+ pdf | 2020-01-31 | 1187 | 1130
+ pdf | 2020-01-30 | 1613 | 1288
+ pdf | 2020-01-29 | 947 | 279
+ pdf | 2020-01-28 | 667 | 323
+ (29 rows)
+
+Top "no-capture" domains (will need to re-ingest using live tool):
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-capture'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ --------------------------+------------+--------
+ linkinghub.elsevier.com | no-capture | 320065
+ iopscience.iop.org | no-capture | 46858
+ pubs.rsc.org | no-capture | 43331
+ www.persee.fr | no-capture | 38971
+ www.doiserbia.nb.rs | no-capture | 27112
+ academic.oup.com | no-capture | 18877
+ www.osapublishing.org | no-capture | 17113
+ osf.io | no-capture | 16978
+ scripts.iucr.org | no-capture | 14844
+ www.degruyter.com | no-capture | 8093
+ mab-online.nl | no-capture | 6603
+ insights.ovid.com | no-capture | 6457
+ ir.lib.uth.gr | no-capture | 3625
+ www.sciencedirect.com | no-capture | 3244
+ www.tandfonline.com | no-capture | 3201
+ www.ccsenet.org | no-capture | 2849
+ www.intechopen.com | no-capture | 2813
+ primary-hospital-care.ch | no-capture | 2774
+ www.nature.com | no-capture | 2484
+ www.indianjournals.com | no-capture | 2432
+ journals.aps.org | no-capture | 2197
+ journals.sagepub.com | no-capture | 2064
+ www.episodes.org | no-capture | 1805
+ periodicos.uninove.br | no-capture | 1692
+ escholarship.org | no-capture | 1666
+ (25 rows)
+
+Top "no-pdf-link" domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'no-pdf-link'
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ -----------------------------+-------------+--------
+ plutof.ut.ee | no-pdf-link | 685315
+ www.gbif.org | no-pdf-link | 670647
+ doi.pangaea.de | no-pdf-link | 301984
+ www.plate-archive.org | no-pdf-link | 209218
+ onlinelibrary.wiley.com | no-pdf-link | 84890
+ figshare.com | no-pdf-link | 72892
+ zenodo.org | no-pdf-link | 45768
+ www.tandfonline.com | no-pdf-link | 43848
+ data.mendeley.com | no-pdf-link | 42367
+ springernature.figshare.com | no-pdf-link | 35941
+ dhz.uni-passau.de | no-pdf-link | 29187
+ www.frontiersin.org | no-pdf-link | 17925
+ digital.ucd.ie | no-pdf-link | 16769
+ mr.crossref.org | no-pdf-link | 14999
+ journals.lww.com | no-pdf-link | 12122
+ musewide.aip.de | no-pdf-link | 10854
+ datadryad.org | no-pdf-link | 10686
+ www.jstor.org | no-pdf-link | 9159
+ koreascience.or.kr | no-pdf-link | 9067
+ easy.dans.knaw.nl | no-pdf-link | 8264
+ scielo.conicyt.cl | no-pdf-link | 8069
+ www.degruyter.com | no-pdf-link | 7989
+ www.kci.go.kr | no-pdf-link | 6990
+ www.m-hikari.com | no-pdf-link | 6941
+ cshprotocols.cshlp.org | no-pdf-link | 6553
+ (25 rows)
+
+Top block-ish domains:
+
+ SELECT domain, status, COUNT((domain, status))
+ FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1
+ WHERE t1.domain != ''
+ AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status')
+ GROUP BY domain, status
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ domain | status | count
+ ---------------------------------+---------------------+-------
+ journals.openedition.org | redirect-loop | 30395
+ ieeexplore.ieee.org | redirect-loop | 28926
+ www.degruyter.com | redirect-loop | 18891
+ www.cairn.info | link-loop | 8919
+ www.frontiersin.org | terminal-bad-status | 6786
+ projecteuclid.org | link-loop | 6098
+ www.mdpi.com | terminal-bad-status | 5189
+ medicalforum.ch | terminal-bad-status | 4596
+ jrnl.nau.edu.ua | link-loop | 4238
+ www.revistas.unam.mx | link-loop | 3926
+ journals.aps.org | redirect-loop | 3696
+ www.ijcseonline.org | redirect-loop | 3567
+ www.researchsquare.com | terminal-bad-status | 3453
+ www.persee.fr | terminal-bad-status | 3221
+ www.baltistica.lt | link-loop | 2098
+ osf.io | redirect-loop | 2004
+ seer.ufrgs.br | terminal-bad-status | 2002
+ jtd.amegroups.com | link-loop | 1738
+ www.hindawi.com | terminal-bad-status | 1613
+ linkinghub.elsevier.com | redirect-loop | 1612
+ www.scienceopen.com | terminal-bad-status | 1580
+ atm.amegroups.com | link-loop | 1571
+ scielo.conicyt.cl | terminal-bad-status | 1491
+ repozytorium.ur.edu.pl | redirect-loop | 1279
+ agupubs.onlinelibrary.wiley.com | link-loop | 1182
+ (25 rows)
+
diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt
new file mode 100644
index 0000000..55f0c1e
--- /dev/null
+++ b/sql/stats/2020-05-03_stats.txt
@@ -0,0 +1,418 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 41 GB | 82 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 59 GB | 7604 MB | 66 GB
+ "public"."file_meta" | 31 GB | 28 GB | 59 GB
+ "public"."ingest_request" | 19 GB | 20 GB | 39 GB
+ "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (10 rows)
+
+ Size: 383.93G
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 158059828 | 197346217653010
+ (1 row)
+
+ => 158 million, 197 terabytes
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 157805029
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6040
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1027125
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+
+ unique_sha1 | total
+ -------------+-----------
+ 92936564 | 111022039
+ (1 row)
+
+ => 110 million rows, 92.9 million files
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------------------------------------------------------------+-----------
+ application/pdf | 104178718
+ warc/revisit | 5274410
+ text/xml | 519042
+ text/html | 295523
+ application/octet-stream | 259681
+ unk | 138930
+ application/postscript | 81065
+ application/save | 80765
+ binary/octet-stream | 59804
+ application/x-download | 27083
+ text/plain | 26938
+ application/download | 25125
+ image/pdf | 16095
+ application/force-download | 9004
+ application/x-msdownload | 3711
+ application | 2934
+ application/x-octetstream | 2926
+ multipart/form-data | 2741
+ application/x-pdf | 2444
+ .pdf | 2368
+ application/binary | 1268
+ application/pdf' | 1192
+ pdf | 1113
+ file/unknown | 1086
+ application/unknown | 761
+ file | 753
+ application/blob | 670
+ application/octetstream | 657
+ text/pdf | 549
+ 0 | 417
+ ('application/pdf', | 349
+ application/http;msgtype=response | 251
+ application/doc | 180
+ [...] (wasn't LIMIT 25)
+
+Processed or not:
+
+ # TODO:
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+
+
+ unique_releases | total
+ -----------------+----------
+ 17455441 | 92707544
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057023
+ | 14638425
+ (2 rows)
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 71057074
+ | 3
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | unpaywall | 26244088
+ pdf | mag | 25596658
+ pdf | doi | 15652966
+ pdf | pmc | 2043646
+ pdf | arxiv | 721902
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 103
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | unpaywall | unpaywall | 26244088
+ pdf | mag | mag-corpus | 25596658
+ pdf | doi | fatcat-ingest | 8267308
+ pdf | doi | fatcat-changelog | 3869772
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630719
+ pdf | arxiv | fatcat-changelog | 91157
+ pdf | pmc | fatcat-ingest | 10195
+ pdf | pmc | fatcat-changelog | 4626
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 103
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 15
+ (15 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+-------
+ pdf | mag | 47
+ pdf | unpaywall | 1
+ (2 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | unpaywall | 26244088 | 19968092 | 0.761
+ pdf | mag | 25596658 | 18712912 | 0.731
+ pdf | doi | 15653166 | 2878833 | 0.184
+ pdf | pmc | 2043646 | 1279529 | 0.626
+ pdf | arxiv | 721902 | 592394 | 0.821
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 103 | 82 | 0.796
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 37449502
+ pdf | no-pdf-link | 10908442
+ pdf | no-capture | 5643670
+ pdf | redirect-loop | 4823502
+ pdf | terminal-bad-status | 1715056
+ pdf | link-loop | 1425072
+ pdf | cdx-error | 535365
+ pdf | gateway-timeout | 267654
+ pdf | skip-url-blocklist | 220433
+ pdf | wrong-mimetype | 189804
+ pdf | spn2-cdx-lookup-failure | 103926
+ pdf | spn-error | 101777
+ pdf | wayback-error | 93517
+ pdf | null-body | 87279
+ pdf | invalid-host-resolution | 35305
+ pdf | spn-remote-error | 28888
+ pdf | petabox-error | 12406
+ pdf | spn2-error | 2905
+ pdf | spn2-error:job-failed | 2307
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 745
+ pdf | spn2-error:proxy-error | 438
+ pdf | spn2-error:invalid-url-syntax | 406
+ pdf | spn2-error:soft-time-limit-exceeded | 405
+ pdf | spn2-error:browser-running-error | 274
+ (25 rows)
+
+Failures by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status != 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+
+ ingest_type | domain | status | count
+ -------------+---------------------------------------+---------------------+--------
+ pdf | ssl.fao.org | no-pdf-link | 862277
+ pdf | www.researchgate.net | redirect-loop | 749094
+ pdf | www.e-periodica.ch | no-pdf-link | 747370
+ pdf | ieeexplore.ieee.org | redirect-loop | 707482
+ pdf | plutof.ut.ee | no-pdf-link | 685341
+ pdf | www.gbif.org | no-pdf-link | 670905
+ pdf | dlc.library.columbia.edu | no-pdf-link | 508281
+ pdf | figshare.com | no-pdf-link | 400501
+ pdf | onlinelibrary.wiley.com | no-pdf-link | 399187
+ pdf | watermark.silverchair.com | terminal-bad-status | 357188
+ pdf | www.die-bonn.de | redirect-loop | 352903
+ pdf | academic.oup.com | no-pdf-link | 346828
+ pdf | iopscience.iop.org | terminal-bad-status | 345147
+ pdf | linkinghub.elsevier.com | no-capture | 328434
+ pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206
+ pdf | cyberleninka.ru | link-loop | 309525
+ pdf | www.tandfonline.com | no-pdf-link | 309146
+ pdf | dialnet.unirioja.es | terminal-bad-status | 307572
+ pdf | doi.pangaea.de | no-pdf-link | 304924
+ pdf | journals.sagepub.com | no-pdf-link | 285774
+ pdf | papers.ssrn.com | link-loop | 282415
+ pdf | dialnet.unirioja.es | redirect-loop | 274476
+ pdf | ieeexplore.ieee.org | link-loop | 273607
+ pdf | catalog.paradisec.org.au | redirect-loop | 234653
+ pdf | www.plate-archive.org | no-pdf-link | 209217
+ pdf | zenodo.org | no-pdf-link | 200078
+ pdf | zenodo.org | no-capture | 199025
+ pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084
+ pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039
+ pdf | validate.perfdrive.com | no-pdf-link | 180191
+ (30 rows)
+
+Success by domain:
+
+ SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status))
+ FROM (
+ SELECT
+ ingest_file_result.ingest_type as ingest_type,
+ ingest_file_result.status,
+ substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ ) t1
+ WHERE t1.domain != ''
+ AND t1.status = 'success'
+ GROUP BY ingest_type, domain, status
+ ORDER BY COUNT DESC
+ LIMIT 30;
+
+ ingest_type | domain | status | count
+ -------------+----------------------------+---------+---------
+ pdf | www.jstage.jst.go.jp | success | 2244620
+ pdf | europepmc.org | success | 1284770
+ pdf | link.springer.com | success | 1017998
+ pdf | www.scielo.br | success | 799577
+ pdf | arxiv.org | success | 592622
+ pdf | downloads.hindawi.com | success | 527278
+ pdf | res.mdpi.com | success | 501093
+ pdf | hal.archives-ouvertes.fr | success | 447877
+ pdf | digital.library.unt.edu | success | 404460
+ pdf | www.cambridge.org | success | 394666
+ pdf | dergipark.org.tr | success | 373706
+ pdf | journals.plos.org | success | 296994
+ pdf | watermark.silverchair.com | success | 275562
+ pdf | www.nature.com | success | 263836
+ pdf | cds.cern.ch | success | 223057
+ pdf | www.pnas.org | success | 220488
+ pdf | s3-eu-west-1.amazonaws.com | success | 214558
+ pdf | www.jbc.org | success | 205277
+ pdf | www.redalyc.org | success | 193591
+ pdf | iopscience.iop.org | success | 175796
+ pdf | apps.dtic.mil | success | 170589
+ pdf | zenodo.org | success | 167812
+ pdf | peerj.com | success | 155620
+ pdf | www.biorxiv.org | success | 149337
+ pdf | 210.101.116.28 | success | 145706
+ pdf | www.teses.usp.br | success | 145438
+ pdf | absimage.aps.org | success | 144400
+ pdf | hrcak.srce.hr | success | 134669
+ pdf | www.erudit.org | success | 131771
+ pdf | babel.hathitrust.org | success | 130645
+ (30 rows)
+
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ => NOT RUN, fatcat_file table is way out of date
+
diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt
new file mode 100644
index 0000000..d1993fc
--- /dev/null
+++ b/sql/stats/2020-07-23_stats.txt
@@ -0,0 +1,347 @@
+
+Summary:
+
+- very many more PDFs have been grobid-ed vs. pdf_meta-ed
+- about 1 million file_meta still have partial metadata (eg, no sha256)
+- database size still under 0.5 TByte
+- there are about a million CDX error ingest requests, and hundreds of
+ thousands of SPN errors which could be re-run
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 42 GB | 42 GB | 84 GB
+ "public"."ingest_request" | 34 GB | 39 GB | 73 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."grobid" | 61 GB | 7742 MB | 69 GB
+ "public"."file_meta" | 32 GB | 29 GB | 61 GB
+ "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (11 rows)
+
+ Size: 466.91G
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 161944425 | 204,402,677,360,189
+ (1 row)
+
+ # 161.9 mil; 204 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 161691608
+ application/octet-stream | 154348
+ application/xml | 42170
+ text/html | 18703
+ text/plain | 15989
+ application/gzip | 6484
+ | 6036
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ ---------
+ 1015337
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 96537611 | 116281981
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ---------------------------------------------------+-----------
+ application/pdf | 108706978
+ warc/revisit | 5912013
+ text/xml | 519042
+ application/octet-stream | 307782
+ text/html | 295634
+ unk | 156937
+ application/postscript | 81079
+ application/save | 80871
+ binary/octet-stream | 61263
+ text/plain | 31495
+ application/x-download | 30511
+ application/download | 26716
+ image/pdf | 26357
+ application/force-download | 10541
+ multipart/form-data | 5551
+ application/x-msdownload | 3724
+ application/x-octetstream | 3216
+ application | 3171
+ .pdf | 2728
+ application/x-pdf | 2563
+ application/binary | 1306
+ application/pdf' | 1192
+ pdf | 1180
+ [...]
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+
+ total_files | unique_releases
+ -------------+-----------------
+ 95557413 | 18020570
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+
+ status_code | count
+ -------------+----------
+ 200 | 88450610
+ 500 | 7101098
+ -4 | 4133
+ 503 | 110
+
+ SELECT status, COUNT(*) FROM grobid GROUP BY ORDER BY COUNT DESC LIMIT 10;
+
+ status | count
+ ----------------+----------
+ success | 73814297
+ | 14638412
+ error | 7101308
+ error-timeout | 4133
+ bad-grobid-xml | 6
+ (5 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 73813427
+ | 14638425
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 27653003
+ pdf | doi | 16589669
+ pdf | pmc | 2231113
+ pdf | arxiv | 794693
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 148
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 27653003
+ pdf | doi | fatcat-ingest | 8320832
+ pdf | doi | fatcat-changelog | 4752956
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 630750
+ pdf | pmc | fatcat-ingest | 194781
+ pdf | arxiv | fatcat-changelog | 163924
+ pdf | pmc | fatcat-changelog | 7507
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 148
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 19
+ pdf | arxiv | savepapernow-web | 2
+
+Uncrawled requests by source:
+
+ # TODO: verify this? seems wrong
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | mag | 4097008
+ pdf | oai | 15287
+ pdf | unpaywall | 1
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 5346057 | 0.104
+ pdf | mag | 35015357 | 22199271 | 0.634
+ pdf | unpaywall | 27653003 | 22067338 | 0.798
+ pdf | doi | 16589700 | 3207661 | 0.193
+ pdf | pmc | 2231113 | 1696976 | 0.761
+ pdf | arxiv | 794727 | 645607 | 0.812
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 148 | 114 | 0.770
+ (9 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 46465271
+ pdf | no-capture | 46115869
+ pdf | no-pdf-link | 13877460
+ pdf | redirect-loop | 5943956
+ pdf | terminal-bad-status | 1962754
+ pdf | link-loop | 1630078
+ pdf | cdx-error | 1014409
+ pdf | gateway-timeout | 459340
+ pdf | wrong-mimetype | 321774
+ pdf | skip-url-blocklist | 220629
+ pdf | wayback-error | 220453
+ pdf | spn2-cdx-lookup-failure | 143963
+ pdf | null-body | 113384
+ pdf | spn-error | 101773
+ pdf | invalid-host-resolution | 37367
+ pdf | spn-remote-error | 28886
+ pdf | petabox-error | 22997
+ pdf | spn2-error | 16342
+ pdf | spn2-error:job-failed | 5017
+ pdf | other-mimetype | 2305
+ pdf | redirects-exceeded | 746
+ pdf | spn2-error:soft-time-limit-exceeded | 632
+ pdf | spn2-error:proxy-error | 437
+ pdf | spn2-error:invalid-url-syntax | 417
+ pdf | timeout | 417
+ (25 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+
+ total_count | release_count
+ -------------+---------------
+ 5862666 | 4728824
+ (1 row)
+
+## PDF Meta
+
+Total rows:
+
+ SELECT COUNT(*) as total_count FROM pdf_meta;
+
+
+ total_count
+ -------------
+ 21961874
+
+By status:
+
+ SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC;
+
+ status | count
+ ----------------+----------
+ success | 21788507
+ parse-error | 78196
+ text-too-large | 60595
+ not-pdf | 31679
+ error-wayback | 2639
+ bad-unicode | 251
+ bad-pdf | 6
+ empty-blob | 1
+ (8 rows)
+
diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt
new file mode 100644
index 0000000..3bc27b0
--- /dev/null
+++ b/sql/stats/2020-09-14_stats.txt
@@ -0,0 +1,340 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 44 GB | 45 GB | 89 GB
+ "public"."grobid" | 66 GB | 8127 MB | 74 GB
+ "public"."ingest_request" | 34 GB | 40 GB | 73 GB
+ "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB
+ "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB
+ "public"."file_meta" | 33 GB | 30 GB | 63 GB
+ "public"."shadow" | 9111 MB | 10204 MB | 19 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (11 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 167021210 | 221982345333674
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+
+ mimetype | count
+ -------------------------------+-----------
+ application/pdf | 166765214
+ application/octet-stream | 155517
+ application/xml | 42170
+ text/html | 18708
+ text/plain | 15990
+ application/gzip | 6491
+ | 6036
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ (10 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62960
+ (1 row)
+
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 102123051 | 126550160
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 116885565
+ warc/revisit | 7951816
+ text/xml | 519042
+ application/octet-stream | 327639
+ text/html | 295725
+ unk | 172491
+ application/postscript | 81095
+ application/save | 80900
+ binary/octet-stream | 61783
+ text/plain | 33684
+ image/pdf | 32856
+ application/x-download | 32418
+ application/download | 27672
+ application/force-download | 10892
+ multipart/form-data | 5750
+ application/x-msdownload | 3832
+ application/x-octetstream | 3516
+ application | 3499
+ .pdf | 3038
+ application/x-pdf | 2701
+ application/binary | 1322
+ pdf | 1232
+ file/unknown | 1199
+ application/pdf' | 1192
+ file | 979
+ (25 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 101494314 | 18919012
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+
+ status_code | count
+ -------------+----------
+ 200 | 93730358
+ 500 | 7759103
+ -4 | 4683
+ 503 | 150
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 80838234
+ | 12892145
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 27653003
+ pdf | doi | 17362763
+ pdf | pmc | 2248854
+ pdf | arxiv | 835400
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 197
+ (9 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 27653003
+ pdf | doi | fatcat-ingest | 8399261
+ pdf | doi | fatcat-changelog | 5449349
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 634665
+ pdf | pmc | fatcat-ingest | 210453
+ pdf | arxiv | fatcat-changelog | 200707
+ pdf | pmc | fatcat-changelog | 9582
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 197
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | doi | savepapernow-web | 21
+ pdf | arxiv | savepapernow-web | 2
+ (17 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 170304
+ pdf | oai | 15287
+ pdf | unpaywall | 1
+ (3 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14144314 | 0.276
+ pdf | mag | 35015357 | 24811947 | 0.709
+ pdf | unpaywall | 27653003 | 22302629 | 0.807
+ pdf | doi | 17363369 | 3533568 | 0.204
+ pdf | pmc | 2248860 | 1713197 | 0.762
+ pdf | arxiv | 835400 | 685219 | 0.820
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 197 | 138 | 0.701
+ (9 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | status | count
+ -------------+-------------------------------------+----------
+ pdf | success | 58265365
+ pdf | no-pdf-link | 27216435
+ pdf | no-capture | 21982611
+ pdf | redirect-loop | 8457469
+ pdf | terminal-bad-status | 2695023
+ pdf | link-loop | 2209672
+ pdf | wrong-mimetype | 767508
+ pdf | gateway-timeout | 548870
+ pdf | cdx-error | 391611
+ pdf | skip-url-blocklist | 220661
+ pdf | null-body | 182215
+ pdf | wayback-error | 146869
+ pdf | spn2-cdx-lookup-failure | 107229
+ pdf | spn-error | 85128
+ pdf | invalid-host-resolution | 37352
+ pdf | petabox-error | 32490
+ pdf | spn2-error | 29212
+ pdf | spn-remote-error | 27927
+ pdf | other-mimetype | 2305
+ pdf | bad-redirect | 1524
+ pdf | spn2-error:job-failed | 1521
+ pdf | timeout | 842
+ pdf | spn2-error:soft-time-limit-exceeded | 793
+ pdf | redirects-exceeded | 748
+ pdf | spn2-error:invalid-url-syntax | 417
+ (25 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*)
+ FROM ingest_file_result
+ WHERE hit = false
+ GROUP BY ingest_type, terminal_status_code
+ ORDER BY COUNT DESC
+ LIMIT 25;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 34064937
+ pdf | | 20514531
+ pdf | 301 | 7271700
+ pdf | 302 | 720632
+ pdf | 503 | 712697
+ pdf | 400 | 444209
+ pdf | 404 | 331495
+ pdf | 403 | 323030
+ pdf | 401 | 259327
+ pdf | 500 | 236122
+ pdf | 303 | 101609
+ pdf | 429 | 47738
+ pdf | 502 | 36183
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14790
+ pdf | 999 | 8996
+ pdf | 307 | 3769
+ pdf | 308 | 3422
+ pdf | 202 | 3228
+ pdf | 520 | 2058
+ pdf | 410 | 1734
+ pdf | 521 | 1033
+ pdf | 504 | 868
+ pdf | 505 | 424
+ (25 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 6600758 | 5213294
+ (1 row)
+
diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt
new file mode 100644
index 0000000..fca76b9
--- /dev/null
+++ b/sql/stats/2021-04-07_stats.txt
@@ -0,0 +1,430 @@
+
+## SQL Table Sizes
+
+ Size: 551.34G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 50 GB | 100 GB
+ "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB
+ "public"."ingest_request" | 39 GB | 45 GB | 83 GB
+ "public"."grobid" | 70 GB | 8613 MB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB
+ "public"."file_meta" | 35 GB | 31 GB | 66 GB
+ "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB
+ "public"."shadow" | 9517 MB | 10 GB | 20 GB
+ "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB
+ "public"."html_meta" | 1172 MB | 10 MB | 1182 MB
+ "public"."pdftrio" | 618 MB | 432 MB | 1051 MB
+ "public"."petabox" | 403 MB | 594 MB | 997 MB
+ (12 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 174200807 | 234313766162033
+ (1 row)
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 173816433
+ application/octet-stream | 155534
+ text/html | 115821
+ application/xml | 42170
+ application/xhtml+xml | 24347
+ text/plain | 15990
+ application/jats+xml | 6899
+ application/gzip | 6491
+ | 6034
+ application/postscript | 4912
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 921
+ application/x-bzip2 | 891
+ image/jpeg | 721
+ image/gif | 389
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297
+ application/x-compress | 272
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ image/png | 88
+ application/mac-binhex40 | 79
+ application/x-dosexec | 51
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ text/rtf | 33
+ application/x-dvi | 29
+ application/x-rar | 29
+ application/vnd.ms-excel | 28
+ message/rfc822 | 26
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 62271
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 113880640 | 141793694
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 131346703
+ warc/revisit | 8394443
+ text/xml | 525481
+ application/octet-stream | 502400
+ text/html | 417579
+ unk | 186703
+ application/postscript | 81095
+ application/save | 80915
+ binary/octet-stream | 66698
+ application/x-download | 35771
+ text/plain | 35606
+ image/pdf | 33904
+ application/download | 29701
+ application/force-download | 16726
+ multipart/form-data | 6878
+ application/x-msdownload | 3843
+ application | 3724
+ application/x-octetstream | 3550
+ .pdf | 3138
+ application/x-pdf | 2780
+ application/binary | 1332
+ pdf | 1247
+ file/unknown | 1200
+ application/pdf' | 1192
+ file | 1108
+ application/unknown | 978
+ application/octetstream | 856
+ application/blob | 673
+ text/pdf | 672
+ 0 | 546
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+ total_files | unique_releases
+ -------------+-----------------
+ 105594307 | 19594878
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+----------
+ 200 | 97714631
+ 500 | 7875192
+ -4 | 4772
+ 503 | 520
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------+----------
+ 0.5.5-fatcat | 84822508
+ | 12892147
+ (2 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | mag | 35015357
+ pdf | unpaywall | 31772942
+ pdf | doi | 23528817
+ pdf | doaj | 4264610
+ html | doaj | 2429003
+ pdf | pmc | 2277417
+ pdf | arxiv | 2143549
+ xml | doaj | 9442
+ html | doi | 3022
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 469
+ html | spn | 9
+ (14 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | mag | mag-corpus | 35015357
+ pdf | unpaywall | unpaywall | 31772942
+ pdf | doi | fatcat-changelog | 11010764
+ pdf | doi | fatcat-ingest | 9002119
+ pdf | doaj | doaj | 4264610
+ pdf | doi | fatcat-ingest-container | 3515873
+ html | doaj | doaj | 2429003
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1767703
+ pdf | arxiv | fatcat-changelog | 375818
+ pdf | pmc | fatcat-ingest | 211264
+ pdf | pmc | fatcat-changelog | 37328
+ xml | doaj | doaj | 9442
+ html | doi | fatcat-ingest | 3018
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 469
+ pdf | doi | savepapernow-web | 74
+ pdf | arxiv | fatcat-ingest-container | 26
+ html | spn | savepapernow-web | 9
+ html | doi | savepapernow-web | 4
+ pdf | arxiv | savepapernow-web | 2
+ (23 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 168462
+ pdf | oai | 15286
+ pdf | doaj | 2068
+ html | doaj | 620
+ pdf | unpaywall | 13
+ (5 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 14163500 | 0.277
+ pdf | mag | 35015357 | 24818176 | 0.709
+ pdf | unpaywall | 31772942 | 25018501 | 0.787
+ pdf | doi | 23529041 | 5773728 | 0.245
+ pdf | doaj | 4264610 | 2851328 | 0.669
+ html | doaj | 2429003 | 122937 | 0.051
+ pdf | pmc | 2277417 | 1736491 | 0.762
+ pdf | arxiv | 2143549 | 2011378 | 0.938
+ xml | doaj | 9442 | 6897 | 0.730
+ html | doi | 3022 | 957 | 0.317
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 469 | 328 | 0.699
+ html | spn | 9 | 2 | 0.222
+ (14 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+--------------------------------+----------
+ pdf | success | 66487928
+ pdf | no-pdf-link | 29279677
+ pdf | no-capture | 22765431
+ pdf | redirect-loop | 9155767
+ pdf | terminal-bad-status | 3549665
+ pdf | link-loop | 2592983
+ html | wrong-scope | 1088793
+ pdf | wrong-mimetype | 792563
+ pdf | gateway-timeout | 478181
+ html | no-capture | 423917
+ pdf | wayback-content-error | 355828
+ pdf | cdx-error | 343862
+ pdf | null-body | 328774
+ pdf | forbidden | 286647
+ pdf | spn2-cdx-lookup-failure | 276769
+ pdf | spn2-wayback-error | 276080
+ pdf | skip-url-blocklist | 265473
+ html | redirect-loop | 212916
+ pdf | not-found | 204367
+ html | unknown-scope | 204112
+ html | html-resource-no-capture | 166034
+ pdf | blocked-cookie | 160336
+ pdf | too-many-redirects | 152984
+ html | success | 123896
+ pdf | wayback-error | 114388
+ html | null-body | 100296
+ pdf | spn2-error:too-many-redirects | 58336
+ html | wayback-content-error | 53926
+ pdf | invalid-host-resolution | 37226
+ pdf | petabox-error | 37177
+ pdf | remote-server-error | 36439
+ pdf | spn2-error | 27556
+ pdf | spn2-error:proxy-error | 25486
+ pdf | read-timeout | 20745
+ html | wrong-mimetype | 18928
+ html | terminal-bad-status | 14059
+ html | petabox-error | 13533
+ pdf | bad-redirect | 7535
+ xml | success | 6897
+ html | cdx-error | 6823
+ pdf | spn2-error:bad-request | 4664
+ pdf | spn2-error:unauthorized | 4391
+ pdf | spn-remote-error | 4206
+ pdf | spn2-error:service-unavailable | 2614
+ pdf | spn2-error:job-failed | 2562
+ xml | null-body | 2353
+ pdf | other-mimetype | 2304
+ pdf | error | 1905
+ html | spn2-cdx-lookup-failure | 1018
+ pdf | redirects-exceeded | 1015
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 36515867
+ pdf | | 22909334
+ pdf | 301 | 7969702
+ html | 200 | 1653303
+ pdf | 503 | 928507
+ pdf | 403 | 823755
+ pdf | 302 | 792842
+ pdf | 400 | 462108
+ html | | 426474
+ pdf | 404 | 422163
+ pdf | 401 | 270611
+ pdf | 500 | 248675
+ html | 301 | 211713
+ pdf | 303 | 109686
+ pdf | 410 | 50648
+ pdf | 502 | 37663
+ pdf | 429 | 31982
+ pdf | 420 | 26603
+ pdf | 509 | 15113
+ pdf | 409 | 14835
+ html | 404 | 9573
+ pdf | 999 | 9296
+ pdf | 307 | 3972
+ pdf | 308 | 3914
+ html | 500 | 3625
+ pdf | 202 | 3515
+ xml | 200 | 2537
+ pdf | 520 | 2072
+ pdf | 206 | 1665
+ pdf | 521 | 1075
+ html | 302 | 1072
+ pdf | 504 | 1000
+ pdf | 412 | 476
+ pdf | 300 | 434
+ pdf | 505 | 429
+ pdf | 406 | 393
+ html | 403 | 382
+ html | 503 | 378
+ pdf | 421 | 298
+ html | 303 | 268
+ pdf | 508 | 195
+ pdf | 226 | 166
+ pdf | 402 | 70
+ html | 502 | 68
+ pdf | 408 | 50
+ pdf | 204 | 34
+ pdf | 416 | 29
+ pdf | 501 | 29
+ pdf | 530 | 27
+ pdf | 507 | 21
+ (50 rows)
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+
+ total_count | release_count
+ -------------+---------------
+ 8514315 | 6401104
+ (1 row)
diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt
new file mode 100644
index 0000000..a8a9cd5
--- /dev/null
+++ b/sql/stats/2021-04-08_table_sizes.txt
@@ -0,0 +1,40 @@
+
+## SQL Table Sizes
+
+ Size: 467.23G
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ -------------------------------+------------+--------------+------------
+ "public"."cdx" | 49 GB | 26 GB | 76 GB
+ "public"."grobid" | 69 GB | 6834 MB | 75 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."ingest_request" | 39 GB | 32 GB | 70 GB
+ "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB
+ "public"."file_meta" | 32 GB | 21 GB | 53 GB
+ "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB
+ "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ (12 rows)
+
diff --git a/sql/stats/README.md b/sql/stats/README.md
new file mode 100644
index 0000000..62e213c
--- /dev/null
+++ b/sql/stats/README.md
@@ -0,0 +1,120 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but no PDF in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+