From 41d957152b4489312120bb9ec998c134db93fab8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sun, 2 Feb 2020 22:01:24 -0800 Subject: more SQL commands --- sql/stats/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sql/stats/README.md b/sql/stats/README.md index 79a4671..ea61fa0 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -31,6 +31,10 @@ Top mimetypes: SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + ## CDX Total and unique-by-sha1 counts: @@ -97,3 +101,14 @@ Ingest result by status: SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + -- cgit v1.2.3