aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-02-02 22:01:24 -0800
committerBryan Newbold <bnewbold@archive.org>2020-02-02 22:01:24 -0800
commit41d957152b4489312120bb9ec998c134db93fab8 (patch)
tree3b19593461104f1831b48a6dce21727dc5b61dc0 /sql
parent2a5f6a25123d6b725327de844da38df735b04d3f (diff)
downloadsandcrawler-41d957152b4489312120bb9ec998c134db93fab8.tar.gz
sandcrawler-41d957152b4489312120bb9ec998c134db93fab8.zip
more SQL commands
Diffstat (limited to 'sql')
-rw-r--r--sql/stats/README.md15
1 files changed, 15 insertions, 0 deletions
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 79a4671..ea61fa0 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -31,6 +31,10 @@ Top mimetypes:
SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+Files missing full metadata (no SHA-256 recorded):
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
## CDX
Total and unique-by-sha1 counts:
@@ -97,3 +101,14 @@ Ingest result by status:
SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
+## Fatcat Files
+
+Count of PDF files that GROBID processed and matched to a release (via
+glutton), but that have no matching row (by SHA-1) in `fatcat_file`:
+
+ SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
+ FROM grobid
+ LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
+ WHERE fatcat_file.sha1hex IS NULL
+ AND grobid.fatcat_release IS NOT NULL;
+