diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-15 00:20:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-15 00:20:57 -0700 |
commit | c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8 (patch) | |
tree | a5c89b810c74e65a67c66b412daffbb555792016 /sql/stats/README.md | |
parent | c714ecdcd8aa8bb39b1b46860944b6cace7f5077 (diff) | |
download | sandcrawler-c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8.tar.gz sandcrawler-c1f60b1e31c8ff3b25c7cdcd442ccd91512d1db8.zip |
updated sandcrawler-db stats
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r-- | sql/stats/README.md | 12 |
1 files changed, 6 insertions, 6 deletions
```diff
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 89deec2..2e9eae5 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -29,7 +29,7 @@ Counts and total file size:
 
 Top mimetypes:
 
-    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 20;
 
 Missing full metadata:
 
@@ -43,11 +43,7 @@ Total and unique-by-sha1 counts:
 
 mimetype counts:
 
-    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
-
-Processed or not:
-
-    # TODO:
+    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25;
 
 ## GROBID
 
@@ -107,6 +103,10 @@ Ingest result by status:
 
     SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
 
+Failed ingest by terminal status code:
+
+    SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 25;
+
 ## Fatcat Files
 
 Count of PDF files that GROBID processed and matched to a release (via
```