diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-27 11:57:34 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-27 11:57:34 -0800 |
commit | d6858033082825cb56a5000e74fe46c4cbbee86c (patch) | |
tree | 9ec4f7601e9aef503f271f4d0394b3c8ac0071e4 /sql/stats/README.md | |
parent | 20cec591d641cf5c6bea7ec7dbf734bc4d8efc1b (diff) | |
download | sandcrawler-d6858033082825cb56a5000e74fe46c4cbbee86c.tar.gz sandcrawler-d6858033082825cb56a5000e74fe46c4cbbee86c.zip |
sandcrawler SQL stats
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r-- | sql/stats/README.md | 13 |
1 file changed, 1 insertion, 12 deletions
```diff
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 62e213c..3161514 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -49,7 +49,7 @@ mimetype counts:
 
 Counts:
 
-    SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid;
+    SELECT COUNT(*) AS total_files FROM grobid;
 
 Status?
 
@@ -107,14 +107,3 @@ Failed ingest by terminal status code:
 
     SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
 
-## Fatcat Files
-
-Count of PDF files that GROBID processed and matched to a release (via
-glutton), but no PDF in `fatcat_file`:
-
-    SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
-    FROM grobid
-    LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
-    WHERE fatcat_file.sha1hex IS NULL
-    AND grobid.fatcat_release IS NOT NULL;
-
```