diff options
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r-- | sql/stats/README.md | 33 |
1 files changed, 11 insertions, 22 deletions
diff --git a/sql/stats/README.md b/sql/stats/README.md index 52642f6..3161514 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -29,7 +29,7 @@ Counts and total file size: Top mimetypes: - SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; Missing full metadata: @@ -43,25 +43,21 @@ Total and unique-by-sha1 counts: mimetype counts: - SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; - -Processed or not: - - # TODO: + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; ## GROBID Counts: - SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + SELECT COUNT(*) AS total_files FROM grobid; Status? - SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; What version used? - SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; ## Petabox @@ -75,7 +71,7 @@ Requests by source: SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; - SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; Uncrawled requests by source: @@ -86,7 +82,7 @@ Uncrawled requests by source: ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type WHERE ingest_file_result.base_url IS NULL - GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; Results by source: @@ -101,20 +97,13 @@ Results by source: ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type AND ingest_file_result.ingest_type IS NOT NULL - GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; Ingest result by status: - SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; - -## Fatcat Files + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; -Count of PDF files that GROBID processed and matched to a release (via -glutton), but no PDF in `fatcat_file`: +Failed ingest by terminal status code: - SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count - FROM grobid - LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex - WHERE fatcat_file.sha1hex IS NULL - AND grobid.fatcat_release IS NOT NULL; + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; |