aboutsummaryrefslogtreecommitdiffstats
path: root/sql/stats/README.md
diff options
context:
space:
mode:
Diffstat (limited to 'sql/stats/README.md')
-rw-r--r--sql/stats/README.md33
1 files changed, 11 insertions, 22 deletions
diff --git a/sql/stats/README.md b/sql/stats/README.md
index 52642f6..3161514 100644
--- a/sql/stats/README.md
+++ b/sql/stats/README.md
@@ -29,7 +29,7 @@ Counts and total file size:
Top mimetypes:
- SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10;
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
Missing full metadata:
@@ -43,25 +43,21 @@ Total and unique-by-sha1 counts:
mimetype counts:
- SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC;
-
-Processed or not:
-
- # TODO:
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
## GROBID
Counts:
- SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid;
+ SELECT COUNT(*) AS total_files FROM grobid;
Status?
- SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10;
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
What version used?
- SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10;
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
## Petabox
@@ -75,7 +71,7 @@ Requests by source:
SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
- SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25;
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
Uncrawled requests by source:
@@ -86,7 +82,7 @@ Uncrawled requests by source:
ON ingest_request.base_url = ingest_file_result.base_url
AND ingest_request.ingest_type = ingest_file_result.ingest_type
WHERE ingest_file_result.base_url IS NULL
- GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25;
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
Results by source:
@@ -101,20 +97,13 @@ Results by source:
ON ingest_request.base_url = ingest_file_result.base_url
AND ingest_request.ingest_type = ingest_file_result.ingest_type
AND ingest_file_result.ingest_type IS NOT NULL
- GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25;
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
Ingest result by status:
- SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25;
-
-## Fatcat Files
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
-Count of PDF files that GROBID processed and matched to a release (via
-glutton), but no PDF in `fatcat_file`:
+Failed ingest by terminal status code:
- SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count
- FROM grobid
- LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex
- WHERE fatcat_file.sha1hex IS NULL
- AND grobid.fatcat_release IS NOT NULL;
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;