diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:02:38 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:02:38 -0800 |
commit | ad4c2c97e0ef0153b1161b2b71dadeee3e88a631 (patch) | |
tree | 633816c40512c3fbd337ee385a50639b989c55c2 | |
parent | c3a3fa053fc4a2211618a69b349c77b1a04e6b1f (diff) | |
download | sandcrawler-ad4c2c97e0ef0153b1161b2b71dadeee3e88a631.tar.gz sandcrawler-ad4c2c97e0ef0153b1161b2b71dadeee3e88a631.zip |
sandcrawler-db extra stats
-rw-r--r-- | sql/stats/2020-01-31_supplement.txt | 42 |
1 files changed, 42 insertions, 0 deletions
diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt new file mode 100644 index 0000000..6bd43ea --- /dev/null +++ b/sql/stats/2020-01-31_supplement.txt @@ -0,0 +1,42 @@ + +How many file_meta still missing core metadata? + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + => 1,130,915 + +Great! Not many. + +And how many of those are in petabox? + + SELECT COUNT(*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL; + => 1,149,194 + +Looks like almost all, though this count is not exact: the LEFT JOIN also keeps file_meta rows with no petabox match, and the result exceeds the 1,130,915 total above (presumably multiple petabox rows per sha1hex). Maybe just some CDX fetch failures or something in there. So, +should run these on, e.g., grobid2-vm. + + COPY ( + SELECT row_to_json(petabox.*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL + ) TO '/grande/snapshots/dump_grobid_petabox_todo.json'; + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a +couple million files): + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 5072452 | 4130405 + |