From e0ad2e3be5286b2703df8f0a98b450658e28d28b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 12 Nov 2019 13:22:18 -0800 Subject: SQL stats and commands (mostly from sept 2019) --- sql/dump_regrobid_pdf.sql | 15 +++++++++++++ sql/dump_ungrobid_pdf.sql | 15 +++++++++++++ sql/random_queries.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++ sql/table_sizes.md | 11 ++++++++++ 4 files changed, 96 insertions(+) create mode 100644 sql/dump_regrobid_pdf.sql create mode 100644 sql/dump_ungrobid_pdf.sql create mode 100644 sql/random_queries.md create mode 100644 sql/table_sizes.md (limited to 'sql') diff --git a/sql/dump_regrobid_pdf.sql b/sql/dump_regrobid_pdf.sql new file mode 100644 index 0000000..b846834 --- /dev/null +++ b/sql/dump_regrobid_pdf.sql @@ -0,0 +1,15 @@ + +-- Run like: +-- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) +) +TO STDOUT +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql new file mode 100644 index 0000000..3e6d782 --- /dev/null +++ b/sql/dump_ungrobid_pdf.sql @@ -0,0 +1,15 @@ + +-- Run like: +-- psql sandcrawler < dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2019-09-23.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex) +) +TO STDOUT +WITH NULL ''; + +ROLLBACK; diff --git a/sql/random_queries.md b/sql/random_queries.md new file mode 100644 index 0000000..6ae651a --- /dev/null +++ b/sql/random_queries.md @@ -0,0 +1,55 @@ + +Basic stats (2019-09-23): + + SELECT COUNT(*) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 28,023,760 + => Time: 253897.213 ms (04:13.897) + + SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 22,816,087 + => Time: 287097.944 ms (04:47.098) + + SELECT COUNT(*) FROM grobid. + => 56,196,992 + + SELECT COUNT(DISTINCT sha1hex) FROM cdx; + => 64,348,277 + => Time: 572383.931 ms (09:32.384) + + SELECT COUNT(*) FROM cdx; + => 74,796,777 + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; + => Time: 189067.335 ms (03:09.067) + + mimetype | count + ------------------------+---------- + application/pdf | 51049905 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +Time: 189067.335 ms (03:09.067) + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY count(*) DESC; + + status_code | count + -------------+---------- + 200 | 56196992 + + compare with older sandcrawler/output-prod/2019-05-28-1920.35-statuscodecount: + + 200 49567139 + 400 3464503 + 409 691917 + 500 247028 + 503 123 + + SELECT row_to_json(cdx) FROM cdx LIMIT 5; + + SELECT row_to_json(r) FROM ( + SELECT url, datetime FROM cdx + ) r + LIMIT 5; + diff --git a/sql/table_sizes.md b/sql/table_sizes.md new file mode 100644 index 0000000..3596b2b --- /dev/null +++ b/sql/table_sizes.md @@ -0,0 +1,11 @@ + +## September 2019 + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."cdx" | 31 GB | 27 GB | 58 GB + "public"."file_meta" | 13 GB | 6500 MB | 19 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."grobid" | 4994 MB | 6678 MB | 11 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB -- cgit v1.2.3