From 4589f574ae8fb29d959ff60e99a15098fee8a15a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 7 Sep 2022 17:53:50 -0700 Subject: sandcrawler SQL-based status (sept 2022) --- sql/stats/2022-09-06_stats.txt | 438 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 438 insertions(+) create mode 100644 sql/stats/2022-09-06_stats.txt (limited to 'sql/stats/2022-09-06_stats.txt') diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt new file mode 100644 index 0000000..be2b30c --- /dev/null +++ b/sql/stats/2022-09-06_stats.txt @@ -0,0 +1,438 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 62 GB | 44 GB | 106 GB + "public"."ingest_request" | 51 GB | 50 GB | 101 GB + "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB + "public"."file_meta" | 39 GB | 39 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7469 MB | 66 MB | 7535 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 198175106 | 282695671015403 + (1 row) + + 198 million files, 282 TBytes. + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 197021437 + text/html | 830331 + application/octet-stream | 186669 + application/xml | 42170 + application/xhtml+xml | 38207 + text/plain | 16471 + application/jats+xml | 10385 + application/gzip | 6681 + | 6032 + application/postscript | 4916 + image/jpeg | 4522 + application/vnd.ms-powerpoint | 1672 + application/msword | 946 + application/x-bzip2 | 891 + image/png | 659 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382 + application/x-compress | 274 + video/mp4 | 218 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12800 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 137283420 | 172140506 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 157465613 + warc/revisit | 11337336 + text/html | 1137208 + application/octet-stream | 950380 + text/xml | 528965 + unk | 253294 + application/postscript | 81130 + application/save | 81069 + binary/octet-stream | 68942 + application/x-download | 42717 + application/download | 40628 + image/pdf | 39904 + text/plain | 36445 + application/force-download | 24148 + multipart/form-data | 10972 + application | 5409 + application/x-octetstream | 5192 + application/x-msdownload | 3854 + .pdf | 3518 + application/x-pdf | 3061 + application/octet | 1792 + pdf | 1757 + application/binary | 1399 + file | 1373 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1087 + application/unknown | 1005 + 0 | 773 + text/pdf | 729 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 129001717 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 120797098 + 500 | 8198783 + -4 | 5802 + 503 | 36 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 60469462 + 0.5.5-fatcat | 47472904 + | 12665498 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | doi | 43852308 + pdf | mag | 43701948 + pdf | doaj | 6534341 + html | doaj | 3987669 + pdf | arxiv | 2784589 + pdf | pmc | 2439181 + pdf | dblp | 631716 + html | doi | 126699 + xml | doaj | 23066 + pdf | cnki_covid19 | 2034 + pdf | spn | 1026 + pdf | wanfang_covid19 | 975 + html | spn | 65 + xml | spn | 2 + xml | doi | 1 + (17 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 24742500 + pdf | doi | fatcat-ingest | 15592121 + pdf | doaj | doaj | 6484737 + html | doaj | doaj | 3987468 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 799793 + pdf | dblp | dblp | 631716 + pdf | pmc | fatcat-ingest | 297980 + html | doi | fatcat-ingest | 121508 + pdf | pmc | fatcat-changelog | 112376 + pdf | doaj | fatcat-changelog | 47181 + xml | doaj | doaj | 23066 + html | doi | fatcat-changelog | 5129 + pdf | doaj | fatcat-ingest | 2423 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1814 + pdf | spn | savepapernow-web | 1026 + pdf | wanfang_covid19 | scrape-covid19 | 975 + html | doaj | fatcat-ingest | 201 + html | spn | savepapernow-web | 65 + html | doi | savepapernow-web | 62 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 4 + xml | spn | savepapernow-web | 2 + xml | doi | savepapernow-web | 1 + (30 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 167653 + pdf | doaj | 81517 + pdf | oai | 15282 + html | doaj | 1791 + pdf | unpaywall | 270 + pdf | doi | 22 + (6 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 16024068 | 0.313 + pdf | unpaywall | 43932525 | 36045446 | 0.820 + pdf | doi | 43852308 | 14956080 | 0.341 + pdf | mag | 43701948 | 32768484 | 0.750 + pdf | doaj | 6534341 | 4704066 | 0.720 + html | doaj | 3987669 | 778165 | 0.195 + pdf | arxiv | 2784589 | 2419941 | 0.869 + pdf | pmc | 2439181 | 1897671 | 0.778 + pdf | dblp | 631716 | 305142 | 0.483 + html | doi | 126699 | 75754 | 0.598 + xml | doaj | 23066 | 10381 | 0.450 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | spn | 1026 | 778 | 0.758 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + html | spn | 65 | 13 | 0.200 + xml | spn | 2 | 1 | 0.500 + xml | doi | 1 | 0 | 0.000 + (17 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 94887295 + pdf | no-pdf-link | 33960080 + pdf | no-capture | 20893916 + pdf | terminal-bad-status | 6973765 + pdf | redirect-loop | 5775175 + pdf | link-loop | 4095424 + pdf | skip-url-blocklist | 4037518 + pdf | blocked-cookie | 3508762 + html | wrong-scope | 1783694 + pdf | wrong-mimetype | 1379673 + html | success | 853762 + pdf | gateway-timeout | 635170 + html | no-capture | 381283 + pdf | wayback-content-error | 356694 + pdf | cdx-error | 347700 + pdf | null-body | 336166 + html | unknown-scope | 321874 + html | html-resource-no-capture | 294294 + pdf | forbidden | 291127 + pdf | not-found | 274343 + pdf | too-many-redirects | 264494 + component | wrong-mimetype | 196680 + component | spn2-cdx-lookup-failure | 173615 + component | spn2-backoff | 115840 + html | terminal-bad-status | 106264 + html | null-body | 100296 + pdf | wayback-error | 94748 + html | blocked-cookie | 88537 + component | no-capture | 75278 + pdf | empty-blob | 61157 + pdf | bad-redirect | 58680 + pdf | skip-wall | 57751 + pdf | spn2-error:too-many-redirects | 52873 + html | spn2-backoff | 50577 + pdf | remote-server-error | 41282 + pdf | invalid-host-resolution | 38864 + pdf | read-timeout | 37071 + pdf | spn2-cdx-lookup-failure | 34229 + html | wrong-mimetype | 33643 + pdf | spn2-backoff | 32437 + pdf | petabox-error | 31006 + html | wayback-content-error | 28034 + component | spn2-error | 27044 + pdf | spn2-error:unknown | 25810 + component | gateway-timeout | 25215 + pdf | body-too-large | 21721 + html | petabox-error | 18313 + html | empty-blob | 14393 + html | redirect-loop | 13404 + component | blocked-cookie | 12287 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 45052391 + pdf | | 26117481 + pdf | 301 | 4814786 + html | 200 | 2684821 + pdf | 403 | 1871088 + pdf | 404 | 1254259 + pdf | 302 | 898728 + pdf | 503 | 867548 + pdf | 401 | 851205 + pdf | 429 | 741869 + pdf | 400 | 624519 + component | | 456915 + html | | 442051 + pdf | 500 | 283700 + component | 200 | 197510 + pdf | 410 | 120647 + pdf | 303 | 107947 + html | 404 | 80114 + pdf | 420 | 26722 + pdf | 502 | 19500 + pdf | 409 | 15499 + html | 429 | 15208 + pdf | 509 | 15167 + pdf | 999 | 12186 + pdf | 202 | 11535 + html | 301 | 10213 + xml | | 10018 + pdf | 307 | 8657 + pdf | 402 | 8338 + pdf | 412 | 8064 + pdf | 308 | 6479 + html | 500 | 4746 + xml | 200 | 2668 + pdf | 520 | 2496 + html | 302 | 2289 + pdf | 521 | 2257 + html | 202 | 2177 + pdf | 206 | 1961 + html | 403 | 1775 + pdf | 504 | 1187 + pdf | 421 | 1148 + html | 303 | 1112 + pdf | 406 | 1109 + pdf | 204 | 772 + pdf | 432 | 745 + pdf | 405 | 633 + html | 400 | 632 + pdf | 426 | 515 + pdf | 508 | 503 + pdf | 505 | 469 + (50 rows) -- cgit v1.2.3