## SQL Table Sizes SELECT table_name, pg_size_pretty(table_size) AS table_size, pg_size_pretty(indexes_size) AS indexes_size, pg_size_pretty(total_size) AS total_size FROM ( SELECT table_name, pg_table_size(table_name) AS table_size, pg_indexes_size(table_name) AS indexes_size, pg_total_relation_size(table_name) AS total_size FROM ( SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name FROM information_schema.tables WHERE table_schema = 'public' ) AS all_tables ORDER BY total_size DESC ) AS pretty_sizes; table_name | table_size | indexes_size | total_size ------------------------------------+------------+--------------+------------ "public"."crossref" | 459 GB | 10 GB | 470 GB "public"."grobid" | 98 GB | 13 GB | 112 GB "public"."cdx" | 62 GB | 44 GB | 106 GB "public"."ingest_request" | 51 GB | 50 GB | 101 GB "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB "public"."file_meta" | 39 GB | 39 GB | 78 GB "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB "public"."shadow" | 9517 MB | 8026 MB | 17 GB "public"."html_meta" | 7469 MB | 66 MB | 7535 MB "public"."petabox" | 403 MB | 461 MB | 864 MB "public"."pdftrio" | 550 MB | 297 MB | 847 MB "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes (16 rows) ## File Metadata Counts and total file size: SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; total_count | total_size -------------+----------------- 198175106 | 282695671015403 (1 row) 198 million files, 282 TBytes. Top mimetypes: SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; mimetype | count ---------------------------------------------------------------------------+----------- application/pdf | 197021437 text/html | 830331 application/octet-stream | 186669 application/xml | 42170 application/xhtml+xml | 38207 text/plain | 16471 application/jats+xml | 10385 application/gzip | 6681 | 6032 application/postscript | 4916 image/jpeg | 4522 application/vnd.ms-powerpoint | 1672 application/msword | 946 application/x-bzip2 | 891 image/png | 659 application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440 application/x-dosexec | 404 image/gif | 395 application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382 application/x-compress | 274 video/mp4 | 218 application/zip | 131 application/CDFV2-unknown | 99 application/mac-binhex40 | 79 application/zlib | 68 text/x-tex | 44 application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 text/x-php | 37 image/g3fax | 35 text/rtf | 33 (30 rows) Missing full metadata: SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; count ------- 12800 (1 row) ## CDX Total and unique-by-sha1 counts: SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; unique_sha1 | total -------------+----------- 137283420 | 172140506 (1 row) mimetype counts: SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; mimetype | count ----------------------------+----------- application/pdf | 157465613 warc/revisit | 11337336 text/html | 1137208 application/octet-stream | 950380 text/xml | 528965 unk | 253294 application/postscript | 81130 application/save | 81069 binary/octet-stream | 68942 application/x-download | 42717 application/download | 40628 image/pdf | 39904 text/plain | 36445 application/force-download | 24148 multipart/form-data | 10972 application | 5409 application/x-octetstream | 5192 application/x-msdownload | 3854 .pdf | 3518 application/x-pdf | 3061 application/octet | 1792 pdf | 1757 application/binary | 1399 file | 1373 file/unknown | 1345 application/pdf' | 1196 application/octetstream | 1087 application/unknown | 1005 0 | 773 text/pdf | 729 (30 rows) ## GROBID Counts: SELECT COUNT(*) AS total_files FROM grobid; total_files ------------- 129001717 (1 row) Status? SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; status_code | count -------------+----------- 200 | 120797098 500 | 8198783 -4 | 5802 503 | 36 (4 rows) What version used? SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; grobid_version | count ----------------------+---------- 0.7.0-131-gdd0251d9f | 60469462 0.5.5-fatcat | 47472904 | 12665498 0.7.0-104-gbeebd9a6b | 189243 (4 rows) ## Petabox Counts: SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; unique_sha1 | total -------------+--------- 2868825 | 2887834 (1 row) ## Ingests Requests by source: SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; ingest_type | link_source | count -------------+-----------------+---------- pdf | oai | 51185088 pdf | unpaywall | 43932525 pdf | doi | 43852308 pdf | mag | 43701948 pdf | doaj | 6534341 html | doaj | 3987669 pdf | arxiv | 2784589 pdf | pmc | 2439181 pdf | dblp | 631716 html | doi | 126699 xml | doaj | 23066 pdf | cnki_covid19 | 2034 pdf | spn | 1026 pdf | wanfang_covid19 | 975 html | spn | 65 xml | spn | 2 xml | doi | 1 (17 rows) SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; ingest_type | link_source | ingest_request_source | count -------------+-----------------+-------------------------+---------- pdf | oai | metha-bulk | 51185088 pdf | unpaywall | unpaywall | 43932525 pdf | mag | mag-corpus | 43701948 pdf | doi | fatcat-changelog | 24742500 pdf | doi | fatcat-ingest | 15592121 pdf | doaj | doaj | 6484737 html | doaj | doaj | 3987468 pdf | doi | fatcat-ingest-container | 3515873 pdf | pmc | fatcat-ingest-container | 2028825 pdf | arxiv | fatcat-ingest | 1984766 pdf | arxiv | fatcat-changelog | 799793 pdf | dblp | dblp | 631716 pdf | pmc | fatcat-ingest | 297980 html | doi | fatcat-ingest | 121508 pdf | pmc | fatcat-changelog | 112376 pdf | doaj | fatcat-changelog | 47181 xml | doaj | doaj | 23066 html | doi | fatcat-changelog | 5129 pdf | doaj | fatcat-ingest | 2423 pdf | cnki_covid19 | scrape-covid19 | 2034 pdf | doi | savepapernow-web | 1814 pdf | spn | savepapernow-web | 1026 pdf | wanfang_covid19 | scrape-covid19 | 975 html | doaj | fatcat-ingest | 201 html | spn | savepapernow-web | 65 html | doi | savepapernow-web | 62 pdf | arxiv | fatcat-ingest-container | 26 pdf | arxiv | savepapernow-web | 4 xml | spn | savepapernow-web | 2 xml | doi | savepapernow-web | 1 (30 rows) Uncrawled requests by source: # TODO: verify this? SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) FROM ingest_request LEFT JOIN ingest_file_result ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type WHERE ingest_file_result.base_url IS NULL GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; ingest_type | link_source | count -------------+-------------+-------- pdf | mag | 167653 pdf | doaj | 81517 pdf | oai | 15282 html | doaj | 1791 pdf | unpaywall | 270 pdf | doi | 22 (6 rows) Results by source: SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) as attempts, COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction FROM ingest_request LEFT JOIN ingest_file_result ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type AND ingest_file_result.ingest_type IS NOT NULL GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; ingest_type | link_source | attempts | hits | fraction -------------+-----------------+----------+----------+---------- pdf | oai | 51185088 | 16024068 | 0.313 pdf | unpaywall | 43932525 | 36045446 | 0.820 pdf | doi | 43852308 | 14956080 | 0.341 pdf | mag | 43701948 | 32768484 | 0.750 pdf | doaj | 6534341 | 4704066 | 0.720 html | doaj | 3987669 | 778165 | 0.195 pdf | arxiv | 2784589 | 2419941 | 0.869 pdf | pmc | 2439181 | 1897671 | 0.778 pdf | dblp | 631716 | 305142 | 0.483 html | doi | 126699 | 75754 | 0.598 xml | doaj | 23066 | 10381 | 0.450 pdf | cnki_covid19 | 2034 | 0 | 0.000 pdf | spn | 1026 | 778 | 0.758 pdf | wanfang_covid19 | 975 | 764 | 0.784 html | spn | 65 | 13 | 0.200 xml | spn | 2 | 1 | 0.500 xml | doi | 1 | 0 | 0.000 (17 rows) Ingest result by status: SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; ingest_type | status | count -------------+-------------------------------+---------- pdf | success | 94887295 pdf | no-pdf-link | 33960080 pdf | no-capture | 20893916 pdf | terminal-bad-status | 6973765 pdf | redirect-loop | 5775175 pdf | link-loop | 4095424 pdf | skip-url-blocklist | 4037518 pdf | blocked-cookie | 3508762 html | wrong-scope | 1783694 pdf | wrong-mimetype | 1379673 html | success | 853762 pdf | gateway-timeout | 635170 html | no-capture | 381283 pdf | wayback-content-error | 356694 pdf | cdx-error | 347700 pdf | null-body | 336166 html | unknown-scope | 321874 html | html-resource-no-capture | 294294 pdf | forbidden | 291127 pdf | not-found | 274343 pdf | too-many-redirects | 264494 component | wrong-mimetype | 196680 component | spn2-cdx-lookup-failure | 173615 component | spn2-backoff | 115840 html | terminal-bad-status | 106264 html | null-body | 100296 pdf | wayback-error | 94748 html | blocked-cookie | 88537 component | no-capture | 75278 pdf | empty-blob | 61157 pdf | bad-redirect | 58680 pdf | skip-wall | 57751 pdf | spn2-error:too-many-redirects | 52873 html | spn2-backoff | 50577 pdf | remote-server-error | 41282 pdf | invalid-host-resolution | 38864 pdf | read-timeout | 37071 pdf | spn2-cdx-lookup-failure | 34229 html | wrong-mimetype | 33643 pdf | spn2-backoff | 32437 pdf | petabox-error | 31006 html | wayback-content-error | 28034 component | spn2-error | 27044 pdf | spn2-error:unknown | 25810 component | gateway-timeout | 25215 pdf | body-too-large | 21721 html | petabox-error | 18313 html | empty-blob | 14393 html | redirect-loop | 13404 component | blocked-cookie | 12287 (50 rows) Failed ingest by terminal status code: SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; ingest_type | terminal_status_code | count -------------+----------------------+---------- pdf | 200 | 45052391 pdf | | 26117481 pdf | 301 | 4814786 html | 200 | 2684821 pdf | 403 | 1871088 pdf | 404 | 1254259 pdf | 302 | 898728 pdf | 503 | 867548 pdf | 401 | 851205 pdf | 429 | 741869 pdf | 400 | 624519 component | | 456915 html | | 442051 pdf | 500 | 283700 component | 200 | 197510 pdf | 410 | 120647 pdf | 303 | 107947 html | 404 | 80114 pdf | 420 | 26722 pdf | 502 | 19500 pdf | 409 | 15499 html | 429 | 15208 pdf | 509 | 15167 pdf | 999 | 12186 pdf | 202 | 11535 html | 301 | 10213 xml | | 10018 pdf | 307 | 8657 pdf | 402 | 8338 pdf | 412 | 8064 pdf | 308 | 6479 html | 500 | 4746 xml | 200 | 2668 pdf | 520 | 2496 html | 302 | 2289 pdf | 521 | 2257 html | 202 | 2177 pdf | 206 | 1961 html | 403 | 1775 pdf | 504 | 1187 pdf | 421 | 1148 html | 303 | 1112 pdf | 406 | 1109 pdf | 204 | 772 pdf | 432 | 745 pdf | 405 | 633 html | 400 | 632 pdf | 426 | 515 pdf | 508 | 503 pdf | 505 | 469 (50 rows)