## SQL Table Sizes Size: 551.34G SELECT table_name, pg_size_pretty(table_size) AS table_size, pg_size_pretty(indexes_size) AS indexes_size, pg_size_pretty(total_size) AS total_size FROM ( SELECT table_name, pg_table_size(table_name) AS table_size, pg_indexes_size(table_name) AS indexes_size, pg_total_relation_size(table_name) AS total_size FROM ( SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name FROM information_schema.tables WHERE table_schema = 'public' ) AS all_tables ORDER BY total_size DESC ) AS pretty_sizes; table_name | table_size | indexes_size | total_size -------------------------------+------------+--------------+------------ "public"."cdx" | 49 GB | 50 GB | 100 GB "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB "public"."ingest_request" | 39 GB | 45 GB | 83 GB "public"."grobid" | 70 GB | 8613 MB | 78 GB "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB "public"."file_meta" | 35 GB | 31 GB | 66 GB "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB "public"."shadow" | 9517 MB | 10 GB | 20 GB "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB "public"."html_meta" | 1172 MB | 10 MB | 1182 MB "public"."pdftrio" | 618 MB | 432 MB | 1051 MB "public"."petabox" | 403 MB | 594 MB | 997 MB (12 rows) ## File Metadata Counts and total file size: SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; total_count | total_size -------------+----------------- 174200807 | 234313766162033 (1 row) Top mimetypes: SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; mimetype | count ---------------------------------------------------------------------------+----------- application/pdf | 173816433 application/octet-stream | 155534 text/html | 115821 application/xml | 42170 application/xhtml+xml | 24347 text/plain | 15990 application/jats+xml | 6899 application/gzip | 6491 | 6034 application/postscript | 4912 application/vnd.ms-powerpoint | 1672 application/msword | 921 application/x-bzip2 | 891 image/jpeg | 721 image/gif | 389 application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 application/x-compress | 272 application/zip | 131 application/CDFV2-unknown | 99 image/png | 88 application/mac-binhex40 | 79 application/x-dosexec | 51 text/x-tex | 44 application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 text/x-php | 37 text/rtf | 33 application/x-dvi | 29 application/x-rar | 29 application/vnd.ms-excel | 28 message/rfc822 | 26 (30 rows) Missing full metadata: SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; count ------- 62271 (1 row) ## CDX Total and unique-by-sha1 counts: SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; unique_sha1 | total -------------+----------- 113880640 | 141793694 (1 row) mimetype counts: SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; mimetype | count ----------------------------+----------- application/pdf | 131346703 warc/revisit | 8394443 text/xml | 525481 application/octet-stream | 502400 text/html | 417579 unk | 186703 application/postscript | 81095 application/save | 80915 binary/octet-stream | 66698 application/x-download | 35771 text/plain | 35606 image/pdf | 33904 application/download | 29701 application/force-download | 16726 multipart/form-data | 6878 application/x-msdownload | 3843 application | 3724 application/x-octetstream | 3550 .pdf | 3138 application/x-pdf | 2780 application/binary | 1332 pdf | 1247 file/unknown | 1200 application/pdf' | 1192 file | 1108 application/unknown | 978 application/octetstream | 856 application/blob | 673 text/pdf | 672 0 | 546 (30 rows) ## GROBID Counts: SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; total_files | unique_releases -------------+----------------- 105594307 | 19594878 (1 row) Status? SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; status_code | count -------------+---------- 200 | 97714631 500 | 7875192 -4 | 4772 503 | 520 (4 rows) What version used? SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; grobid_version | count ----------------+---------- 0.5.5-fatcat | 84822508 | 12892147 (2 rows) ## Petabox Counts: SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; unique_sha1 | total -------------+--------- 2868825 | 2887834 (1 row) ## Ingests Requests by source: SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; ingest_type | link_source | count -------------+-----------------+---------- pdf | oai | 51185088 pdf | mag | 35015357 pdf | unpaywall | 31772942 pdf | doi | 23528817 pdf | doaj | 4264610 html | doaj | 2429003 pdf | pmc | 2277417 pdf | arxiv | 2143549 xml | doaj | 9442 html | doi | 3022 pdf | cnki_covid19 | 2034 pdf | wanfang_covid19 | 975 pdf | spn | 469 html | spn | 9 (14 rows) SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; ingest_type | link_source | ingest_request_source | count -------------+-----------------+-------------------------+---------- pdf | oai | metha-bulk | 51185088 pdf | mag | mag-corpus | 35015357 pdf | unpaywall | unpaywall | 31772942 pdf | doi | fatcat-changelog | 11010764 pdf | doi | fatcat-ingest | 9002119 pdf | doaj | doaj | 4264610 pdf | doi | fatcat-ingest-container | 3515873 html | doaj | doaj | 2429003 pdf | pmc | fatcat-ingest-container | 2028825 pdf | arxiv | fatcat-ingest | 1767703 pdf | arxiv | fatcat-changelog | 375818 pdf | pmc | fatcat-ingest | 211264 pdf | pmc | fatcat-changelog | 37328 xml | doaj | doaj | 9442 html | doi | fatcat-ingest | 3018 pdf | cnki_covid19 | scrape-covid19 | 2034 pdf | wanfang_covid19 | scrape-covid19 | 975 pdf | spn | savepapernow-web | 469 pdf | doi | savepapernow-web | 74 pdf | arxiv | fatcat-ingest-container | 26 html | spn | savepapernow-web | 9 html | doi | savepapernow-web | 4 pdf | arxiv | savepapernow-web | 2 (23 rows) Uncrawled requests by source: # TODO: verify this? SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) FROM ingest_request LEFT JOIN ingest_file_result ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type WHERE ingest_file_result.base_url IS NULL GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; ingest_type | link_source | count -------------+-------------+-------- pdf | mag | 168462 pdf | oai | 15286 pdf | doaj | 2068 html | doaj | 620 pdf | unpaywall | 13 (5 rows) Results by source: SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) as attempts, COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction FROM ingest_request LEFT JOIN ingest_file_result ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type AND ingest_file_result.ingest_type IS NOT NULL GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; ingest_type | link_source | attempts | hits | fraction -------------+-----------------+----------+----------+---------- pdf | oai | 51185088 | 14163500 | 0.277 pdf | mag | 35015357 | 24818176 | 0.709 pdf | unpaywall | 31772942 | 25018501 | 0.787 pdf | doi | 23529041 | 5773728 | 0.245 pdf | doaj | 4264610 | 2851328 | 0.669 html | doaj | 2429003 | 122937 | 0.051 pdf | pmc | 2277417 | 1736491 | 0.762 pdf | arxiv | 2143549 | 2011378 | 0.938 xml | doaj | 9442 | 6897 | 0.730 html | doi | 3022 | 957 | 0.317 pdf | cnki_covid19 | 2034 | 0 | 0.000 pdf | wanfang_covid19 | 975 | 764 | 0.784 pdf | spn | 469 | 328 | 0.699 html | spn | 9 | 2 | 0.222 (14 rows) Ingest result by status: SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; ingest_type | status | count -------------+--------------------------------+---------- pdf | success | 66487928 pdf | no-pdf-link | 29279677 pdf | no-capture | 22765431 pdf | redirect-loop | 9155767 pdf | terminal-bad-status | 3549665 pdf | link-loop | 2592983 html | wrong-scope | 1088793 pdf | wrong-mimetype | 792563 pdf | gateway-timeout | 478181 html | no-capture | 423917 pdf | wayback-content-error | 355828 pdf | cdx-error | 343862 pdf | null-body | 328774 pdf | forbidden | 286647 pdf | spn2-cdx-lookup-failure | 276769 pdf | spn2-wayback-error | 276080 pdf | skip-url-blocklist | 265473 html | redirect-loop | 212916 pdf | not-found | 204367 html | unknown-scope | 204112 html | html-resource-no-capture | 166034 pdf | blocked-cookie | 160336 pdf | too-many-redirects | 152984 html | success | 123896 pdf | wayback-error | 114388 html | null-body | 100296 pdf | spn2-error:too-many-redirects | 58336 html | wayback-content-error | 53926 pdf | invalid-host-resolution | 37226 pdf | petabox-error | 37177 pdf | remote-server-error | 36439 pdf | spn2-error | 27556 pdf | spn2-error:proxy-error | 25486 pdf | read-timeout | 20745 html | wrong-mimetype | 18928 html | terminal-bad-status | 14059 html | petabox-error | 13533 pdf | bad-redirect | 7535 xml | success | 6897 html | cdx-error | 6823 pdf | spn2-error:bad-request | 4664 pdf | spn2-error:unauthorized | 4391 pdf | spn-remote-error | 4206 pdf | spn2-error:service-unavailable | 2614 pdf | spn2-error:job-failed | 2562 xml | null-body | 2353 pdf | other-mimetype | 2304 pdf | error | 1905 html | spn2-cdx-lookup-failure | 1018 pdf | redirects-exceeded | 1015 (50 rows) Failed ingest by terminal status code: SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; ingest_type | terminal_status_code | count -------------+----------------------+---------- pdf | 200 | 36515867 pdf | | 22909334 pdf | 301 | 7969702 html | 200 | 1653303 pdf | 503 | 928507 pdf | 403 | 823755 pdf | 302 | 792842 pdf | 400 | 462108 html | | 426474 pdf | 404 | 422163 pdf | 401 | 270611 pdf | 500 | 248675 html | 301 | 211713 pdf | 303 | 109686 pdf | 410 | 50648 pdf | 502 | 37663 pdf | 429 | 31982 pdf | 420 | 26603 pdf | 509 | 15113 pdf | 409 | 14835 html | 404 | 9573 pdf | 999 | 9296 pdf | 307 | 3972 pdf | 308 | 3914 html | 500 | 3625 pdf | 202 | 3515 xml | 200 | 2537 pdf | 520 | 2072 pdf | 206 | 1665 pdf | 521 | 1075 html | 302 | 1072 pdf | 504 | 1000 pdf | 412 | 476 pdf | 300 | 434 pdf | 505 | 429 pdf | 406 | 393 html | 403 | 382 html | 503 | 378 pdf | 421 | 298 html | 303 | 268 pdf | 508 | 195 pdf | 226 | 166 pdf | 402 | 70 html | 502 | 68 pdf | 408 | 50 pdf | 204 | 34 pdf | 416 | 29 pdf | 501 | 29 pdf | 530 | 27 pdf | 507 | 21 (50 rows) ## Fatcat Files Count of PDF files that GROBID processed and matched to a release (via glutton), but no PDF in `fatcat_file`: SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count FROM grobid LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex WHERE fatcat_file.sha1hex IS NULL AND grobid.fatcat_release IS NOT NULL; total_count | release_count -------------+--------------- 8514315 | 6401104 (1 row)