diff options
Diffstat (limited to 'sql/stats')
-rw-r--r-- | sql/stats/2020-07-23_stats.txt | 347 | ||||
-rw-r--r-- | sql/stats/2020-09-14_stats.txt | 340 | ||||
-rw-r--r-- | sql/stats/2021-04-07_stats.txt | 430 | ||||
-rw-r--r-- | sql/stats/2021-04-08_table_sizes.txt | 40 | ||||
-rw-r--r-- | sql/stats/2021-04-12_ingest_domain_summary_30d.txt | 345 | ||||
-rw-r--r-- | sql/stats/2021-11-01_table_sizes.txt | 19 | ||||
-rw-r--r-- | sql/stats/2021-11-26_stats.txt | 424 | ||||
-rw-r--r-- | sql/stats/2021-12-02_table_sizes.txt | 22 | ||||
-rw-r--r-- | sql/stats/2022-04-26_stats.txt | 432 | ||||
-rw-r--r-- | sql/stats/2022-04-27_crawl_changelog.txt | 191 | ||||
-rw-r--r-- | sql/stats/2022-05-11_crawl_changelog.txt | 410 | ||||
-rw-r--r-- | sql/stats/2022-09-06_stats.txt | 438 | ||||
-rw-r--r-- | sql/stats/2022-11-23_table_sizes.txt | 21 | ||||
-rw-r--r-- | sql/stats/README.md | 33 |
14 files changed, 3470 insertions, 22 deletions
diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt new file mode 100644 index 0000000..d1993fc --- /dev/null +++ b/sql/stats/2020-07-23_stats.txt @@ -0,0 +1,347 @@ + +Summary: + +- very many more PDFs have been grobid-ed vs. pdf_meta-ed +- about 1 million file_meta still have partial metadata (eg, no sha256) +- database size still under 0.5 TByte +- there are about a million CDX error ingest requests, and hundreds of + thousands of SPN errors which could be re-run + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 42 GB | 84 GB + "public"."ingest_request" | 34 GB | 39 GB | 73 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 61 GB | 7742 MB | 69 GB + "public"."file_meta" | 32 GB | 29 GB | 61 GB + "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + Size: 466.91G + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 161944425 | 204,402,677,360,189 + (1 row) + + # 161.9 mil; 204 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 161691608 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1015337 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96537611 | 116281981 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------+----------- + application/pdf | 108706978 + warc/revisit | 5912013 + text/xml | 519042 + application/octet-stream | 307782 + text/html | 295634 + unk | 156937 + application/postscript | 81079 + application/save | 80871 + binary/octet-stream | 61263 + text/plain | 31495 + application/x-download | 30511 + application/download | 26716 + image/pdf | 26357 + application/force-download | 10541 + multipart/form-data | 5551 + application/x-msdownload | 3724 + application/x-octetstream | 3216 + application | 3171 + .pdf | 2728 + application/x-pdf | 2563 + application/binary | 1306 + application/pdf' | 1192 + pdf | 1180 + [...] + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + + total_files | unique_releases + -------------+----------------- + 95557413 | 18020570 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + + status_code | count + -------------+---------- + 200 | 88450610 + 500 | 7101098 + -4 | 4133 + 503 | 110 + + SELECT status, COUNT(*) FROM grobid GROUP BY ORDER BY COUNT DESC LIMIT 10; + + status | count + ----------------+---------- + success | 73814297 + | 14638412 + error | 7101308 + error-timeout | 4133 + bad-grobid-xml | 6 + (5 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 73813427 + | 14638425 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 16589669 + pdf | pmc | 2231113 + pdf | arxiv | 794693 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 148 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8320832 + pdf | doi | fatcat-changelog | 4752956 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630750 + pdf | pmc | fatcat-ingest | 194781 + pdf | arxiv | fatcat-changelog | 163924 + pdf | pmc | fatcat-changelog | 7507 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 148 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 19 + pdf | arxiv | savepapernow-web | 2 + +Uncrawled requests by source: + + # TODO: verify this? seems wrong + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | mag | 4097008 + pdf | oai | 15287 + pdf | unpaywall | 1 + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 5346057 | 0.104 + pdf | mag | 35015357 | 22199271 | 0.634 + pdf | unpaywall | 27653003 | 22067338 | 0.798 + pdf | doi | 16589700 | 3207661 | 0.193 + pdf | pmc | 2231113 | 1696976 | 0.761 + pdf | arxiv | 794727 | 645607 | 0.812 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 148 | 114 | 0.770 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 46465271 + pdf | no-capture | 46115869 + pdf | no-pdf-link | 13877460 + pdf | redirect-loop | 5943956 + pdf | terminal-bad-status | 1962754 + pdf | link-loop | 1630078 + pdf | cdx-error | 1014409 + pdf | gateway-timeout | 459340 + pdf | wrong-mimetype | 321774 + pdf | skip-url-blocklist | 220629 + pdf | wayback-error | 220453 + pdf | spn2-cdx-lookup-failure | 143963 + pdf | null-body | 113384 + pdf | spn-error | 101773 + pdf | invalid-host-resolution | 37367 + pdf | spn-remote-error | 28886 + pdf | petabox-error | 22997 + pdf | spn2-error | 16342 + pdf | spn2-error:job-failed | 5017 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 746 + pdf | spn2-error:soft-time-limit-exceeded | 632 + pdf | spn2-error:proxy-error | 437 + pdf | spn2-error:invalid-url-syntax | 417 + pdf | timeout | 417 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + + total_count | release_count + -------------+--------------- + 5862666 | 4728824 + (1 row) + +## PDF Meta + +Total rows: + + SELECT COUNT(*) as total_count FROM pdf_meta; + + + total_count + ------------- + 21961874 + +By status: + + SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC; + + status | count + ----------------+---------- + success | 21788507 + parse-error | 78196 + text-too-large | 60595 + not-pdf | 31679 + error-wayback | 2639 + bad-unicode | 251 + bad-pdf | 6 + empty-blob | 1 + (8 rows) + diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt new file mode 100644 index 0000000..3bc27b0 --- /dev/null +++ b/sql/stats/2020-09-14_stats.txt @@ -0,0 +1,340 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 44 GB | 45 GB | 89 GB + "public"."grobid" | 66 GB | 8127 MB | 74 GB + "public"."ingest_request" | 34 GB | 40 GB | 73 GB + "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."file_meta" | 33 GB | 30 GB | 63 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 167021210 | 221982345333674 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 166765214 + application/octet-stream | 155517 + application/xml | 42170 + text/html | 18708 + text/plain | 15990 + application/gzip | 6491 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62960 + (1 row) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 102123051 | 126550160 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ----------------------------+----------- + application/pdf | 116885565 + warc/revisit | 7951816 + text/xml | 519042 + application/octet-stream | 327639 + text/html | 295725 + unk | 172491 + application/postscript | 81095 + application/save | 80900 + binary/octet-stream | 61783 + text/plain | 33684 + image/pdf | 32856 + application/x-download | 32418 + application/download | 27672 + application/force-download | 10892 + multipart/form-data | 5750 + application/x-msdownload | 3832 + application/x-octetstream | 3516 + application | 3499 + .pdf | 3038 + application/x-pdf | 2701 + application/binary | 1322 + pdf | 1232 + file/unknown | 1199 + application/pdf' | 1192 + file | 979 + (25 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 101494314 | 18919012 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 93730358 + 500 | 7759103 + -4 | 4683 + 503 | 150 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 80838234 + | 12892145 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; +unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 17362763 + pdf | pmc | 2248854 + pdf | arxiv | 835400 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 197 + (9 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8399261 + pdf | doi | fatcat-changelog | 5449349 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 634665 + pdf | pmc | fatcat-ingest | 210453 + pdf | arxiv | fatcat-changelog | 200707 + pdf | pmc | fatcat-changelog | 9582 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 197 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 21 + pdf | arxiv | savepapernow-web | 2 + (17 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 170304 + pdf | oai | 15287 + pdf | unpaywall | 1 + (3 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14144314 | 0.276 + pdf | mag | 35015357 | 24811947 | 0.709 + pdf | unpaywall | 27653003 | 22302629 | 0.807 + pdf | doi | 17363369 | 3533568 | 0.204 + pdf | pmc | 2248860 | 1713197 | 0.762 + pdf | arxiv | 835400 | 685219 | 0.820 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 197 | 138 | 0.701 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 58265365 + pdf | no-pdf-link | 27216435 + pdf | no-capture | 21982611 + pdf | redirect-loop | 8457469 + pdf | terminal-bad-status | 2695023 + pdf | link-loop | 2209672 + pdf | wrong-mimetype | 767508 + pdf | gateway-timeout | 548870 + pdf | cdx-error | 391611 + pdf | skip-url-blocklist | 220661 + pdf | null-body | 182215 + pdf | wayback-error | 146869 + pdf | spn2-cdx-lookup-failure | 107229 + pdf | spn-error | 85128 + pdf | invalid-host-resolution | 37352 + pdf | petabox-error | 32490 + pdf | spn2-error | 29212 + pdf | spn-remote-error | 27927 + pdf | other-mimetype | 2305 + pdf | bad-redirect | 1524 + pdf | spn2-error:job-failed | 1521 + pdf | timeout | 842 + pdf | spn2-error:soft-time-limit-exceeded | 793 + pdf | redirects-exceeded | 748 + pdf | spn2-error:invalid-url-syntax | 417 + (25 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) + FROM ingest_file_result + WHERE hit = false + GROUP BY ingest_type, terminal_status_code + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 34064937 + pdf | | 20514531 + pdf | 301 | 7271700 + pdf | 302 | 720632 + pdf | 503 | 712697 + pdf | 400 | 444209 + pdf | 404 | 331495 + pdf | 403 | 323030 + pdf | 401 | 259327 + pdf | 500 | 236122 + pdf | 303 | 101609 + pdf | 429 | 47738 + pdf | 502 | 36183 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14790 + pdf | 999 | 8996 + pdf | 307 | 3769 + pdf | 308 | 3422 + pdf | 202 | 3228 + pdf | 520 | 2058 + pdf | 410 | 1734 + pdf | 521 | 1033 + pdf | 504 | 868 + pdf | 505 | 424 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 6600758 | 5213294 + (1 row) + diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt new file mode 100644 index 0000000..fca76b9 --- /dev/null +++ b/sql/stats/2021-04-07_stats.txt @@ -0,0 +1,430 @@ + +## SQL Table Sizes + + Size: 551.34G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 50 GB | 100 GB + "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB + "public"."ingest_request" | 39 GB | 45 GB | 83 GB + "public"."grobid" | 70 GB | 8613 MB | 78 GB + "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB + "public"."file_meta" | 35 GB | 31 GB | 66 GB + "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB + "public"."shadow" | 9517 MB | 10 GB | 20 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."html_meta" | 1172 MB | 10 MB | 1182 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (12 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 174200807 | 234313766162033 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 173816433 + application/octet-stream | 155534 + text/html | 115821 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 721 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 + application/x-compress | 272 + application/zip | 131 + application/CDFV2-unknown | 99 + image/png | 88 + application/mac-binhex40 | 79 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + application/vnd.ms-excel | 28 + message/rfc822 | 26 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62271 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 113880640 | 141793694 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 131346703 + warc/revisit | 8394443 + text/xml | 525481 + application/octet-stream | 502400 + text/html | 417579 + unk | 186703 + application/postscript | 81095 + application/save | 80915 + binary/octet-stream | 66698 + application/x-download | 35771 + text/plain | 35606 + image/pdf | 33904 + application/download | 29701 + application/force-download | 16726 + multipart/form-data | 6878 + application/x-msdownload | 3843 + application | 3724 + application/x-octetstream | 3550 + .pdf | 3138 + application/x-pdf | 2780 + application/binary | 1332 + pdf | 1247 + file/unknown | 1200 + application/pdf' | 1192 + file | 1108 + application/unknown | 978 + application/octetstream | 856 + application/blob | 673 + text/pdf | 672 + 0 | 546 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 105594307 | 19594878 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+---------- + 200 | 97714631 + 500 | 7875192 + -4 | 4772 + 503 | 520 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 84822508 + | 12892147 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 31772942 + pdf | doi | 23528817 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2277417 + pdf | arxiv | 2143549 + xml | doaj | 9442 + html | doi | 3022 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 469 + html | spn | 9 + (14 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 31772942 + pdf | doi | fatcat-changelog | 11010764 + pdf | doi | fatcat-ingest | 9002119 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767703 + pdf | arxiv | fatcat-changelog | 375818 + pdf | pmc | fatcat-ingest | 211264 + pdf | pmc | fatcat-changelog | 37328 + xml | doaj | doaj | 9442 + html | doi | fatcat-ingest | 3018 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 469 + pdf | doi | savepapernow-web | 74 + pdf | arxiv | fatcat-ingest-container | 26 + html | spn | savepapernow-web | 9 + html | doi | savepapernow-web | 4 + pdf | arxiv | savepapernow-web | 2 + (23 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 168462 + pdf | oai | 15286 + pdf | doaj | 2068 + html | doaj | 620 + pdf | unpaywall | 13 + (5 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14163500 | 0.277 + pdf | mag | 35015357 | 24818176 | 0.709 + pdf | unpaywall | 31772942 | 25018501 | 0.787 + pdf | doi | 23529041 | 5773728 | 0.245 + pdf | doaj | 4264610 | 2851328 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2277417 | 1736491 | 0.762 + pdf | arxiv | 2143549 | 2011378 | 0.938 + xml | doaj | 9442 | 6897 | 0.730 + html | doi | 3022 | 957 | 0.317 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 469 | 328 | 0.699 + html | spn | 9 | 2 | 0.222 + (14 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+--------------------------------+---------- + pdf | success | 66487928 + pdf | no-pdf-link | 29279677 + pdf | no-capture | 22765431 + pdf | redirect-loop | 9155767 + pdf | terminal-bad-status | 3549665 + pdf | link-loop | 2592983 + html | wrong-scope | 1088793 + pdf | wrong-mimetype | 792563 + pdf | gateway-timeout | 478181 + html | no-capture | 423917 + pdf | wayback-content-error | 355828 + pdf | cdx-error | 343862 + pdf | null-body | 328774 + pdf | forbidden | 286647 + pdf | spn2-cdx-lookup-failure | 276769 + pdf | spn2-wayback-error | 276080 + pdf | skip-url-blocklist | 265473 + html | redirect-loop | 212916 + pdf | not-found | 204367 + html | unknown-scope | 204112 + html | html-resource-no-capture | 166034 + pdf | blocked-cookie | 160336 + pdf | too-many-redirects | 152984 + html | success | 123896 + pdf | wayback-error | 114388 + html | null-body | 100296 + pdf | spn2-error:too-many-redirects | 58336 + html | wayback-content-error | 53926 + pdf | invalid-host-resolution | 37226 + pdf | petabox-error | 37177 + pdf | remote-server-error | 36439 + pdf | spn2-error | 27556 + pdf | spn2-error:proxy-error | 25486 + pdf | read-timeout | 20745 + html | wrong-mimetype | 18928 + html | terminal-bad-status | 14059 + html | petabox-error | 13533 + pdf | bad-redirect | 7535 + xml | success | 6897 + html | cdx-error | 6823 + pdf | spn2-error:bad-request | 4664 + pdf | spn2-error:unauthorized | 4391 + pdf | spn-remote-error | 4206 + pdf | spn2-error:service-unavailable | 2614 + pdf | spn2-error:job-failed | 2562 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1905 + html | spn2-cdx-lookup-failure | 1018 + pdf | redirects-exceeded | 1015 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36515867 + pdf | | 22909334 + pdf | 301 | 7969702 + html | 200 | 1653303 + pdf | 503 | 928507 + pdf | 403 | 823755 + pdf | 302 | 792842 + pdf | 400 | 462108 + html | | 426474 + pdf | 404 | 422163 + pdf | 401 | 270611 + pdf | 500 | 248675 + html | 301 | 211713 + pdf | 303 | 109686 + pdf | 410 | 50648 + pdf | 502 | 37663 + pdf | 429 | 31982 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14835 + html | 404 | 9573 + pdf | 999 | 9296 + pdf | 307 | 3972 + pdf | 308 | 3914 + html | 500 | 3625 + pdf | 202 | 3515 + xml | 200 | 2537 + pdf | 520 | 2072 + pdf | 206 | 1665 + pdf | 521 | 1075 + html | 302 | 1072 + pdf | 504 | 1000 + pdf | 412 | 476 + pdf | 300 | 434 + pdf | 505 | 429 + pdf | 406 | 393 + html | 403 | 382 + html | 503 | 378 + pdf | 421 | 298 + html | 303 | 268 + pdf | 508 | 195 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 408 | 50 + pdf | 204 | 34 + pdf | 416 | 29 + pdf | 501 | 29 + pdf | 530 | 27 + pdf | 507 | 21 + (50 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 8514315 | 6401104 + (1 row) diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt new file mode 100644 index 0000000..a8a9cd5 --- /dev/null +++ b/sql/stats/2021-04-08_table_sizes.txt @@ -0,0 +1,40 @@ + +## SQL Table Sizes + + Size: 467.23G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 26 GB | 76 GB + "public"."grobid" | 69 GB | 6834 MB | 75 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."ingest_request" | 39 GB | 32 GB | 70 GB + "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB + "public"."file_meta" | 32 GB | 21 GB | 53 GB + "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + (12 rows) + diff --git a/sql/stats/2021-04-12_ingest_domain_summary_30d.txt b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt new file mode 100644 index 0000000..6811b54 --- /dev/null +++ b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt @@ -0,0 +1,345 @@ + domain | status | count +---------------------------------------+-------------------------+-------- + academic.oup.com | | 4105 + academic.oup.com | spn2-wayback-error | 1393 + academic.oup.com | link-loop | 1025 + academic.oup.com | no-pdf-link | 1020 + academic.oup.com | spn2-cdx-lookup-failure | 512 + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + acp.copernicus.org | | 620 + acp.copernicus.org | success | 537 + aip.scitation.org | | 1310 + aip.scitation.org | blocked-cookie | 1192 + alustath.uobaghdad.edu.iq | | 697 + alustath.uobaghdad.edu.iq | success | 550 + apex.ipk-gatersleben.de | | 1253 + apex.ipk-gatersleben.de | no-pdf-link | 1132 + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + arxiv.org | | 14990 + arxiv.org | success | 12899 + arxiv.org | spn2-wayback-error | 1592 + ashpublications.org | | 563 + asmedigitalcollection.asme.org | | 3990 + asmedigitalcollection.asme.org | spn2-cdx-lookup-failure | 1570 + asmedigitalcollection.asme.org | no-pdf-link | 1449 + asmedigitalcollection.asme.org | link-loop | 734 + assets.researchsquare.com | | 8217 + assets.researchsquare.com | success | 7116 + assets.researchsquare.com | spn2-wayback-error | 946 + av.tib.eu | | 526 + bioone.org | | 588 + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + boris.unibe.ch | | 1420 + boris.unibe.ch | success | 743 + brill.com | | 1773 + brill.com | link-loop | 879 + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + classiques-garnier.com | | 1072 + classiques-garnier.com | success | 807 + content.iospress.com | | 793 + content.iospress.com | link-loop | 568 + cyberdoi.ru | | 775 + cyberdoi.ru | redirect-loop | 775 + cyberleninka.ru | | 1453 + cyberleninka.ru | success | 1092 + d197for5662m48.cloudfront.net | | 632 + d197for5662m48.cloudfront.net | success | 544 + dergipark.org.tr | | 3070 + dergipark.org.tr | success | 1251 + dergipark.org.tr | no-pdf-link | 843 + dergipark.org.tr | spn2-wayback-error | 677 + digi.ub.uni-heidelberg.de | | 502 + dione.lib.unipi.gr | | 783 + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + dl.acm.org | | 1692 + dl.acm.org | blocked-cookie | 1558 + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + doi.nrct.go.th | | 566 + doi.org | | 10408 + doi.org | spn2-cdx-lookup-failure | 9593 + doi.org | terminal-bad-status | 741 + downloads.hindawi.com | | 2137 + downloads.hindawi.com | success | 1787 + dram.journals.ekb.eg | | 541 + elib.spbstu.ru | | 1243 + elib.spbstu.ru | redirect-loop | 1214 + elibrary.vdi-verlag.de | | 1542 + elibrary.vdi-verlag.de | spn2-wayback-error | 721 + elifesciences.org | | 689 + elifesciences.org | success | 521 + epos.myesr.org | | 705 + epos.myesr.org | spn2-wayback-error | 604 + europepmc.org | | 6996 + europepmc.org | success | 6031 + europepmc.org | spn2-wayback-error | 756 + figshare.com | | 1168 + figshare.com | no-pdf-link | 726 + files.osf.io | | 1526 + files.osf.io | success | 1078 + fjfsdata01prod.blob.core.windows.net | | 5410 + fjfsdata01prod.blob.core.windows.net | success | 4581 + fjfsdata01prod.blob.core.windows.net | spn2-wayback-error | 587 + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + hcommons.org | | 1593 + hcommons.org | success | 1333 + hkvalidate.perfdrive.com | | 1322 + hkvalidate.perfdrive.com | no-pdf-link | 1083 + ieeexplore.ieee.org | | 20997 + ieeexplore.ieee.org | too-many-redirects | 15383 + ieeexplore.ieee.org | spn2-wayback-error | 2555 + ieeexplore.ieee.org | success | 2165 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 747 + jamanetwork.com | | 712 + journals.aps.org | | 1698 + journals.aps.org | not-found | 1469 + journals.library.ualberta.ca | | 733 + journals.library.ualberta.ca | success | 594 + journals.lww.com | | 6606 + journals.lww.com | link-loop | 3102 + journals.lww.com | spn2-wayback-error | 1645 + journals.lww.com | terminal-bad-status | 965 + journals.lww.com | spn2-cdx-lookup-failure | 552 + journals.openedition.org | | 4594 + journals.openedition.org | success | 1441 + journals.openedition.org | redirect-loop | 1316 + journals.openedition.org | spn2-wayback-error | 1197 + journals.ub.uni-heidelberg.de | | 1039 + journals.ub.uni-heidelberg.de | success | 728 + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + library.iated.org | | 1560 + library.iated.org | redirect-loop | 1148 + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + mr.crossref.org | | 542 + nsuworks.nova.edu | | 843 + nsuworks.nova.edu | success | 746 + ojs.cvut.cz | | 805 + ojs.cvut.cz | success | 764 + ojs.ugent.be | | 867 + ojs.ugent.be | success | 643 + onepetro.org | | 603 + onlinelibrary.wiley.com | | 1203 + onlinelibrary.wiley.com | blocked-cookie | 758 + open.library.ubc.ca | | 559 + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + oxford.universitypressscholarship.com | | 3556 + oxford.universitypressscholarship.com | link-loop | 2373 + oxford.universitypressscholarship.com | spn2-wayback-error | 562 + painphysicianjournal.com | | 804 + painphysicianjournal.com | success | 668 + papers.ssrn.com | | 6367 + papers.ssrn.com | link-loop | 3865 + papers.ssrn.com | spn2-wayback-error | 1106 + papers.ssrn.com | spn2-cdx-lookup-failure | 1015 + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + pos.sissa.it | | 1455 + pos.sissa.it | success | 1153 + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + publikationen.uni-tuebingen.de | | 659 + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + pubs.acs.org | | 5860 + pubs.acs.org | blocked-cookie | 5185 + pubs.rsc.org | | 2269 + pubs.rsc.org | link-loop | 1384 + res.mdpi.com | | 15776 + res.mdpi.com | success | 13710 + res.mdpi.com | spn2-wayback-error | 1424 + res.mdpi.com | spn2-cdx-lookup-failure | 641 + rrs.scholasticahq.com | | 1078 + rrs.scholasticahq.com | success | 803 + rsdjournal.org | | 755 + rsdjournal.org | success | 524 + s3-eu-west-1.amazonaws.com | | 3343 + s3-eu-west-1.amazonaws.com | success | 2893 + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + sage.figshare.com | | 725 + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + scholarworks.umass.edu | | 1196 + scholarworks.umass.edu | success | 713 + secure.jbs.elsevierhealth.com | | 4202 + secure.jbs.elsevierhealth.com | blocked-cookie | 4169 + storage.googleapis.com | | 1720 + storage.googleapis.com | success | 1466 + tandf.figshare.com | | 789 + tandf.figshare.com | no-pdf-link | 640 + tind-customer-agecon.s3.amazonaws.com | | 584 + turcomat.org | | 1196 + turcomat.org | spn2-wayback-error | 997 + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + utpjournals.press | | 669 + utpjournals.press | blocked-cookie | 616 + watermark.silverchair.com | | 3560 + watermark.silverchair.com | success | 2788 + watermark.silverchair.com | spn2-wayback-error | 685 + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + www.ahajournals.org | | 802 + www.ahajournals.org | blocked-cookie | 597 + www.ajol.info | | 830 + www.ajol.info | success | 575 + www.ams.org | | 868 + www.ams.org | terminal-bad-status | 666 + www.atlantis-press.com | | 1579 + www.atlantis-press.com | success | 1071 + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + www.brazilianjournals.com | | 1385 + www.brazilianjournals.com | success | 1107 + www.cairn.info | | 2479 + www.cairn.info | no-pdf-link | 818 + www.cairn.info | link-loop | 790 + www.cambridge.org | | 6801 + www.cambridge.org | no-pdf-link | 2990 + www.cambridge.org | spn2-wayback-error | 1475 + www.cambridge.org | link-loop | 940 + www.cambridge.org | success | 863 + www.cureus.com | | 538 + www.dbpia.co.kr | | 2958 + www.dbpia.co.kr | redirect-loop | 2953 + www.degruyter.com | | 58612 + www.degruyter.com | no-pdf-link | 41065 + www.degruyter.com | spn2-wayback-error | 7426 + www.degruyter.com | success | 6628 + www.degruyter.com | spn2-cdx-lookup-failure | 1624 + www.degruyter.com | terminal-bad-status | 1565 + www.dovepress.com | | 869 + www.dovepress.com | success | 597 + www.e-manuscripta.ch | | 1047 + www.e3s-conferences.org | | 817 + www.e3s-conferences.org | success | 606 + www.elgaronline.com | | 535 + www.elibrary.ru | | 1244 + www.elibrary.ru | no-pdf-link | 1159 + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + www.eurekaselect.com | | 540 + www.eurosurveillance.org | | 786 + www.eurosurveillance.org | success | 710 + www.finersistemas.com | | 1220 + www.finersistemas.com | success | 1214 + www.frontiersin.org | | 915 + www.frontiersin.org | spn2-wayback-error | 602 + www.hanspub.org | | 618 + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + www.ijcmas.com | | 513 + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + www.indianjournals.com | | 904 + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + www.journals.uchicago.edu | | 6055 + www.journals.uchicago.edu | blocked-cookie | 5927 + www.journals.vu.lt | | 791 + www.journals.vu.lt | success | 545 + www.jstage.jst.go.jp | | 1490 + www.jstage.jst.go.jp | remote-server-error | 1023 + www.jstor.org | | 1103 + www.jstor.org | redirect-loop | 553 + www.karger.com | | 733 + www.liebertpub.com | | 804 + www.liebertpub.com | blocked-cookie | 714 + www.liverpooluniversitypress.co.uk | | 620 + www.liverpooluniversitypress.co.uk | too-many-redirects | 529 + www.mdpi.com | | 3880 + www.mdpi.com | spn2-wayback-error | 1651 + www.mdpi.com | forbidden | 1282 + www.mdpi.com | spn2-cdx-lookup-failure | 714 + www.nepjol.info | | 596 + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + www.osti.gov | | 1147 + www.osti.gov | link-loop | 902 + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + www.preprints.org | | 783 + www.preprints.org | success | 595 + www.repository.cam.ac.uk | | 1146 + www.research-collection.ethz.ch | | 704 + www.research-collection.ethz.ch | terminal-bad-status | 684 + www.researchsquare.com | | 853 + www.researchsquare.com | spn2-wayback-error | 515 + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + www.scielo.br | | 1777 + www.scielo.br | success | 1167 + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + www.scilook.eu | | 812 + www.scilook.eu | success | 563 + www.scirp.org | | 749 + www.tandfonline.com | | 11038 + www.tandfonline.com | blocked-cookie | 9994 + www.tandfonline.com | no-pdf-link | 663 + www.taylorfrancis.com | | 71514 + www.taylorfrancis.com | spn2-wayback-error | 36663 + www.taylorfrancis.com | no-pdf-link | 15098 + www.taylorfrancis.com | forbidden | 8699 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 6894 + www.taylorfrancis.com | link-loop | 3661 + www.thieme-connect.de | | 3687 + www.thieme-connect.de | redirect-loop | 1187 + www.thieme-connect.de | not-found | 945 + www.thieme-connect.de | no-pdf-link | 941 + www.worldscientific.com | | 1476 + www.worldscientific.com | blocked-cookie | 1323 + www.zora.uzh.ch | | 1118 + zenodo.org | | 43010 + zenodo.org | no-pdf-link | 22015 + zenodo.org | success | 12747 + zenodo.org | spn2-wayback-error | 4608 + zenodo.org | spn2-cdx-lookup-failure | 3215 + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 +(341 rows) + diff --git a/sql/stats/2021-11-01_table_sizes.txt b/sql/stats/2021-11-01_table_sizes.txt new file mode 100644 index 0000000..57f7e57 --- /dev/null +++ b/sql/stats/2021-11-01_table_sizes.txt @@ -0,0 +1,19 @@ + +Size: 832.66G + + table_name | table_size | indexes_size | total_size +-------------------------------+------------+--------------+------------ + "public"."crossref" | 311 GB | 9812 MB | 320 GB + "public"."ingest_request" | 44 GB | 40 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6952 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 54 GB + "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB +(13 rows) diff --git a/sql/stats/2021-11-26_stats.txt b/sql/stats/2021-11-26_stats.txt new file mode 100644 index 0000000..3a0e561 --- /dev/null +++ b/sql/stats/2021-11-26_stats.txt @@ -0,0 +1,424 @@ + +Date: Sat 27 Nov 2021 03:33:30 AM UTC + +## SQL Table Sizes + + Size: 937.28G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 393 GB | 10127 MB | 403 GB + "public"."ingest_request" | 44 GB | 41 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6963 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5869 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1690 MB | 21 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 179761501 | 244453538203113 + + # 179m files, 244 TB + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 179376819 + application/octet-stream | 155379 + text/html | 116102 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 794 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 303 + application/x-compress | 272 + application/zip | 131 + image/png | 121 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 57 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + video/mp4 | 29 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62196 + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 119049962 | 149169240 + + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 137271670 + warc/revisit | 9709493 + application/octet-stream | 590443 + text/xml | 525481 + text/html | 421030 + unk | 207442 + application/postscript | 81123 + application/save | 80988 + binary/octet-stream | 67476 + image/pdf | 39419 + application/x-download | 38278 + text/plain | 36159 + application/download | 34328 + application/force-download | 19729 + multipart/form-data | 9105 + application | 5299 + application/x-msdownload | 3851 + application/x-octetstream | 3649 + .pdf | 3318 + application/x-pdf | 2992 + pdf | 1484 + file | 1364 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1029 + application/unknown | 1000 + 0 | 764 + text/pdf | 704 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 111236904 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 102962304 + 500 | 8269129 + -4 | 5013 + 503 | 548 + +TODO: how many failed, by mimetype? to check if we are (or have) run non-PDF +files through by mistake + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.5.5-fatcat | 89983404 + | 12892161 + 0.7.0-104-gbeebd9a6b | 86739 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 43701948 + pdf | unpaywall | 37802895 + pdf | doi | 28736398 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2383398 + pdf | arxiv | 2330054 + html | doi | 39725 + xml | doaj | 9442 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 689 + html | spn | 48 + xml | spn | 1 + (15 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 43701948 + pdf | unpaywall | unpaywall | 37802895 + pdf | doi | fatcat-changelog | 16207728 + pdf | doi | fatcat-ingest | 9012282 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767705 + pdf | arxiv | fatcat-changelog | 562320 + pdf | pmc | fatcat-ingest | 297527 + pdf | pmc | fatcat-changelog | 57046 + html | doi | fatcat-ingest | 37788 + xml | doaj | doaj | 9442 + pdf | cnki_covid19 | scrape-covid19 | 2034 + html | doi | fatcat-changelog | 1897 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 689 + pdf | doi | savepapernow-web | 613 + html | spn | savepapernow-web | 48 + html | doi | savepapernow-web | 40 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | spn | savepapernow-web | 1 + (25 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 169076 + pdf | oai | 15283 + pdf | doaj | 2063 + html | doaj | 620 + pdf | doi | 22 + pdf | unpaywall | 17 + + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14554221 | 0.284 + pdf | mag | 43701948 | 32643175 | 0.747 + pdf | unpaywall | 37802895 | 29989257 | 0.793 + pdf | doi | 28736547 | 7690393 | 0.268 + pdf | doaj | 4264610 | 2851601 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2383398 | 1821071 | 0.764 + pdf | arxiv | 2330054 | 2159738 | 0.927 + html | doi | 39725 | 1235 | 0.031 + xml | doaj | 9442 | 6897 | 0.730 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 689 | 503 | 0.730 + html | spn | 48 | 5 | 0.104 + xml | spn | 1 | 0 | 0.000 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 78944243 + pdf | no-pdf-link | 26270027 + pdf | no-capture | 23267156 + pdf | redirect-loop | 9837466 + pdf | terminal-bad-status | 4147454 + pdf | skip-url-blocklist | 3088907 + pdf | link-loop | 2953891 + pdf | blocked-cookie | 1855541 + html | wrong-scope | 1106171 + pdf | wrong-mimetype | 859941 + pdf | gateway-timeout | 729771 + pdf | spn2-cdx-lookup-failure | 584856 + html | no-capture | 423917 + pdf | forbidden | 390804 + pdf | cdx-error | 363091 + pdf | wayback-content-error | 354894 + pdf | null-body | 341698 + pdf | too-many-redirects | 307096 + pdf | not-found | 294592 + html | redirect-loop | 213032 + html | unknown-scope | 207923 + pdf | spn2-error | 192046 + html | html-resource-no-capture | 166119 + html | success | 124177 + pdf | wayback-error | 105385 + html | null-body | 100296 + pdf | spn2-wayback-error | 73176 + pdf | remote-server-error | 60908 + pdf | spn2-error:too-many-redirects | 58076 + pdf | skip-wall | 57744 + html | wayback-content-error | 53928 + pdf | read-timeout | 42465 + pdf | invalid-host-resolution | 37221 + pdf | petabox-error | 28765 + pdf | spn2-error:unknown | 23885 + html | wrong-mimetype | 18930 + pdf | bad-redirect | 14708 + html | terminal-bad-status | 14070 + html | petabox-error | 13770 + html | spn2-cdx-lookup-failure | 13002 + pdf | spn2-error:job-failed | 9721 + html | cdx-error | 7167 + xml | success | 6897 + pdf | spn2-error:bad-request | 4433 + pdf | spn-remote-error | 4206 + pdf | body-too-large | 3019 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1900 + pdf | spn2-error:proxy-error | 1850 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36821458 + pdf | | 26058729 + pdf | 301 | 8466302 + html | 200 | 1676730 + pdf | 503 | 1028504 + pdf | 302 | 949465 + pdf | 403 | 936737 + pdf | 404 | 687661 + pdf | 400 | 507303 + html | | 439356 + pdf | 401 | 288994 + pdf | 500 | 263775 + html | 301 | 211796 + pdf | 303 | 130719 + pdf | 410 | 66495 + pdf | 502 | 41760 + pdf | 429 | 35266 + pdf | 420 | 26722 + pdf | 409 | 15204 + pdf | 509 | 15113 + pdf | 999 | 11409 + html | 404 | 9578 + pdf | 307 | 8404 + pdf | 308 | 5514 + pdf | 202 | 4724 + html | 500 | 3628 + xml | 200 | 2537 + pdf | 520 | 2199 + pdf | 206 | 1694 + html | 302 | 1138 + pdf | 504 | 1124 + pdf | 521 | 1085 + pdf | 412 | 921 + pdf | 421 | 714 + pdf | 300 | 461 + pdf | 505 | 436 + pdf | 406 | 427 + pdf | 508 | 408 + html | 403 | 382 + html | 503 | 378 + html | 303 | 268 + pdf | 204 | 252 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 523 | 55 + pdf | 408 | 53 + pdf | 432 | 45 + pdf | 530 | 31 + pdf | 416 | 31 + (50 rows) diff --git a/sql/stats/2021-12-02_table_sizes.txt b/sql/stats/2021-12-02_table_sizes.txt new file mode 100644 index 0000000..b03c370 --- /dev/null +++ b/sql/stats/2021-12-02_table_sizes.txt @@ -0,0 +1,22 @@ + +Size: 940.66G + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 394 GB | 10138 MB | 404 GB + "public"."ingest_request" | 44 GB | 41 GB | 85 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6978 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1752 MB | 21 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes +(16 rows) diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt new file mode 100644 index 0000000..bd20c5c --- /dev/null +++ b/sql/stats/2022-04-26_stats.txt @@ -0,0 +1,432 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 416 GB | 10 GB | 426 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 58 GB | 41 GB | 99 GB + "public"."ingest_request" | 50 GB | 48 GB | 98 GB + "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 37 GB | 34 GB | 71 GB + "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB + "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 3015 MB | 31 MB | 3046 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 192402128 | 271919997557597 + (1 row) + + # 271,919,997,557,597 -> ~272 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 191760695 + text/html | 330351 + application/octet-stream | 186696 + application/xml | 42170 + application/xhtml+xml | 31470 + text/plain | 16449 + application/jats+xml | 6902 + application/gzip | 6681 + | 6033 + application/postscript | 4916 + image/jpeg | 2901 + application/vnd.ms-powerpoint | 1672 + application/msword | 934 + application/x-bzip2 | 891 + image/png | 476 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294 + application/x-compress | 274 + video/mp4 | 150 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12831 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 130732381 | 162760251 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 149749828 + warc/revisit | 10437210 + application/octet-stream | 733161 + text/html | 642992 + text/xml | 525483 + unk | 217642 + application/postscript | 81127 + application/save | 81023 + binary/octet-stream | 67938 + application/x-download | 41137 + image/pdf | 39712 + application/download | 37153 + text/plain | 36342 + application/force-download | 21496 + multipart/form-data | 9792 + application | 5366 + application/x-octetstream | 5166 + application/x-msdownload | 3851 + .pdf | 3445 + application/x-pdf | 3018 + pdf | 1618 + file | 1370 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1047 + application/unknown | 1001 + 0 | 773 + text/pdf | 729 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 123669603 + (1 row) + + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 115668412 + 500 | 7995428 + -4 | 5745 + 503 | 18 + (4 rows) + + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 54780825 + 0.5.5-fatcat | 48003940 + | 12694404 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | mag | 43701948 + pdf | doi | 40044585 + pdf | doaj | 6016771 + html | doaj | 3648181 + pdf | arxiv | 2676200 + pdf | pmc | 2402453 + html | doi | 41492 + xml | doaj | 20638 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 829 + html | spn | 52 + xml | doi | 1 + xml | spn | 1 + (16 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 20936949 + pdf | doi | fatcat-ingest | 15590201 + pdf | doaj | doaj | 6016771 + html | doaj | doaj | 3648181 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 691405 + pdf | pmc | fatcat-ingest | 297646 + pdf | pmc | fatcat-changelog | 75982 + html | doi | fatcat-ingest | 37904 + xml | doaj | doaj | 20638 + html | doi | fatcat-changelog | 3534 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1562 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 829 + html | doi | savepapernow-web | 54 + html | spn | savepapernow-web | 52 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | doi | savepapernow-web | 1 + xml | spn | savepapernow-web | 1 + (26 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doaj | 1619621 + html | doaj | 1208412 + pdf | mag | 167653 + pdf | oai | 15282 + xml | doaj | 11196 + pdf | unpaywall | 270 + pdf | doi | 22 + (7 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 15968290 | 0.312 + pdf | unpaywall | 43932525 | 32618045 | 0.742 + pdf | mag | 43701948 | 32662926 | 0.747 + pdf | doi | 40044738 | 10925369 | 0.273 + pdf | doaj | 6016771 | 3042569 | 0.506 + html | doaj | 3648181 | 344208 | 0.094 + pdf | arxiv | 2676206 | 2269708 | 0.848 + pdf | pmc | 2402453 | 1855679 | 0.772 + html | doi | 41492 | 1739 | 0.042 + xml | doaj | 20638 | 6899 | 0.334 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 829 | 616 | 0.743 + html | spn | 52 | 7 | 0.135 + xml | doi | 1 | 0 | 0.000 + xml | spn | 1 | 0 | 0.000 + (16 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+---------------------------------+---------- + pdf | success | 85709322 + pdf | no-pdf-link | 29713304 + pdf | no-capture | 26632191 + pdf | redirect-loop | 10979145 + pdf | terminal-bad-status | 4977000 + pdf | link-loop | 3434877 + pdf | skip-url-blocklist | 3114258 + pdf | blocked-cookie | 2156835 + html | wrong-scope | 1126911 + pdf | wrong-mimetype | 980546 + pdf | gateway-timeout | 651562 + pdf | spn2-cdx-lookup-failure | 484016 + pdf | spn2-backoff | 399382 + pdf | cdx-error | 373964 + pdf | wayback-content-error | 354370 + html | success | 345860 + pdf | null-body | 336182 + pdf | spn2-error:500 | 309755 + pdf | forbidden | 291175 + pdf | not-found | 275560 + pdf | too-many-redirects | 262312 + html | unknown-scope | 230352 + html | redirect-loop | 226596 + html | html-resource-no-capture | 205646 + html | no-capture | 164014 + component | spn2-cdx-lookup-failure | 148825 + component | wrong-mimetype | 130344 + html | null-body | 100296 + pdf | wayback-error | 94286 + pdf | spn2-wayback-error | 81365 + component | no-capture | 75278 + pdf | spn2-error | 69830 + pdf | skip-wall | 57744 + pdf | spn2-error:too-many-redirects | 53808 + pdf | remote-server-error | 41286 + pdf | petabox-error | 38800 + pdf | invalid-host-resolution | 37337 + pdf | read-timeout | 36872 + component | spn2-backoff | 33217 + pdf | empty-blob | 27946 + component | spn2-error | 24078 + pdf | spn2-error:unknown | 23697 + component | gateway-timeout | 23139 + html | wrong-mimetype | 22731 + html | wayback-content-error | 20507 + pdf | spn2-error:host-crawling-paused | 19900 + pdf | bad-redirect | 19183 + html | terminal-bad-status | 13354 + component | blocked-cookie | 12287 + component | spn2-error:500 | 11271 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 38144779 + pdf | | 32762240 + pdf | 301 | 9433087 + html | 200 | 1716127 + pdf | 403 | 1416632 + pdf | 302 | 1134668 + pdf | 404 | 888853 + pdf | 401 | 746311 + pdf | 503 | 655894 + pdf | 400 | 531479 + component | | 337603 + pdf | 500 | 247944 + html | 301 | 224237 + html | | 167194 + pdf | 303 | 135048 + component | 200 | 130663 + pdf | 429 | 93489 + pdf | 410 | 67392 + pdf | 420 | 26722 + pdf | 502 | 18770 + pdf | 409 | 15152 + pdf | 509 | 15113 + pdf | 999 | 11747 + html | 404 | 9879 + pdf | 307 | 8895 + pdf | 412 | 7053 + pdf | 308 | 6627 + pdf | 202 | 5289 + xml | 200 | 2540 + html | 500 | 2480 + pdf | 520 | 2220 + pdf | 521 | 1844 + pdf | 206 | 1739 + html | 302 | 1407 + pdf | 504 | 1146 + html | 303 | 1123 + pdf | 421 | 986 + pdf | 406 | 938 + pdf | 204 | 498 + pdf | 505 | 468 + pdf | 300 | 436 + pdf | 508 | 422 + pdf | 426 | 405 + html | 429 | 402 + html | 403 | 398 + pdf | 432 | 366 + component | 301 | 294 + pdf | 405 | 210 + pdf | 226 | 166 + component | 302 | 128 + (50 rows) + diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt new file mode 100644 index 0000000..864abd4 --- /dev/null +++ b/sql/stats/2022-04-27_crawl_changelog.txt @@ -0,0 +1,191 @@ + domain | status | count +--------------------------------------+-------------------------+-------- + academic.oup.com | | 1243 + academic.oup.com | spn2-cdx-lookup-failure | 990 + aip.scitation.org | | 313 + aip.scitation.org | spn2-cdx-lookup-failure | 224 + ajps.uomustansiriyah.edu.iq | | 235 + apps.crossref.org | | 1329 + apps.crossref.org | spn2-cdx-lookup-failure | 942 + apps.crossref.org | no-pdf-link | 387 + archaeologydataservice.ac.uk | | 422 + archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289 + arxiv.org | | 3512 + arxiv.org | spn2-cdx-lookup-failure | 2319 + arxiv.org | success | 1177 + assets.researchsquare.com | | 571 + assets.researchsquare.com | spn2-cdx-lookup-failure | 322 + assets.researchsquare.com | success | 249 + brill.com | | 397 + brill.com | spn2-cdx-lookup-failure | 265 + cla.berkeley.edu | | 239 + classiques-garnier.com | | 249 + cyberleninka.ru | | 340 + cyberleninka.ru | spn2-cdx-lookup-failure | 244 + dergipark.org.tr | | 468 + dergipark.org.tr | spn2-cdx-lookup-failure | 333 + dl.acm.org | | 592 + dl.acm.org | spn2-cdx-lookup-failure | 470 + doi.ala.org.au | | 288 + doi.ala.org.au | spn2-cdx-lookup-failure | 220 + doi.org | | 1107 + doi.org | terminal-bad-status | 679 + doi.org | spn2-cdx-lookup-failure | 415 + downloads.hindawi.com | | 279 + downloads.hindawi.com | success | 267 + edbs.uomustansiriyah.edu.iq | | 294 + edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209 + elibrary.kdpu.edu.ua | | 320 + elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233 + elibrary.ru | | 722 + elibrary.ru | spn2-cdx-lookup-failure | 505 + europepmc.org | | 986 + europepmc.org | spn2-cdx-lookup-failure | 681 + europepmc.org | success | 291 + figshare.com | | 377 + figshare.com | spn2-cdx-lookup-failure | 328 + fjfsdata01prod.blob.core.windows.net | | 255 + fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216 + hammer.purdue.edu | | 224 + ieeexplore.ieee.org | | 3904 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654 + ieeexplore.ieee.org | gateway-timeout | 792 + ieeexplore.ieee.org | spn2-backoff | 419 + journals.eco-vector.com | | 428 + journals.eco-vector.com | spn2-cdx-lookup-failure | 306 + journals.lww.com | | 727 + journals.lww.com | spn2-cdx-lookup-failure | 622 + journals.openedition.org | | 806 + journals.openedition.org | spn2-cdx-lookup-failure | 554 + journals.plos.org | | 348 + journals.plos.org | spn2-cdx-lookup-failure | 244 + kiss.kstudy.com | | 226 + kluwerlawonline.com | | 723 + kluwerlawonline.com | spn2-cdx-lookup-failure | 489 + kluwerlawonline.com | link-loop | 203 + linkinghub.elsevier.com | | 401 + linkinghub.elsevier.com | spn2-backoff | 342 + mdpi-res.com | | 1463 + mdpi-res.com | success | 1337 + muse.jhu.edu | | 346 + muse.jhu.edu | spn2-cdx-lookup-failure | 253 + onepetro.org | | 363 + onepetro.org | spn2-cdx-lookup-failure | 284 + online.ucpress.edu | | 1620 + online.ucpress.edu | spn2-cdx-lookup-failure | 1511 + onlinelibrary.wiley.com | | 2913 + onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109 + onlinelibrary.wiley.com | terminal-bad-status | 787 + opendata.uni-halle.de | | 519 + opendata.uni-halle.de | spn2-cdx-lookup-failure | 343 + osf.io | | 1554 + osf.io | spn2-cdx-lookup-failure | 1350 + papers.ssrn.com | | 2207 + papers.ssrn.com | spn2-cdx-lookup-failure | 1727 + papers.ssrn.com | link-loop | 457 + psycharchives.org | | 384 + psycharchives.org | spn2-cdx-lookup-failure | 283 + publons.com | | 493 + publons.com | spn2-cdx-lookup-failure | 348 + pubs.acs.org | | 1240 + pubs.acs.org | spn2-cdx-lookup-failure | 881 + pubs.acs.org | terminal-bad-status | 298 + pubs.rsc.org | | 603 + pubs.rsc.org | spn2-cdx-lookup-failure | 460 + repositories.lib.utexas.edu | | 1861 + repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288 + repositories.lib.utexas.edu | terminal-bad-status | 523 + s3-eu-west-1.amazonaws.com | | 216 + sage.figshare.com | | 374 + sage.figshare.com | spn2-cdx-lookup-failure | 309 + scholar.dkyobobook.co.kr | | 220 + scholarworks.gsu.edu | | 749 + scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577 + tandf.figshare.com | | 214 + www.atlantis-press.com | | 338 + www.atlantis-press.com | spn2-cdx-lookup-failure | 214 + www.cairn.info | | 782 + www.cairn.info | spn2-cdx-lookup-failure | 541 + www.cambridge.org | | 2325 + www.cambridge.org | spn2-cdx-lookup-failure | 1787 + www.cambridge.org | no-pdf-link | 300 + www.cell.com | | 213 + www.concrete.org | | 476 + www.concrete.org | spn2-cdx-lookup-failure | 340 + www.dbpia.co.kr | | 375 + www.dbpia.co.kr | spn2-cdx-lookup-failure | 275 + www.degruyter.com | | 3849 + www.degruyter.com | spn2-cdx-lookup-failure | 2969 + www.degruyter.com | no-pdf-link | 712 + www.dib.ie | | 1100 + www.dib.ie | spn2-cdx-lookup-failure | 1038 + www.e-periodica.ch | | 821 + www.e-periodica.ch | spn2-cdx-lookup-failure | 620 + www.e-periodica.ch | no-pdf-link | 201 + www.elibrary.ru | | 401 + www.elibrary.ru | spn2-cdx-lookup-failure | 281 + www.emerald.com | | 390 + www.emerald.com | spn2-cdx-lookup-failure | 275 + www.eurekaselect.com | | 275 + www.frontiersin.org | | 1266 + www.frontiersin.org | spn2-cdx-lookup-failure | 1025 + www.hanspub.org | | 229 + www.hindawi.com | | 604 + www.hindawi.com | spn2-cdx-lookup-failure | 594 + www.inderscience.com | | 201 + www.jstage.jst.go.jp | | 1094 + www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807 + www.jstage.jst.go.jp | success | 206 + www.mdpi.com | | 4340 + www.mdpi.com | spn2-cdx-lookup-failure | 4258 + www.nomos-elibrary.de | | 2749 + www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909 + www.nomos-elibrary.de | redirect-loop | 819 + www.osti.gov | | 275 + www.oxfordhandbooks.com | | 248 + www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224 + www.pdcnet.org | | 217 + www.researchsquare.com | | 483 + www.researchsquare.com | spn2-cdx-lookup-failure | 317 + www.scielo.br | | 319 + www.scielo.br | spn2-cdx-lookup-failure | 222 + www.sciencedirect.com | | 3384 + www.sciencedirect.com | spn2-cdx-lookup-failure | 3267 + www.spiedigitallibrary.org | | 441 + www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327 + www.tandfonline.com | | 2401 + www.tandfonline.com | spn2-cdx-lookup-failure | 1552 + www.tandfonline.com | no-pdf-link | 303 + www.tandfonline.com | blocked-cookie | 250 + www.taylorfrancis.com | | 1232 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 908 + www.thieme-connect.de | | 520 + www.thieme-connect.de | spn2-cdx-lookup-failure | 366 + www.worldscientific.com | | 383 + www.worldscientific.com | spn2-cdx-lookup-failure | 276 + zenodo.org | | 10625 + zenodo.org | spn2-cdx-lookup-failure | 7777 + zenodo.org | success | 1574 + zenodo.org | no-pdf-link | 1160 + zivahub.uct.ac.za | | 3428 + zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845 + zivahub.uct.ac.za | no-pdf-link | 583 + | | 130491 + | spn2-cdx-lookup-failure | 95169 + | success | 13354 + | no-pdf-link | 9621 + | terminal-bad-status | 3385 + | spn2-backoff | 2396 + | redirect-loop | 2216 + | link-loop | 1850 + | gateway-timeout | 1061 + | spn2-error:blocked-url | 428 + | blocked-cookie | 415 + | spn2-error | 246 +(182 rows) + +---- + +The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a +week or two, when crawling and retries are running smoothly, and see what +things look like then. diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt new file mode 100644 index 0000000..8d98217 --- /dev/null +++ b/sql/stats/2022-05-11_crawl_changelog.txt @@ -0,0 +1,410 @@ + domain | status | count +-----------------------------------------------------------------+-------------------------+-------- + academic.oup.com | | 2210 + academic.oup.com | no-pdf-link | 1350 + academic.oup.com | bad-redirect | 510 + academiccommons.columbia.edu | | 379 + academiccommons.columbia.edu | success | 339 + aip.scitation.org | | 762 + aip.scitation.org | terminal-bad-status | 430 + apps.crossref.org | | 9894 + apps.crossref.org | no-pdf-link | 9886 + apps.euskadi.eus | | 242 + apps.euskadi.eus | no-pdf-link | 240 + arxiv.org | | 44889 + arxiv.org | success | 28781 + arxiv.org | spn2-backoff | 7975 + arxiv.org | terminal-bad-status | 4508 + arxiv.org | spn2-cdx-lookup-failure | 2010 + arxiv.org | redirect-loop | 619 + arxiv.org | no-pdf-link | 242 + arxiv.org | spn2-error | 236 + asa.scitation.org | | 356 + asa.scitation.org | terminal-bad-status | 299 + asmedigitalcollection.asme.org | | 240 + assets.cureus.com | | 336 + assets.cureus.com | success | 335 + assets.researchsquare.com | | 1042 + assets.researchsquare.com | success | 993 + av.tib.eu | | 205 + av.tib.eu | no-pdf-link | 203 + bibliographie.uni-tuebingen.de | | 213 + bibliographie.uni-tuebingen.de | no-pdf-link | 211 + biorxiv.org | redirect-loop | 217 + biorxiv.org | | 217 + books.openedition.org | | 691 + books.openedition.org | no-pdf-link | 687 + boris.unibe.ch | | 525 + boris.unibe.ch | success | 466 + bridges.monash.edu | | 663 + bridges.monash.edu | no-pdf-link | 647 + brill.com | | 860 + brill.com | success | 434 + chemrxiv.org | | 201 + classiques-garnier.com | | 242 + content.iospress.com | | 325 + content.iospress.com | link-loop | 247 + core.tdar.org | | 216 + core.tdar.org | no-pdf-link | 211 + cyberleninka.ru | | 646 + cyberleninka.ru | success | 620 + d197for5662m48.cloudfront.net | | 263 + d197for5662m48.cloudfront.net | success | 262 + dergipark.org.tr | | 891 + dergipark.org.tr | success | 526 + dergipark.org.tr | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 427 + digi.ub.uni-heidelberg.de | no-pdf-link | 427 + direct.mit.edu | | 268 + direct.mit.edu | no-pdf-link | 208 + dl.acm.org | | 1719 + dl.acm.org | success | 829 + dl.acm.org | no-pdf-link | 546 + dl.acm.org | terminal-bad-status | 205 + dlc.library.columbia.edu | | 385 + dlc.library.columbia.edu | terminal-bad-status | 319 + doi.ala.org.au | | 724 + doi.ala.org.au | no-pdf-link | 721 + doi.apa.org | | 214 + doi.org | | 3390 + doi.org | terminal-bad-status | 2938 + doi.org | redirect-loop | 233 + doi.org | spn2-wayback-error | 208 + doi.usp.org | | 325 + doi.usp.org | no-pdf-link | 324 + downloads.hindawi.com | | 1439 + downloads.hindawi.com | success | 1436 + du.diva-portal.org | | 589 + du.diva-portal.org | success | 586 + econtents.bc.unicamp.br | | 310 + econtents.bc.unicamp.br | success | 310 + ediss.uni-goettingen.de | | 728 + ediss.uni-goettingen.de | success | 425 + elibrary.kdpu.edu.ua | | 907 + elibrary.kdpu.edu.ua | bad-redirect | 712 + elibrary.ru | | 925 + elibrary.ru | terminal-bad-status | 492 + elibrary.ru | bad-redirect | 230 + elibrary.vdi-verlag.de | | 393 + elifesciences.org | | 296 + elifesciences.org | success | 276 + europepmc.org | | 3024 + europepmc.org | success | 2541 + europepmc.org | terminal-bad-status | 463 + figshare.com | | 493 + figshare.com | no-pdf-link | 440 + files.osf.io | | 883 + files.osf.io | success | 686 + fjfsdata01prod.blob.core.windows.net | | 3869 + fjfsdata01prod.blob.core.windows.net | success | 3818 + ieeexplore.ieee.org | | 10854 + ieeexplore.ieee.org | gateway-timeout | 5495 + ieeexplore.ieee.org | spn2-backoff | 1662 + ieeexplore.ieee.org | no-pdf-link | 1417 + ieeexplore.ieee.org | success | 1410 + ieeexplore.ieee.org | redirect-loop | 768 + iiif.crossasia.org | | 7608 + iiif.crossasia.org | no-pdf-link | 7568 + ikee.lib.auth.gr | | 450 + ikee.lib.auth.gr | success | 332 + ins.journals.ekb.eg | | 212 + iopscience.iop.org | | 268 + jamanetwork.com | | 333 + journals.aps.org | | 414 + journals.asm.org | | 242 + journals.flvc.org | | 245 + journals.flvc.org | success | 242 + journals.healio.com | | 755 + journals.healio.com | terminal-bad-status | 668 + journals.lincoln.ac.nz | | 244 + journals.lincoln.ac.nz | success | 239 + journals.lww.com | | 1772 + journals.lww.com | link-loop | 1425 + journals.lww.com | spn2-backoff | 209 + journals.openedition.org | | 1192 + journals.openedition.org | redirect-loop | 467 + journals.openedition.org | success | 451 + journals.plos.org | | 771 + journals.plos.org | success | 750 + journals.ub.uni-heidelberg.de | | 787 + journals.ub.uni-heidelberg.de | success | 741 + kazanmedjournal.ru | | 240 + kazanmedjournal.ru | success | 231 + kiss.kstudy.com | | 219 + kiss.kstudy.com | no-pdf-link | 218 + kluwerlawonline.com | | 444 + kluwerlawonline.com | link-loop | 402 + libraetd.lib.virginia.edu | | 362 + libraetd.lib.virginia.edu | no-pdf-link | 361 + link.springer.com | | 305 + linkinghub.elsevier.com | | 568 + linkinghub.elsevier.com | spn2-backoff | 545 + ltu-figshare-repo.s3.aarnet.edu.au | | 269 + ltu-figshare-repo.s3.aarnet.edu.au | success | 268 + mausamjournal.imd.gov.in | | 202 + mdpi-res.com | | 8892 + mdpi-res.com | success | 8863 + mededpublish.org | | 1900 + mededpublish.org | no-pdf-link | 1900 + meetingorganizer.copernicus.org | | 276 + meetingorganizer.copernicus.org | no-pdf-link | 271 + muse.jhu.edu | | 1047 + muse.jhu.edu | terminal-bad-status | 755 + muse.jhu.edu | link-loop | 203 + online.ucpress.edu | | 358 + online.ucpress.edu | link-loop | 212 + onlinelibrary.wiley.com | | 5813 + onlinelibrary.wiley.com | terminal-bad-status | 4587 + onlinelibrary.wiley.com | spn2-wayback-error | 614 + onlinelibrary.wiley.com | blocked-cookie | 381 + open.library.ubc.ca | | 206 + opendata.uni-halle.de | | 1768 + opendata.uni-halle.de | success | 1215 + opendata.uni-halle.de | wrong-mimetype | 260 + opendata2.uni-halle.de | | 206 + opg.optica.org | | 205 + osf.io | | 2949 + osf.io | no-pdf-link | 2404 + osf.io | spn2-backoff | 299 + papers.ssrn.com | | 3962 + papers.ssrn.com | link-loop | 3800 + peerj.com | | 273 + preprints.jmir.org | | 275 + preprints.jmir.org | cdx-error | 255 + publikationen.bibliothek.kit.edu | | 213 + publons.com | | 593 + publons.com | no-pdf-link | 590 + pubs.acs.org | | 2288 + pubs.acs.org | terminal-bad-status | 1841 + pubs.acs.org | spn2-wayback-error | 210 + pubs.rsc.org | | 1698 + pubs.rsc.org | bad-redirect | 811 + pubs.rsc.org | link-loop | 352 + pubs.rsc.org | success | 307 + radiopaedia.org | | 220 + read.dukeupress.edu | | 303 + repositories.lib.utexas.edu | | 1570 + repositories.lib.utexas.edu | bad-redirect | 513 + repositories.lib.utexas.edu | spn2-backoff | 383 + repositories.lib.utexas.edu | gateway-timeout | 379 + repositories.lib.utexas.edu | terminal-bad-status | 282 + repository.uj.ac.za | | 489 + repository.uj.ac.za | no-pdf-link | 365 + repository.unsworks.unsw.edu.au | | 397 + repository.urosario.edu.co | | 2429 + repository.urosario.edu.co | success | 1648 + repository.urosario.edu.co | bad-redirect | 613 + rex.libraries.wsu.edu | no-pdf-link | 241 + rex.libraries.wsu.edu | | 241 + rsdjournal.org | | 208 + rsdjournal.org | success | 208 + s3-ap-southeast-2.amazonaws.com | | 282 + s3-ap-southeast-2.amazonaws.com | success | 277 + s3-eu-west-1.amazonaws.com | | 4615 + s3-eu-west-1.amazonaws.com | success | 4593 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237 + sage.figshare.com | | 415 + sage.figshare.com | no-pdf-link | 385 + scholar.dkyobobook.co.kr | | 512 + scholar.dkyobobook.co.kr | no-pdf-link | 509 + scholarlypublishingcollective.org | | 287 + scholarworks.gsu.edu | | 1132 + scholarworks.gsu.edu | success | 1000 + scholarworks.iupui.edu | | 205 + scholarworks.umass.edu | | 417 + scholarworks.umass.edu | success | 400 + sciencescholar.us | | 404 + secure.jbs.elsevierhealth.com | | 727 + secure.jbs.elsevierhealth.com | terminal-bad-status | 722 + tandf.figshare.com | | 354 + tandf.figshare.com | no-pdf-link | 342 + unsworks.unsw.edu.au | | 408 + unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342 + valep.vc.univie.ac.at | no-pdf-link | 737 + valep.vc.univie.ac.at | | 737 + watermark.silverchair.com | | 1604 + watermark.silverchair.com | success | 1598 + wayf.switch.ch | | 215 + wayf.switch.ch | no-pdf-link | 213 + www.ahajournals.org | | 438 + www.ahajournals.org | no-pdf-link | 306 + www.ahbps.org | | 316 + www.ahbps.org | success | 312 + www.atenaeditora.com.br | | 390 + www.atenaeditora.com.br | terminal-bad-status | 333 + www.atlantis-press.com | | 914 + www.atlantis-press.com | success | 901 + www.atsjournals.org | | 1245 + www.atsjournals.org | success | 1189 + www.biorxiv.org | | 712 + www.biorxiv.org | success | 670 + www.bloomsburycollections.com | | 982 + www.bloomsburycollections.com | terminal-bad-status | 566 + www.cahiers-clsl.ch | | 305 + www.cahiers-clsl.ch | success | 298 + www.cairn.info | | 1799 + www.cairn.info | no-pdf-link | 662 + www.cairn.info | link-loop | 487 + www.cairn.info | success | 355 + www.cairn.info | terminal-bad-status | 267 + www.cambridge.org | | 3258 + www.cambridge.org | no-pdf-link | 1682 + www.cambridge.org | success | 682 + www.cambridge.org | bad-redirect | 404 + www.cambridge.org | link-loop | 302 + www.dbpia.co.kr | | 763 + www.dbpia.co.kr | no-pdf-link | 443 + www.dbpia.co.kr | redirect-loop | 287 + www.degruyter.com | | 12655 + www.degruyter.com | no-pdf-link | 9112 + www.degruyter.com | success | 2898 + www.degruyter.com | spn2-backoff | 507 + www.dib.ie | | 1381 + www.dib.ie | no-pdf-link | 1378 + www.dovepress.com | | 231 + www.dovepress.com | success | 216 + www.e-manuscripta.ch | | 767 + www.e-manuscripta.ch | success | 399 + www.e-periodica.ch | | 1406 + www.e-periodica.ch | no-pdf-link | 1402 + www.e-rara.ch | no-pdf-link | 251 + www.e-rara.ch | | 251 + www.editoracientifica.org | no-pdf-link | 205 + www.editoracientifica.org | | 205 + www.elgaronline.com | | 427 + www.elibrary.ru | | 616 + www.elibrary.ru | terminal-bad-status | 364 + www.elibrary.ru | no-pdf-link | 216 + www.emerald.com | | 862 + www.emerald.com | no-pdf-link | 724 + www.endocrine-abstracts.org | | 1907 + www.endocrine-abstracts.org | no-pdf-link | 1905 + www.eurekaselect.com | | 285 + www.eurekaselect.com | link-loop | 246 + www.even3.com.br | | 233 + www.frontiersin.org | | 585 + www.frontiersin.org | spn2-backoff | 436 + www.humankineticslibrary.com | no-pdf-link | 207 + www.humankineticslibrary.com | | 207 + www.igi-global.com | | 1600 + www.igi-global.com | no-pdf-link | 1199 + www.igi-global.com | bad-redirect | 258 + www.inderscience.com | | 385 + www.inderscience.com | no-pdf-link | 365 + www.inderscienceonline.com | | 202 + www.ingentaconnect.com | | 450 + www.ingentaconnect.com | no-pdf-link | 260 + www.jstage.jst.go.jp | | 1248 + www.jstage.jst.go.jp | success | 870 + www.karger.com | | 313 + www.liebertpub.com | | 271 + www.liebertpub.com | no-pdf-link | 241 + www.nicecjournal.co.uk | | 274 + www.nicecjournal.co.uk | success | 274 + www.nomos-elibrary.de | | 1771 + www.nomos-elibrary.de | no-pdf-link | 788 + www.nomos-elibrary.de | redirect-loop | 506 + www.nomos-elibrary.de | bad-redirect | 207 + www.osti.gov | | 381 + www.osti.gov | link-loop | 326 + www.persee.fr | | 277 + www.preprints.org | | 225 + www.preprints.org | success | 225 + www.protocols.io | | 770 + www.protocols.io | success | 485 + www.repository.cam.ac.uk | | 510 + www.repository.cam.ac.uk | bad-redirect | 213 + www.research-collection.ethz.ch | | 416 + www.research-collection.ethz.ch | bad-redirect | 249 + www.researchsquare.com | | 1121 + www.researchsquare.com | bad-redirect | 985 + www.scielo.br | | 828 + www.scielo.br | success | 641 + www.sciencedirect.com | | 8567 + www.sciencedirect.com | terminal-bad-status | 5773 + www.sciencedirect.com | spn2-wayback-error | 1590 + www.sciencedirect.com | no-pdf-link | 576 + www.sciencedirect.com | spn2-backoff | 479 + www.sciendo.com | | 257 + www.sciendo.com | success | 222 + www.scitepress.org | | 381 + www.scitepress.org | no-pdf-link | 377 + www.spiedigitallibrary.org | | 1061 + www.spiedigitallibrary.org | bad-redirect | 571 + www.spiedigitallibrary.org | gateway-timeout | 233 + www.tandfonline.com | | 4934 + www.tandfonline.com | no-pdf-link | 2088 + www.tandfonline.com | terminal-bad-status | 1282 + www.tandfonline.com | blocked-cookie | 757 + www.tandfonline.com | redirect-loop | 488 + www.tandfonline.com | spn2-wayback-error | 202 + www.taylorfrancis.com | | 3979 + www.taylorfrancis.com | link-loop | 1928 + www.taylorfrancis.com | no-pdf-link | 1840 + www.techniques-ingenieur.fr | | 354 + www.techniques-ingenieur.fr | no-pdf-link | 353 + www.thieme-connect.de | | 1987 + www.thieme-connect.de | no-pdf-link | 949 + www.thieme-connect.de | link-loop | 869 + www.tib.eu | no-pdf-link | 315 + www.tib.eu | | 315 + www.un-ilibrary.org | no-pdf-link | 352 + www.un-ilibrary.org | | 352 + www.worldscientific.com | | 668 + www.worldscientific.com | no-pdf-link | 629 + www.zora.uzh.ch | | 318 + zenodo.org | | 46585 + zenodo.org | no-pdf-link | 29519 + zenodo.org | success | 14768 + zenodo.org | terminal-bad-status | 810 + zenodo.org | wrong-mimetype | 691 + zenodo.org | spn2-cdx-lookup-failure | 395 + zenodo.org | spn2-backoff | 294 + zivahub.uct.ac.za | | 1909 + zivahub.uct.ac.za | no-pdf-link | 1880 + zop.zb.uzh.ch | | 228 + zop.zb.uzh.ch | success | 217 + | | 365582 + | success | 141497 38.7% + | no-pdf-link | 120852 33.0% + | terminal-bad-status | 31900 8.7% + | spn2-backoff | 16979 4.6% + | link-loop | 13624 3.7% + | bad-redirect | 8736 + | redirect-loop | 7405 + | gateway-timeout | 6997 + | spn2-cdx-lookup-failure | 5146 + | spn2-wayback-error | 3708 + | wrong-mimetype | 2158 + | blocked-cookie | 1942 + | spn2-error:blocked-url | 1733 + | wayback-error | 1063 + | spn2-error | 647 + | spn2-error:500 | 265 + | cdx-error | 257 +(383 rows) + +---- + +365k in 7 days is about 52k a day, which is about expected. Around 5-7% need +retries. + +important changes: +- biorxiv.org: needs fix and then retries +- academic.oup.com: should probably skip +- apps.crossref.org: need to handle this in code +- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/` +- doi.org: investigate redirect-loop and terminal-bad-status +- osf.io: not getting PDFs +- papers.ssrn.com: why are these attempted? +- publons.com: not getting PDFs; special case these? +- www.sciencedirect.com: not working at all? + +smaller: +- bridges.monash.edu: fix, then retry? +- dl.acm.org: some broader retries? +- figshare.com: still some attempts, but almost all no-pdf-link +- onlinelibrary.wiley.com: getting blocked broadly? +- www.endocrine-abstracts.org: HTML content? +- www.igi-global.com: no-pdf-link diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt new file mode 100644 index 0000000..be2b30c --- /dev/null +++ b/sql/stats/2022-09-06_stats.txt @@ -0,0 +1,438 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 62 GB | 44 GB | 106 GB + "public"."ingest_request" | 51 GB | 50 GB | 101 GB + "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB + "public"."file_meta" | 39 GB | 39 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7469 MB | 66 MB | 7535 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 198175106 | 282695671015403 + (1 row) + + 198 million files, 282 TBytes. + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 197021437 + text/html | 830331 + application/octet-stream | 186669 + application/xml | 42170 + application/xhtml+xml | 38207 + text/plain | 16471 + application/jats+xml | 10385 + application/gzip | 6681 + | 6032 + application/postscript | 4916 + image/jpeg | 4522 + application/vnd.ms-powerpoint | 1672 + application/msword | 946 + application/x-bzip2 | 891 + image/png | 659 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382 + application/x-compress | 274 + video/mp4 | 218 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12800 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 137283420 | 172140506 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 157465613 + warc/revisit | 11337336 + text/html | 1137208 + application/octet-stream | 950380 + text/xml | 528965 + unk | 253294 + application/postscript | 81130 + application/save | 81069 + binary/octet-stream | 68942 + application/x-download | 42717 + application/download | 40628 + image/pdf | 39904 + text/plain | 36445 + application/force-download | 24148 + multipart/form-data | 10972 + application | 5409 + application/x-octetstream | 5192 + application/x-msdownload | 3854 + .pdf | 3518 + application/x-pdf | 3061 + application/octet | 1792 + pdf | 1757 + application/binary | 1399 + file | 1373 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1087 + application/unknown | 1005 + 0 | 773 + text/pdf | 729 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 129001717 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 120797098 + 500 | 8198783 + -4 | 5802 + 503 | 36 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 60469462 + 0.5.5-fatcat | 47472904 + | 12665498 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | doi | 43852308 + pdf | mag | 43701948 + pdf | doaj | 6534341 + html | doaj | 3987669 + pdf | arxiv | 2784589 + pdf | pmc | 2439181 + pdf | dblp | 631716 + html | doi | 126699 + xml | doaj | 23066 + pdf | cnki_covid19 | 2034 + pdf | spn | 1026 + pdf | wanfang_covid19 | 975 + html | spn | 65 + xml | spn | 2 + xml | doi | 1 + (17 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 24742500 + pdf | doi | fatcat-ingest | 15592121 + pdf | doaj | doaj | 6484737 + html | doaj | doaj | 3987468 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 799793 + pdf | dblp | dblp | 631716 + pdf | pmc | fatcat-ingest | 297980 + html | doi | fatcat-ingest | 121508 + pdf | pmc | fatcat-changelog | 112376 + pdf | doaj | fatcat-changelog | 47181 + xml | doaj | doaj | 23066 + html | doi | fatcat-changelog | 5129 + pdf | doaj | fatcat-ingest | 2423 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1814 + pdf | spn | savepapernow-web | 1026 + pdf | wanfang_covid19 | scrape-covid19 | 975 + html | doaj | fatcat-ingest | 201 + html | spn | savepapernow-web | 65 + html | doi | savepapernow-web | 62 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 4 + xml | spn | savepapernow-web | 2 + xml | doi | savepapernow-web | 1 + (30 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 167653 + pdf | doaj | 81517 + pdf | oai | 15282 + html | doaj | 1791 + pdf | unpaywall | 270 + pdf | doi | 22 + (6 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 16024068 | 0.313 + pdf | unpaywall | 43932525 | 36045446 | 0.820 + pdf | doi | 43852308 | 14956080 | 0.341 + pdf | mag | 43701948 | 32768484 | 0.750 + pdf | doaj | 6534341 | 4704066 | 0.720 + html | doaj | 3987669 | 778165 | 0.195 + pdf | arxiv | 2784589 | 2419941 | 0.869 + pdf | pmc | 2439181 | 1897671 | 0.778 + pdf | dblp | 631716 | 305142 | 0.483 + html | doi | 126699 | 75754 | 0.598 + xml | doaj | 23066 | 10381 | 0.450 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | spn | 1026 | 778 | 0.758 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + html | spn | 65 | 13 | 0.200 + xml | spn | 2 | 1 | 0.500 + xml | doi | 1 | 0 | 0.000 + (17 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 94887295 + pdf | no-pdf-link | 33960080 + pdf | no-capture | 20893916 + pdf | terminal-bad-status | 6973765 + pdf | redirect-loop | 5775175 + pdf | link-loop | 4095424 + pdf | skip-url-blocklist | 4037518 + pdf | blocked-cookie | 3508762 + html | wrong-scope | 1783694 + pdf | wrong-mimetype | 1379673 + html | success | 853762 + pdf | gateway-timeout | 635170 + html | no-capture | 381283 + pdf | wayback-content-error | 356694 + pdf | cdx-error | 347700 + pdf | null-body | 336166 + html | unknown-scope | 321874 + html | html-resource-no-capture | 294294 + pdf | forbidden | 291127 + pdf | not-found | 274343 + pdf | too-many-redirects | 264494 + component | wrong-mimetype | 196680 + component | spn2-cdx-lookup-failure | 173615 + component | spn2-backoff | 115840 + html | terminal-bad-status | 106264 + html | null-body | 100296 + pdf | wayback-error | 94748 + html | blocked-cookie | 88537 + component | no-capture | 75278 + pdf | empty-blob | 61157 + pdf | bad-redirect | 58680 + pdf | skip-wall | 57751 + pdf | spn2-error:too-many-redirects | 52873 + html | spn2-backoff | 50577 + pdf | remote-server-error | 41282 + pdf | invalid-host-resolution | 38864 + pdf | read-timeout | 37071 + pdf | spn2-cdx-lookup-failure | 34229 + html | wrong-mimetype | 33643 + pdf | spn2-backoff | 32437 + pdf | petabox-error | 31006 + html | wayback-content-error | 28034 + component | spn2-error | 27044 + pdf | spn2-error:unknown | 25810 + component | gateway-timeout | 25215 + pdf | body-too-large | 21721 + html | petabox-error | 18313 + html | empty-blob | 14393 + html | redirect-loop | 13404 + component | blocked-cookie | 12287 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 45052391 + pdf | | 26117481 + pdf | 301 | 4814786 + html | 200 | 2684821 + pdf | 403 | 1871088 + pdf | 404 | 1254259 + pdf | 302 | 898728 + pdf | 503 | 867548 + pdf | 401 | 851205 + pdf | 429 | 741869 + pdf | 400 | 624519 + component | | 456915 + html | | 442051 + pdf | 500 | 283700 + component | 200 | 197510 + pdf | 410 | 120647 + pdf | 303 | 107947 + html | 404 | 80114 + pdf | 420 | 26722 + pdf | 502 | 19500 + pdf | 409 | 15499 + html | 429 | 15208 + pdf | 509 | 15167 + pdf | 999 | 12186 + pdf | 202 | 11535 + html | 301 | 10213 + xml | | 10018 + pdf | 307 | 8657 + pdf | 402 | 8338 + pdf | 412 | 8064 + pdf | 308 | 6479 + html | 500 | 4746 + xml | 200 | 2668 + pdf | 520 | 2496 + html | 302 | 2289 + pdf | 521 | 2257 + html | 202 | 2177 + pdf | 206 | 1961 + html | 403 | 1775 + pdf | 504 | 1187 + pdf | 421 | 1148 + html | 303 | 1112 + pdf | 406 | 1109 + pdf | 204 | 772 + pdf | 432 | 745 + pdf | 405 | 633 + html | 400 | 632 + pdf | 426 | 515 + pdf | 508 | 503 + pdf | 505 | 469 + (50 rows) diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt new file mode 100644 index 0000000..0a6254a --- /dev/null +++ b/sql/stats/2022-11-23_table_sizes.txt @@ -0,0 +1,21 @@ +PostgreSQL 13.2 - wbgrp-svc506.us.archive.org +Size: 1.13T + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 63 GB | 45 GB | 108 GB + "public"."ingest_request" | 53 GB | 52 GB | 105 GB + "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB + "public"."file_meta" | 39 GB | 40 GB | 79 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7879 MB | 68 MB | 7947 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes diff --git a/sql/stats/README.md b/sql/stats/README.md index 52642f6..3161514 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -29,7 +29,7 @@ Counts and total file size: Top mimetypes: - SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; Missing full metadata: @@ -43,25 +43,21 @@ Total and unique-by-sha1 counts: mimetype counts: - SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; - -Processed or not: - - # TODO: + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; ## GROBID Counts: - SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + SELECT COUNT(*) AS total_files FROM grobid; Status? - SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; What version used? - SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; ## Petabox @@ -75,7 +71,7 @@ Requests by source: SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; - SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; Uncrawled requests by source: @@ -86,7 +82,7 @@ Uncrawled requests by source: ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type WHERE ingest_file_result.base_url IS NULL - GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; Results by source: @@ -101,20 +97,13 @@ Results by source: ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type AND ingest_file_result.ingest_type IS NOT NULL - GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; Ingest result by status: - SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; - -## Fatcat Files + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; -Count of PDF files that GROBID processed and matched to a release (via -glutton), but no PDF in `fatcat_file`: +Failed ingest by terminal status code: - SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count - FROM grobid - LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex - WHERE fatcat_file.sha1hex IS NULL - AND grobid.fatcat_release IS NOT NULL; + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; |