aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
Diffstat (limited to 'sql')
-rw-r--r--sql/Makefile35
-rw-r--r--sql/dump_reingest_bulk.sql31
-rw-r--r--sql/dump_reingest_old.sql36
-rw-r--r--sql/dump_reingest_quarterly.sql32
-rw-r--r--sql/dump_reingest_spn.sql25
-rw-r--r--sql/dump_reingest_terminalstatus.sql34
-rw-r--r--sql/dump_reingest_weekly.sql27
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql1
-rwxr-xr-xsql/reingest_bulk.sh19
-rwxr-xr-xsql/reingest_old.sh19
-rwxr-xr-xsql/reingest_quarterly.sh2
-rwxr-xr-xsql/reingest_spn.sh2
-rwxr-xr-xsql/reingest_terminalstatus_forcerecrawl.sh19
-rwxr-xr-xsql/reingest_weekly.sh4
-rw-r--r--sql/stats/2022-04-26_stats.txt432
-rw-r--r--sql/stats/2022-04-27_crawl_changelog.txt191
-rw-r--r--sql/stats/2022-05-11_crawl_changelog.txt410
-rw-r--r--sql/stats/2022-09-06_stats.txt438
-rw-r--r--sql/stats/2022-11-23_table_sizes.txt21
19 files changed, 1751 insertions, 27 deletions
diff --git a/sql/Makefile b/sql/Makefile
new file mode 100644
index 0000000..860addb
--- /dev/null
+++ b/sql/Makefile
@@ -0,0 +1,35 @@
+
+SHELL=/bin/bash -euo pipefail
+TODAY ?= $(shell date --iso --utc)
+DATADIR ?= /srv/sandcrawler/tasks/$(TODAY)
+DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S)
+DATABASE_URL ?= sandcrawler
+
+.PHONY: help
+help: ## Print info about all commands
+ @echo "Commands:"
+ @echo
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: create_datadir
+create_datadir:
+ mkdir -p $(DATADIR)/
+ sudo chmod a+rw $(DATADIR)/
+
+$(DATADIR)/.DB_DUMP:
+ sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip
+ mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump
+ touch $@
+
+.PHONY: database-snapshot
+database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot
+ @echo
+
+$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))"
+ ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump
+ touch $@
+
+.PHONY: upload-database-snapshot
+upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org
+ @echo
diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql
new file mode 100644
index 0000000..698db7a
--- /dev/null
+++ b/sql/dump_reingest_bulk.sql
@@ -0,0 +1,31 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html')
+ AND ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '24 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql
new file mode 100644
index 0000000..7473420
--- /dev/null
+++ b/sql/dump_reingest_old.sql
@@ -0,0 +1,36 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '6 day'::INTERVAL
+ -- AND ingest_request.created > NOW() - '181 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
+ AND (
+ ingest_file_result.status like 'spn2-%'
+ -- OR ingest_file_result.status like 'no-capture'
+ -- OR ingest_file_result.status like 'cdx-error'
+ -- OR ingest_file_result.status like 'petabox-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:filesize-limit'
+ AND ingest_file_result.status != 'spn2-error:not-found'
+ AND ingest_file_result.status != 'spn2-error:blocked-url'
+ AND ingest_file_result.status != 'spn2-error:too-many-redirects'
+ AND ingest_file_result.status != 'spn2-error:network-authentication-required'
+ AND ingest_file_result.status != 'spn2-error:unknown'
+) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql
index 725a404..dbeb199 100644
--- a/sql/dump_reingest_quarterly.sql
+++ b/sql/dump_reingest_quarterly.sql
@@ -1,20 +1,35 @@
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '91 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container'
+ OR ingest_request.ingest_request_source = 'unpaywall'
+ OR ingest_request.ingest_request_source = 'arxiv'
+ OR ingest_request.ingest_request_source = 'pmc'
+ OR ingest_request.ingest_request_source = 'doaj'
+ OR ingest_request.ingest_request_source = 'dblp')
AND (
ingest_file_result.status like 'spn2-%'
- OR ingest_file_result.status like 'cdx-error'
- OR ingest_file_result.status like 'wayback-error'
- OR ingest_file_result.status like 'wayback-content-error'
- OR ingest_file_result.status like 'petabox-error'
- OR ingest_file_result.status like 'gateway-timeout'
+ OR ingest_file_result.status = 'cdx-error'
+ OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:filesize-limit'
@@ -29,3 +44,4 @@ COPY (
-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ROLLBACK;
diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql
index 6ef08c1..a83125c 100644
--- a/sql/dump_reingest_spn.sql
+++ b/sql/dump_reingest_spn.sql
@@ -1,19 +1,28 @@
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
AND ingest_request.created < NOW() - '6 hour'::INTERVAL
AND ingest_request.created > NOW() - '180 day'::INTERVAL
AND ingest_request.ingest_request_source = 'savepapernow-web'
AND (
ingest_file_result.status like 'spn2-%'
- -- OR ingest_file_result.status like 'cdx-error'
- -- OR ingest_file_result.status like 'wayback-error'
- -- OR ingest_file_result.status like 'wayback-content-error'
- OR ingest_file_result.status like 'petabox-error'
- -- OR ingest_file_result.status like 'gateway-timeout'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:filesize-limit'
@@ -23,3 +32,5 @@ COPY (
AND ingest_file_result.status != 'spn2-error:network-authentication-required'
AND ingest_file_result.status != 'spn2-error:unknown'
) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json';
+
+ROLLBACK;
diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql
new file mode 100644
index 0000000..b72a096
--- /dev/null
+++ b/sql/dump_reingest_terminalstatus.sql
@@ -0,0 +1,34 @@
+
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
+COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ ingest_file_result.hit = false
+ AND ingest_request.created < NOW() - '72 hour'::INTERVAL
+ AND ingest_request.created > NOW() - '10 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ AND ingest_file_result.status = 'terminal-bad-status'
+ AND (
+ ingest_file_result.terminal_status_code = 500
+ OR ingest_file_result.terminal_status_code = 502
+ OR ingest_file_result.terminal_status_code = 503
+ OR ingest_file_result.terminal_status_code = 429
+ OR ingest_file_result.terminal_status_code = 404
+ )
+ AND (
+ ingest_request.base_url LIKE 'https://doi.org/10.3390/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%'
+ OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%'
+ )
+) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json';
+
+-- bulk re-tries would be:
+-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+
+ROLLBACK;
diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql
index 65800eb..a019938 100644
--- a/sql/dump_reingest_weekly.sql
+++ b/sql/dump_reingest_weekly.sql
@@ -1,20 +1,30 @@
+BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE;
+
COPY (
SELECT row_to_json(ingest_request.*) FROM ingest_request
- LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
- WHERE ingest_request.ingest_type = 'pdf'
+ LEFT JOIN ingest_file_result ON
+ ingest_file_result.base_url = ingest_request.base_url
+ AND ingest_file_result.ingest_type = ingest_request.ingest_type
+ WHERE
+ (ingest_request.ingest_type = 'pdf'
+ OR ingest_request.ingest_type = 'html'
+ OR ingest_request.ingest_type = 'xml'
+ OR ingest_request.ingest_type = 'component')
AND ingest_file_result.hit = false
AND ingest_request.created < NOW() - '8 hour'::INTERVAL
AND ingest_request.created > NOW() - '8 day'::INTERVAL
AND (ingest_request.ingest_request_source = 'fatcat-changelog'
- OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ OR ingest_request.ingest_request_source = 'fatcat-ingest'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest-container')
AND (
ingest_file_result.status like 'spn2-%'
- -- OR ingest_file_result.status like 'cdx-error'
- -- OR ingest_file_result.status like 'wayback-error'
- -- OR ingest_file_result.status like 'wayback-content-error'
- OR ingest_file_result.status like 'petabox-error'
- -- OR ingest_file_result.status like 'gateway-timeout'
+ -- OR ingest_file_result.status = 'cdx-error'
+ -- OR ingest_file_result.status = 'wayback-error'
+ -- OR ingest_file_result.status = 'wayback-content-error'
+ OR ingest_file_result.status = 'petabox-error'
+ -- OR ingest_file_result.status = 'gateway-timeout'
+ OR ingest_file_result.status = 'no-capture'
)
AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
AND ingest_file_result.status != 'spn2-error:filesize-limit'
@@ -29,3 +39,4 @@ COPY (
-- AND (ingest_request.ingest_request_source != 'fatcat-changelog'
-- AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ROLLBACK;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 23a935e..33dba66 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -149,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created);
CREATE TABLE IF NOT EXISTS ingest_file_result (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh
new file mode 100755
index 0000000..d39a171
--- /dev/null
+++ b/sql/reingest_bulk.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_bulk_current.json
+
+cat /srv/sandcrawler/tasks/reingest_bulk_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh
new file mode 100755
index 0000000..96e5416
--- /dev/null
+++ b/sql/reingest_old.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_old.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_old_current.json
+
+cat /srv/sandcrawler/tasks/reingest_old_current.json \
+ | shuf \
+ | head -n1000000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh
index 20fd82b..8a2996c 100755
--- a/sql/reingest_quarterly.sh
+++ b/sql/reingest_quarterly.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \
| shuf \
| head -n120000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh
index 6fb1e4b..c693a64 100755
--- a/sql/reingest_spn.sh
+++ b/sql/reingest_spn.sh
@@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_spn.json \
| shuf \
| head -n60000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1
diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh
new file mode 100755
index 0000000..5cb6d51
--- /dev/null
+++ b/sql/reingest_terminalstatus_forcerecrawl.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -e # fail on error
+set -u # fail if variable not set in substitution
+set -o pipefail # fail if part of a '|' command fails
+
+sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql
+
+cd ../python
+sudo -u sandcrawler pipenv run \
+ ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \
+ > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json
+
+cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \
+ | shuf \
+ | head -n100000 \
+ | jq . -c \
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh
index 04ce39d..d2e2444 100755
--- a/sql/reingest_weekly.sh
+++ b/sql/reingest_weekly.sh
@@ -13,7 +13,7 @@ sudo -u sandcrawler pipenv run \
cat /srv/sandcrawler/tasks/reingest_weekly_current.json \
| shuf \
- | head -n60000 \
+ | head -n80000 \
| jq . -c \
- | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+ | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt
new file mode 100644
index 0000000..bd20c5c
--- /dev/null
+++ b/sql/stats/2022-04-26_stats.txt
@@ -0,0 +1,432 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 416 GB | 10 GB | 426 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 58 GB | 41 GB | 99 GB
+ "public"."ingest_request" | 50 GB | 48 GB | 98 GB
+ "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."file_meta" | 37 GB | 34 GB | 71 GB
+ "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB
+ "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 3015 MB | 31 MB | 3046 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 192402128 | 271919997557597
+ (1 row)
+
+ # 271,919,997,557,597 -> ~272 TByte
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 191760695
+ text/html | 330351
+ application/octet-stream | 186696
+ application/xml | 42170
+ application/xhtml+xml | 31470
+ text/plain | 16449
+ application/jats+xml | 6902
+ application/gzip | 6681
+ | 6033
+ application/postscript | 4916
+ image/jpeg | 2901
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 934
+ application/x-bzip2 | 891
+ image/png | 476
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294
+ application/x-compress | 274
+ video/mp4 | 150
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12831
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 130732381 | 162760251
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 149749828
+ warc/revisit | 10437210
+ application/octet-stream | 733161
+ text/html | 642992
+ text/xml | 525483
+ unk | 217642
+ application/postscript | 81127
+ application/save | 81023
+ binary/octet-stream | 67938
+ application/x-download | 41137
+ image/pdf | 39712
+ application/download | 37153
+ text/plain | 36342
+ application/force-download | 21496
+ multipart/form-data | 9792
+ application | 5366
+ application/x-octetstream | 5166
+ application/x-msdownload | 3851
+ .pdf | 3445
+ application/x-pdf | 3018
+ pdf | 1618
+ file | 1370
+ application/binary | 1354
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1047
+ application/unknown | 1001
+ 0 | 773
+ text/pdf | 729
+ application/blob | 673
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 123669603
+ (1 row)
+
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 115668412
+ 500 | 7995428
+ -4 | 5745
+ 503 | 18
+ (4 rows)
+
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 54780825
+ 0.5.5-fatcat | 48003940
+ | 12694404
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | mag | 43701948
+ pdf | doi | 40044585
+ pdf | doaj | 6016771
+ html | doaj | 3648181
+ pdf | arxiv | 2676200
+ pdf | pmc | 2402453
+ html | doi | 41492
+ xml | doaj | 20638
+ pdf | cnki_covid19 | 2034
+ pdf | wanfang_covid19 | 975
+ pdf | spn | 829
+ html | spn | 52
+ xml | doi | 1
+ xml | spn | 1
+ (16 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 20936949
+ pdf | doi | fatcat-ingest | 15590201
+ pdf | doaj | doaj | 6016771
+ html | doaj | doaj | 3648181
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 691405
+ pdf | pmc | fatcat-ingest | 297646
+ pdf | pmc | fatcat-changelog | 75982
+ html | doi | fatcat-ingest | 37904
+ xml | doaj | doaj | 20638
+ html | doi | fatcat-changelog | 3534
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1562
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ pdf | spn | savepapernow-web | 829
+ html | doi | savepapernow-web | 54
+ html | spn | savepapernow-web | 52
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 3
+ xml | doi | savepapernow-web | 1
+ xml | spn | savepapernow-web | 1
+ (26 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+ ingest_type | link_source | count
+ -------------+-------------+---------
+ pdf | doaj | 1619621
+ html | doaj | 1208412
+ pdf | mag | 167653
+ pdf | oai | 15282
+ xml | doaj | 11196
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (7 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 15968290 | 0.312
+ pdf | unpaywall | 43932525 | 32618045 | 0.742
+ pdf | mag | 43701948 | 32662926 | 0.747
+ pdf | doi | 40044738 | 10925369 | 0.273
+ pdf | doaj | 6016771 | 3042569 | 0.506
+ html | doaj | 3648181 | 344208 | 0.094
+ pdf | arxiv | 2676206 | 2269708 | 0.848
+ pdf | pmc | 2402453 | 1855679 | 0.772
+ html | doi | 41492 | 1739 | 0.042
+ xml | doaj | 20638 | 6899 | 0.334
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ pdf | spn | 829 | 616 | 0.743
+ html | spn | 52 | 7 | 0.135
+ xml | doi | 1 | 0 | 0.000
+ xml | spn | 1 | 0 | 0.000
+ (16 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+---------------------------------+----------
+ pdf | success | 85709322
+ pdf | no-pdf-link | 29713304
+ pdf | no-capture | 26632191
+ pdf | redirect-loop | 10979145
+ pdf | terminal-bad-status | 4977000
+ pdf | link-loop | 3434877
+ pdf | skip-url-blocklist | 3114258
+ pdf | blocked-cookie | 2156835
+ html | wrong-scope | 1126911
+ pdf | wrong-mimetype | 980546
+ pdf | gateway-timeout | 651562
+ pdf | spn2-cdx-lookup-failure | 484016
+ pdf | spn2-backoff | 399382
+ pdf | cdx-error | 373964
+ pdf | wayback-content-error | 354370
+ html | success | 345860
+ pdf | null-body | 336182
+ pdf | spn2-error:500 | 309755
+ pdf | forbidden | 291175
+ pdf | not-found | 275560
+ pdf | too-many-redirects | 262312
+ html | unknown-scope | 230352
+ html | redirect-loop | 226596
+ html | html-resource-no-capture | 205646
+ html | no-capture | 164014
+ component | spn2-cdx-lookup-failure | 148825
+ component | wrong-mimetype | 130344
+ html | null-body | 100296
+ pdf | wayback-error | 94286
+ pdf | spn2-wayback-error | 81365
+ component | no-capture | 75278
+ pdf | spn2-error | 69830
+ pdf | skip-wall | 57744
+ pdf | spn2-error:too-many-redirects | 53808
+ pdf | remote-server-error | 41286
+ pdf | petabox-error | 38800
+ pdf | invalid-host-resolution | 37337
+ pdf | read-timeout | 36872
+ component | spn2-backoff | 33217
+ pdf | empty-blob | 27946
+ component | spn2-error | 24078
+ pdf | spn2-error:unknown | 23697
+ component | gateway-timeout | 23139
+ html | wrong-mimetype | 22731
+ html | wayback-content-error | 20507
+ pdf | spn2-error:host-crawling-paused | 19900
+ pdf | bad-redirect | 19183
+ html | terminal-bad-status | 13354
+ component | blocked-cookie | 12287
+ component | spn2-error:500 | 11271
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 38144779
+ pdf | | 32762240
+ pdf | 301 | 9433087
+ html | 200 | 1716127
+ pdf | 403 | 1416632
+ pdf | 302 | 1134668
+ pdf | 404 | 888853
+ pdf | 401 | 746311
+ pdf | 503 | 655894
+ pdf | 400 | 531479
+ component | | 337603
+ pdf | 500 | 247944
+ html | 301 | 224237
+ html | | 167194
+ pdf | 303 | 135048
+ component | 200 | 130663
+ pdf | 429 | 93489
+ pdf | 410 | 67392
+ pdf | 420 | 26722
+ pdf | 502 | 18770
+ pdf | 409 | 15152
+ pdf | 509 | 15113
+ pdf | 999 | 11747
+ html | 404 | 9879
+ pdf | 307 | 8895
+ pdf | 412 | 7053
+ pdf | 308 | 6627
+ pdf | 202 | 5289
+ xml | 200 | 2540
+ html | 500 | 2480
+ pdf | 520 | 2220
+ pdf | 521 | 1844
+ pdf | 206 | 1739
+ html | 302 | 1407
+ pdf | 504 | 1146
+ html | 303 | 1123
+ pdf | 421 | 986
+ pdf | 406 | 938
+ pdf | 204 | 498
+ pdf | 505 | 468
+ pdf | 300 | 436
+ pdf | 508 | 422
+ pdf | 426 | 405
+ html | 429 | 402
+ html | 403 | 398
+ pdf | 432 | 366
+ component | 301 | 294
+ pdf | 405 | 210
+ pdf | 226 | 166
+ component | 302 | 128
+ (50 rows)
+
diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt
new file mode 100644
index 0000000..864abd4
--- /dev/null
+++ b/sql/stats/2022-04-27_crawl_changelog.txt
@@ -0,0 +1,191 @@
+ domain | status | count
+--------------------------------------+-------------------------+--------
+ academic.oup.com | | 1243
+ academic.oup.com | spn2-cdx-lookup-failure | 990
+ aip.scitation.org | | 313
+ aip.scitation.org | spn2-cdx-lookup-failure | 224
+ ajps.uomustansiriyah.edu.iq | | 235
+ apps.crossref.org | | 1329
+ apps.crossref.org | spn2-cdx-lookup-failure | 942
+ apps.crossref.org | no-pdf-link | 387
+ archaeologydataservice.ac.uk | | 422
+ archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289
+ arxiv.org | | 3512
+ arxiv.org | spn2-cdx-lookup-failure | 2319
+ arxiv.org | success | 1177
+ assets.researchsquare.com | | 571
+ assets.researchsquare.com | spn2-cdx-lookup-failure | 322
+ assets.researchsquare.com | success | 249
+ brill.com | | 397
+ brill.com | spn2-cdx-lookup-failure | 265
+ cla.berkeley.edu | | 239
+ classiques-garnier.com | | 249
+ cyberleninka.ru | | 340
+ cyberleninka.ru | spn2-cdx-lookup-failure | 244
+ dergipark.org.tr | | 468
+ dergipark.org.tr | spn2-cdx-lookup-failure | 333
+ dl.acm.org | | 592
+ dl.acm.org | spn2-cdx-lookup-failure | 470
+ doi.ala.org.au | | 288
+ doi.ala.org.au | spn2-cdx-lookup-failure | 220
+ doi.org | | 1107
+ doi.org | terminal-bad-status | 679
+ doi.org | spn2-cdx-lookup-failure | 415
+ downloads.hindawi.com | | 279
+ downloads.hindawi.com | success | 267
+ edbs.uomustansiriyah.edu.iq | | 294
+ edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209
+ elibrary.kdpu.edu.ua | | 320
+ elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233
+ elibrary.ru | | 722
+ elibrary.ru | spn2-cdx-lookup-failure | 505
+ europepmc.org | | 986
+ europepmc.org | spn2-cdx-lookup-failure | 681
+ europepmc.org | success | 291
+ figshare.com | | 377
+ figshare.com | spn2-cdx-lookup-failure | 328
+ fjfsdata01prod.blob.core.windows.net | | 255
+ fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216
+ hammer.purdue.edu | | 224
+ ieeexplore.ieee.org | | 3904
+ ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654
+ ieeexplore.ieee.org | gateway-timeout | 792
+ ieeexplore.ieee.org | spn2-backoff | 419
+ journals.eco-vector.com | | 428
+ journals.eco-vector.com | spn2-cdx-lookup-failure | 306
+ journals.lww.com | | 727
+ journals.lww.com | spn2-cdx-lookup-failure | 622
+ journals.openedition.org | | 806
+ journals.openedition.org | spn2-cdx-lookup-failure | 554
+ journals.plos.org | | 348
+ journals.plos.org | spn2-cdx-lookup-failure | 244
+ kiss.kstudy.com | | 226
+ kluwerlawonline.com | | 723
+ kluwerlawonline.com | spn2-cdx-lookup-failure | 489
+ kluwerlawonline.com | link-loop | 203
+ linkinghub.elsevier.com | | 401
+ linkinghub.elsevier.com | spn2-backoff | 342
+ mdpi-res.com | | 1463
+ mdpi-res.com | success | 1337
+ muse.jhu.edu | | 346
+ muse.jhu.edu | spn2-cdx-lookup-failure | 253
+ onepetro.org | | 363
+ onepetro.org | spn2-cdx-lookup-failure | 284
+ online.ucpress.edu | | 1620
+ online.ucpress.edu | spn2-cdx-lookup-failure | 1511
+ onlinelibrary.wiley.com | | 2913
+ onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109
+ onlinelibrary.wiley.com | terminal-bad-status | 787
+ opendata.uni-halle.de | | 519
+ opendata.uni-halle.de | spn2-cdx-lookup-failure | 343
+ osf.io | | 1554
+ osf.io | spn2-cdx-lookup-failure | 1350
+ papers.ssrn.com | | 2207
+ papers.ssrn.com | spn2-cdx-lookup-failure | 1727
+ papers.ssrn.com | link-loop | 457
+ psycharchives.org | | 384
+ psycharchives.org | spn2-cdx-lookup-failure | 283
+ publons.com | | 493
+ publons.com | spn2-cdx-lookup-failure | 348
+ pubs.acs.org | | 1240
+ pubs.acs.org | spn2-cdx-lookup-failure | 881
+ pubs.acs.org | terminal-bad-status | 298
+ pubs.rsc.org | | 603
+ pubs.rsc.org | spn2-cdx-lookup-failure | 460
+ repositories.lib.utexas.edu | | 1861
+ repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288
+ repositories.lib.utexas.edu | terminal-bad-status | 523
+ s3-eu-west-1.amazonaws.com | | 216
+ sage.figshare.com | | 374
+ sage.figshare.com | spn2-cdx-lookup-failure | 309
+ scholar.dkyobobook.co.kr | | 220
+ scholarworks.gsu.edu | | 749
+ scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577
+ tandf.figshare.com | | 214
+ www.atlantis-press.com | | 338
+ www.atlantis-press.com | spn2-cdx-lookup-failure | 214
+ www.cairn.info | | 782
+ www.cairn.info | spn2-cdx-lookup-failure | 541
+ www.cambridge.org | | 2325
+ www.cambridge.org | spn2-cdx-lookup-failure | 1787
+ www.cambridge.org | no-pdf-link | 300
+ www.cell.com | | 213
+ www.concrete.org | | 476
+ www.concrete.org | spn2-cdx-lookup-failure | 340
+ www.dbpia.co.kr | | 375
+ www.dbpia.co.kr | spn2-cdx-lookup-failure | 275
+ www.degruyter.com | | 3849
+ www.degruyter.com | spn2-cdx-lookup-failure | 2969
+ www.degruyter.com | no-pdf-link | 712
+ www.dib.ie | | 1100
+ www.dib.ie | spn2-cdx-lookup-failure | 1038
+ www.e-periodica.ch | | 821
+ www.e-periodica.ch | spn2-cdx-lookup-failure | 620
+ www.e-periodica.ch | no-pdf-link | 201
+ www.elibrary.ru | | 401
+ www.elibrary.ru | spn2-cdx-lookup-failure | 281
+ www.emerald.com | | 390
+ www.emerald.com | spn2-cdx-lookup-failure | 275
+ www.eurekaselect.com | | 275
+ www.frontiersin.org | | 1266
+ www.frontiersin.org | spn2-cdx-lookup-failure | 1025
+ www.hanspub.org | | 229
+ www.hindawi.com | | 604
+ www.hindawi.com | spn2-cdx-lookup-failure | 594
+ www.inderscience.com | | 201
+ www.jstage.jst.go.jp | | 1094
+ www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807
+ www.jstage.jst.go.jp | success | 206
+ www.mdpi.com | | 4340
+ www.mdpi.com | spn2-cdx-lookup-failure | 4258
+ www.nomos-elibrary.de | | 2749
+ www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909
+ www.nomos-elibrary.de | redirect-loop | 819
+ www.osti.gov | | 275
+ www.oxfordhandbooks.com | | 248
+ www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224
+ www.pdcnet.org | | 217
+ www.researchsquare.com | | 483
+ www.researchsquare.com | spn2-cdx-lookup-failure | 317
+ www.scielo.br | | 319
+ www.scielo.br | spn2-cdx-lookup-failure | 222
+ www.sciencedirect.com | | 3384
+ www.sciencedirect.com | spn2-cdx-lookup-failure | 3267
+ www.spiedigitallibrary.org | | 441
+ www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327
+ www.tandfonline.com | | 2401
+ www.tandfonline.com | spn2-cdx-lookup-failure | 1552
+ www.tandfonline.com | no-pdf-link | 303
+ www.tandfonline.com | blocked-cookie | 250
+ www.taylorfrancis.com | | 1232
+ www.taylorfrancis.com | spn2-cdx-lookup-failure | 908
+ www.thieme-connect.de | | 520
+ www.thieme-connect.de | spn2-cdx-lookup-failure | 366
+ www.worldscientific.com | | 383
+ www.worldscientific.com | spn2-cdx-lookup-failure | 276
+ zenodo.org | | 10625
+ zenodo.org | spn2-cdx-lookup-failure | 7777
+ zenodo.org | success | 1574
+ zenodo.org | no-pdf-link | 1160
+ zivahub.uct.ac.za | | 3428
+ zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845
+ zivahub.uct.ac.za | no-pdf-link | 583
+ | | 130491
+ | spn2-cdx-lookup-failure | 95169
+ | success | 13354
+ | no-pdf-link | 9621
+ | terminal-bad-status | 3385
+ | spn2-backoff | 2396
+ | redirect-loop | 2216
+ | link-loop | 1850
+ | gateway-timeout | 1061
+ | spn2-error:blocked-url | 428
+ | blocked-cookie | 415
+ | spn2-error | 246
+(182 rows)
+
+----
+
+The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a
+week or two, when crawling and retries are running smoothly, and see what
+things look like then.
diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt
new file mode 100644
index 0000000..8d98217
--- /dev/null
+++ b/sql/stats/2022-05-11_crawl_changelog.txt
@@ -0,0 +1,410 @@
+ domain | status | count
+-----------------------------------------------------------------+-------------------------+--------
+ academic.oup.com | | 2210
+ academic.oup.com | no-pdf-link | 1350
+ academic.oup.com | bad-redirect | 510
+ academiccommons.columbia.edu | | 379
+ academiccommons.columbia.edu | success | 339
+ aip.scitation.org | | 762
+ aip.scitation.org | terminal-bad-status | 430
+ apps.crossref.org | | 9894
+ apps.crossref.org | no-pdf-link | 9886
+ apps.euskadi.eus | | 242
+ apps.euskadi.eus | no-pdf-link | 240
+ arxiv.org | | 44889
+ arxiv.org | success | 28781
+ arxiv.org | spn2-backoff | 7975
+ arxiv.org | terminal-bad-status | 4508
+ arxiv.org | spn2-cdx-lookup-failure | 2010
+ arxiv.org | redirect-loop | 619
+ arxiv.org | no-pdf-link | 242
+ arxiv.org | spn2-error | 236
+ asa.scitation.org | | 356
+ asa.scitation.org | terminal-bad-status | 299
+ asmedigitalcollection.asme.org | | 240
+ assets.cureus.com | | 336
+ assets.cureus.com | success | 335
+ assets.researchsquare.com | | 1042
+ assets.researchsquare.com | success | 993
+ av.tib.eu | | 205
+ av.tib.eu | no-pdf-link | 203
+ bibliographie.uni-tuebingen.de | | 213
+ bibliographie.uni-tuebingen.de | no-pdf-link | 211
+ biorxiv.org | redirect-loop | 217
+ biorxiv.org | | 217
+ books.openedition.org | | 691
+ books.openedition.org | no-pdf-link | 687
+ boris.unibe.ch | | 525
+ boris.unibe.ch | success | 466
+ bridges.monash.edu | | 663
+ bridges.monash.edu | no-pdf-link | 647
+ brill.com | | 860
+ brill.com | success | 434
+ chemrxiv.org | | 201
+ classiques-garnier.com | | 242
+ content.iospress.com | | 325
+ content.iospress.com | link-loop | 247
+ core.tdar.org | | 216
+ core.tdar.org | no-pdf-link | 211
+ cyberleninka.ru | | 646
+ cyberleninka.ru | success | 620
+ d197for5662m48.cloudfront.net | | 263
+ d197for5662m48.cloudfront.net | success | 262
+ dergipark.org.tr | | 891
+ dergipark.org.tr | success | 526
+ dergipark.org.tr | no-pdf-link | 261
+ digi.ub.uni-heidelberg.de | | 427
+ digi.ub.uni-heidelberg.de | no-pdf-link | 427
+ direct.mit.edu | | 268
+ direct.mit.edu | no-pdf-link | 208
+ dl.acm.org | | 1719
+ dl.acm.org | success | 829
+ dl.acm.org | no-pdf-link | 546
+ dl.acm.org | terminal-bad-status | 205
+ dlc.library.columbia.edu | | 385
+ dlc.library.columbia.edu | terminal-bad-status | 319
+ doi.ala.org.au | | 724
+ doi.ala.org.au | no-pdf-link | 721
+ doi.apa.org | | 214
+ doi.org | | 3390
+ doi.org | terminal-bad-status | 2938
+ doi.org | redirect-loop | 233
+ doi.org | spn2-wayback-error | 208
+ doi.usp.org | | 325
+ doi.usp.org | no-pdf-link | 324
+ downloads.hindawi.com | | 1439
+ downloads.hindawi.com | success | 1436
+ du.diva-portal.org | | 589
+ du.diva-portal.org | success | 586
+ econtents.bc.unicamp.br | | 310
+ econtents.bc.unicamp.br | success | 310
+ ediss.uni-goettingen.de | | 728
+ ediss.uni-goettingen.de | success | 425
+ elibrary.kdpu.edu.ua | | 907
+ elibrary.kdpu.edu.ua | bad-redirect | 712
+ elibrary.ru | | 925
+ elibrary.ru | terminal-bad-status | 492
+ elibrary.ru | bad-redirect | 230
+ elibrary.vdi-verlag.de | | 393
+ elifesciences.org | | 296
+ elifesciences.org | success | 276
+ europepmc.org | | 3024
+ europepmc.org | success | 2541
+ europepmc.org | terminal-bad-status | 463
+ figshare.com | | 493
+ figshare.com | no-pdf-link | 440
+ files.osf.io | | 883
+ files.osf.io | success | 686
+ fjfsdata01prod.blob.core.windows.net | | 3869
+ fjfsdata01prod.blob.core.windows.net | success | 3818
+ ieeexplore.ieee.org | | 10854
+ ieeexplore.ieee.org | gateway-timeout | 5495
+ ieeexplore.ieee.org | spn2-backoff | 1662
+ ieeexplore.ieee.org | no-pdf-link | 1417
+ ieeexplore.ieee.org | success | 1410
+ ieeexplore.ieee.org | redirect-loop | 768
+ iiif.crossasia.org | | 7608
+ iiif.crossasia.org | no-pdf-link | 7568
+ ikee.lib.auth.gr | | 450
+ ikee.lib.auth.gr | success | 332
+ ins.journals.ekb.eg | | 212
+ iopscience.iop.org | | 268
+ jamanetwork.com | | 333
+ journals.aps.org | | 414
+ journals.asm.org | | 242
+ journals.flvc.org | | 245
+ journals.flvc.org | success | 242
+ journals.healio.com | | 755
+ journals.healio.com | terminal-bad-status | 668
+ journals.lincoln.ac.nz | | 244
+ journals.lincoln.ac.nz | success | 239
+ journals.lww.com | | 1772
+ journals.lww.com | link-loop | 1425
+ journals.lww.com | spn2-backoff | 209
+ journals.openedition.org | | 1192
+ journals.openedition.org | redirect-loop | 467
+ journals.openedition.org | success | 451
+ journals.plos.org | | 771
+ journals.plos.org | success | 750
+ journals.ub.uni-heidelberg.de | | 787
+ journals.ub.uni-heidelberg.de | success | 741
+ kazanmedjournal.ru | | 240
+ kazanmedjournal.ru | success | 231
+ kiss.kstudy.com | | 219
+ kiss.kstudy.com | no-pdf-link | 218
+ kluwerlawonline.com | | 444
+ kluwerlawonline.com | link-loop | 402
+ libraetd.lib.virginia.edu | | 362
+ libraetd.lib.virginia.edu | no-pdf-link | 361
+ link.springer.com | | 305
+ linkinghub.elsevier.com | | 568
+ linkinghub.elsevier.com | spn2-backoff | 545
+ ltu-figshare-repo.s3.aarnet.edu.au | | 269
+ ltu-figshare-repo.s3.aarnet.edu.au | success | 268
+ mausamjournal.imd.gov.in | | 202
+ mdpi-res.com | | 8892
+ mdpi-res.com | success | 8863
+ mededpublish.org | | 1900
+ mededpublish.org | no-pdf-link | 1900
+ meetingorganizer.copernicus.org | | 276
+ meetingorganizer.copernicus.org | no-pdf-link | 271
+ muse.jhu.edu | | 1047
+ muse.jhu.edu | terminal-bad-status | 755
+ muse.jhu.edu | link-loop | 203
+ online.ucpress.edu | | 358
+ online.ucpress.edu | link-loop | 212
+ onlinelibrary.wiley.com | | 5813
+ onlinelibrary.wiley.com | terminal-bad-status | 4587
+ onlinelibrary.wiley.com | spn2-wayback-error | 614
+ onlinelibrary.wiley.com | blocked-cookie | 381
+ open.library.ubc.ca | | 206
+ opendata.uni-halle.de | | 1768
+ opendata.uni-halle.de | success | 1215
+ opendata.uni-halle.de | wrong-mimetype | 260
+ opendata2.uni-halle.de | | 206
+ opg.optica.org | | 205
+ osf.io | | 2949
+ osf.io | no-pdf-link | 2404
+ osf.io | spn2-backoff | 299
+ papers.ssrn.com | | 3962
+ papers.ssrn.com | link-loop | 3800
+ peerj.com | | 273
+ preprints.jmir.org | | 275
+ preprints.jmir.org | cdx-error | 255
+ publikationen.bibliothek.kit.edu | | 213
+ publons.com | | 593
+ publons.com | no-pdf-link | 590
+ pubs.acs.org | | 2288
+ pubs.acs.org | terminal-bad-status | 1841
+ pubs.acs.org | spn2-wayback-error | 210
+ pubs.rsc.org | | 1698
+ pubs.rsc.org | bad-redirect | 811
+ pubs.rsc.org | link-loop | 352
+ pubs.rsc.org | success | 307
+ radiopaedia.org | | 220
+ read.dukeupress.edu | | 303
+ repositories.lib.utexas.edu | | 1570
+ repositories.lib.utexas.edu | bad-redirect | 513
+ repositories.lib.utexas.edu | spn2-backoff | 383
+ repositories.lib.utexas.edu | gateway-timeout | 379
+ repositories.lib.utexas.edu | terminal-bad-status | 282
+ repository.uj.ac.za | | 489
+ repository.uj.ac.za | no-pdf-link | 365
+ repository.unsworks.unsw.edu.au | | 397
+ repository.urosario.edu.co | | 2429
+ repository.urosario.edu.co | success | 1648
+ repository.urosario.edu.co | bad-redirect | 613
+ rex.libraries.wsu.edu | no-pdf-link | 241
+ rex.libraries.wsu.edu | | 241
+ rsdjournal.org | | 208
+ rsdjournal.org | success | 208
+ s3-ap-southeast-2.amazonaws.com | | 282
+ s3-ap-southeast-2.amazonaws.com | success | 277
+ s3-eu-west-1.amazonaws.com | | 4615
+ s3-eu-west-1.amazonaws.com | success | 4593
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240
+ s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237
+ sage.figshare.com | | 415
+ sage.figshare.com | no-pdf-link | 385
+ scholar.dkyobobook.co.kr | | 512
+ scholar.dkyobobook.co.kr | no-pdf-link | 509
+ scholarlypublishingcollective.org | | 287
+ scholarworks.gsu.edu | | 1132
+ scholarworks.gsu.edu | success | 1000
+ scholarworks.iupui.edu | | 205
+ scholarworks.umass.edu | | 417
+ scholarworks.umass.edu | success | 400
+ sciencescholar.us | | 404
+ secure.jbs.elsevierhealth.com | | 727
+ secure.jbs.elsevierhealth.com | terminal-bad-status | 722
+ tandf.figshare.com | | 354
+ tandf.figshare.com | no-pdf-link | 342
+ unsworks.unsw.edu.au | | 408
+ unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342
+ valep.vc.univie.ac.at | no-pdf-link | 737
+ valep.vc.univie.ac.at | | 737
+ watermark.silverchair.com | | 1604
+ watermark.silverchair.com | success | 1598
+ wayf.switch.ch | | 215
+ wayf.switch.ch | no-pdf-link | 213
+ www.ahajournals.org | | 438
+ www.ahajournals.org | no-pdf-link | 306
+ www.ahbps.org | | 316
+ www.ahbps.org | success | 312
+ www.atenaeditora.com.br | | 390
+ www.atenaeditora.com.br | terminal-bad-status | 333
+ www.atlantis-press.com | | 914
+ www.atlantis-press.com | success | 901
+ www.atsjournals.org | | 1245
+ www.atsjournals.org | success | 1189
+ www.biorxiv.org | | 712
+ www.biorxiv.org | success | 670
+ www.bloomsburycollections.com | | 982
+ www.bloomsburycollections.com | terminal-bad-status | 566
+ www.cahiers-clsl.ch | | 305
+ www.cahiers-clsl.ch | success | 298
+ www.cairn.info | | 1799
+ www.cairn.info | no-pdf-link | 662
+ www.cairn.info | link-loop | 487
+ www.cairn.info | success | 355
+ www.cairn.info | terminal-bad-status | 267
+ www.cambridge.org | | 3258
+ www.cambridge.org | no-pdf-link | 1682
+ www.cambridge.org | success | 682
+ www.cambridge.org | bad-redirect | 404
+ www.cambridge.org | link-loop | 302
+ www.dbpia.co.kr | | 763
+ www.dbpia.co.kr | no-pdf-link | 443
+ www.dbpia.co.kr | redirect-loop | 287
+ www.degruyter.com | | 12655
+ www.degruyter.com | no-pdf-link | 9112
+ www.degruyter.com | success | 2898
+ www.degruyter.com | spn2-backoff | 507
+ www.dib.ie | | 1381
+ www.dib.ie | no-pdf-link | 1378
+ www.dovepress.com | | 231
+ www.dovepress.com | success | 216
+ www.e-manuscripta.ch | | 767
+ www.e-manuscripta.ch | success | 399
+ www.e-periodica.ch | | 1406
+ www.e-periodica.ch | no-pdf-link | 1402
+ www.e-rara.ch | no-pdf-link | 251
+ www.e-rara.ch | | 251
+ www.editoracientifica.org | no-pdf-link | 205
+ www.editoracientifica.org | | 205
+ www.elgaronline.com | | 427
+ www.elibrary.ru | | 616
+ www.elibrary.ru | terminal-bad-status | 364
+ www.elibrary.ru | no-pdf-link | 216
+ www.emerald.com | | 862
+ www.emerald.com | no-pdf-link | 724
+ www.endocrine-abstracts.org | | 1907
+ www.endocrine-abstracts.org | no-pdf-link | 1905
+ www.eurekaselect.com | | 285
+ www.eurekaselect.com | link-loop | 246
+ www.even3.com.br | | 233
+ www.frontiersin.org | | 585
+ www.frontiersin.org | spn2-backoff | 436
+ www.humankineticslibrary.com | no-pdf-link | 207
+ www.humankineticslibrary.com | | 207
+ www.igi-global.com | | 1600
+ www.igi-global.com | no-pdf-link | 1199
+ www.igi-global.com | bad-redirect | 258
+ www.inderscience.com | | 385
+ www.inderscience.com | no-pdf-link | 365
+ www.inderscienceonline.com | | 202
+ www.ingentaconnect.com | | 450
+ www.ingentaconnect.com | no-pdf-link | 260
+ www.jstage.jst.go.jp | | 1248
+ www.jstage.jst.go.jp | success | 870
+ www.karger.com | | 313
+ www.liebertpub.com | | 271
+ www.liebertpub.com | no-pdf-link | 241
+ www.nicecjournal.co.uk | | 274
+ www.nicecjournal.co.uk | success | 274
+ www.nomos-elibrary.de | | 1771
+ www.nomos-elibrary.de | no-pdf-link | 788
+ www.nomos-elibrary.de | redirect-loop | 506
+ www.nomos-elibrary.de | bad-redirect | 207
+ www.osti.gov | | 381
+ www.osti.gov | link-loop | 326
+ www.persee.fr | | 277
+ www.preprints.org | | 225
+ www.preprints.org | success | 225
+ www.protocols.io | | 770
+ www.protocols.io | success | 485
+ www.repository.cam.ac.uk | | 510
+ www.repository.cam.ac.uk | bad-redirect | 213
+ www.research-collection.ethz.ch | | 416
+ www.research-collection.ethz.ch | bad-redirect | 249
+ www.researchsquare.com | | 1121
+ www.researchsquare.com | bad-redirect | 985
+ www.scielo.br | | 828
+ www.scielo.br | success | 641
+ www.sciencedirect.com | | 8567
+ www.sciencedirect.com | terminal-bad-status | 5773
+ www.sciencedirect.com | spn2-wayback-error | 1590
+ www.sciencedirect.com | no-pdf-link | 576
+ www.sciencedirect.com | spn2-backoff | 479
+ www.sciendo.com | | 257
+ www.sciendo.com | success | 222
+ www.scitepress.org | | 381
+ www.scitepress.org | no-pdf-link | 377
+ www.spiedigitallibrary.org | | 1061
+ www.spiedigitallibrary.org | bad-redirect | 571
+ www.spiedigitallibrary.org | gateway-timeout | 233
+ www.tandfonline.com | | 4934
+ www.tandfonline.com | no-pdf-link | 2088
+ www.tandfonline.com | terminal-bad-status | 1282
+ www.tandfonline.com | blocked-cookie | 757
+ www.tandfonline.com | redirect-loop | 488
+ www.tandfonline.com | spn2-wayback-error | 202
+ www.taylorfrancis.com | | 3979
+ www.taylorfrancis.com | link-loop | 1928
+ www.taylorfrancis.com | no-pdf-link | 1840
+ www.techniques-ingenieur.fr | | 354
+ www.techniques-ingenieur.fr | no-pdf-link | 353
+ www.thieme-connect.de | | 1987
+ www.thieme-connect.de | no-pdf-link | 949
+ www.thieme-connect.de | link-loop | 869
+ www.tib.eu | no-pdf-link | 315
+ www.tib.eu | | 315
+ www.un-ilibrary.org | no-pdf-link | 352
+ www.un-ilibrary.org | | 352
+ www.worldscientific.com | | 668
+ www.worldscientific.com | no-pdf-link | 629
+ www.zora.uzh.ch | | 318
+ zenodo.org | | 46585
+ zenodo.org | no-pdf-link | 29519
+ zenodo.org | success | 14768
+ zenodo.org | terminal-bad-status | 810
+ zenodo.org | wrong-mimetype | 691
+ zenodo.org | spn2-cdx-lookup-failure | 395
+ zenodo.org | spn2-backoff | 294
+ zivahub.uct.ac.za | | 1909
+ zivahub.uct.ac.za | no-pdf-link | 1880
+ zop.zb.uzh.ch | | 228
+ zop.zb.uzh.ch | success | 217
+ | | 365582
+ | success | 141497 38.7%
+ | no-pdf-link | 120852 33.0%
+ | terminal-bad-status | 31900 8.7%
+ | spn2-backoff | 16979 4.6%
+ | link-loop | 13624 3.7%
+ | bad-redirect | 8736
+ | redirect-loop | 7405
+ | gateway-timeout | 6997
+ | spn2-cdx-lookup-failure | 5146
+ | spn2-wayback-error | 3708
+ | wrong-mimetype | 2158
+ | blocked-cookie | 1942
+ | spn2-error:blocked-url | 1733
+ | wayback-error | 1063
+ | spn2-error | 647
+ | spn2-error:500 | 265
+ | cdx-error | 257
+(383 rows)
+
+----
+
+365k in 7 days is about 52k a day, which is about expected. Around 5-7% need
+retries.
+
+important changes:
+- biorxiv.org: needs fix and then retries
+- academic.oup.com: should probably skip
+- apps.crossref.org: need to handle this in code
+- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/`
+- doi.org: investigate redirect-loop and terminal-bad-status
+- osf.io: not getting PDFs
+- papers.ssrn.com: why are these attempted?
+- publons.com: not getting PDFs; special case these?
+- www.sciencedirect.com: not working at all?
+
+smaller:
+- bridges.monash.edu: fix, then retry?
+- dl.acm.org: some broader retries?
+- figshare.com: still some attempts, but almost all no-pdf-link
+- onlinelibrary.wiley.com: getting blocked broadly?
+- www.endocrine-abstracts.org: HTML content?
+- www.igi-global.com: no-pdf-link
diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt
new file mode 100644
index 0000000..be2b30c
--- /dev/null
+++ b/sql/stats/2022-09-06_stats.txt
@@ -0,0 +1,438 @@
+
+## SQL Table Sizes
+
+ SELECT
+ table_name,
+ pg_size_pretty(table_size) AS table_size,
+ pg_size_pretty(indexes_size) AS indexes_size,
+ pg_size_pretty(total_size) AS total_size
+ FROM (
+ SELECT
+ table_name,
+ pg_table_size(table_name) AS table_size,
+ pg_indexes_size(table_name) AS indexes_size,
+ pg_total_relation_size(table_name) AS total_size
+ FROM (
+ SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name
+ FROM information_schema.tables
+ WHERE table_schema = 'public'
+ ) AS all_tables
+ ORDER BY total_size DESC
+ ) AS pretty_sizes;
+
+
+ table_name | table_size | indexes_size | total_size
+ ------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 62 GB | 44 GB | 106 GB
+ "public"."ingest_request" | 51 GB | 50 GB | 101 GB
+ "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB
+ "public"."file_meta" | 39 GB | 39 GB | 78 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7469 MB | 66 MB | 7535 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes
+ (16 rows)
+
+
+## File Metadata
+
+Counts and total file size:
+
+ SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta;
+
+ total_count | total_size
+ -------------+-----------------
+ 198175106 | 282695671015403
+ (1 row)
+
+ 198 million files, 282 TBytes.
+
+Top mimetypes:
+
+ SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30;
+
+ mimetype | count
+ ---------------------------------------------------------------------------+-----------
+ application/pdf | 197021437
+ text/html | 830331
+ application/octet-stream | 186669
+ application/xml | 42170
+ application/xhtml+xml | 38207
+ text/plain | 16471
+ application/jats+xml | 10385
+ application/gzip | 6681
+ | 6032
+ application/postscript | 4916
+ image/jpeg | 4522
+ application/vnd.ms-powerpoint | 1672
+ application/msword | 946
+ application/x-bzip2 | 891
+ image/png | 659
+ application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440
+ application/x-dosexec | 404
+ image/gif | 395
+ application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382
+ application/x-compress | 274
+ video/mp4 | 218
+ application/zip | 131
+ application/CDFV2-unknown | 99
+ application/mac-binhex40 | 79
+ application/zlib | 68
+ text/x-tex | 44
+ application/vnd.openxmlformats-officedocument.presentationml.presentation | 39
+ text/x-php | 37
+ image/g3fax | 35
+ text/rtf | 33
+ (30 rows)
+
+Missing full metadata:
+
+ SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL;
+
+ count
+ -------
+ 12800
+ (1 row)
+
+## CDX
+
+Total and unique-by-sha1 counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx;
+
+ unique_sha1 | total
+ -------------+-----------
+ 137283420 | 172140506
+ (1 row)
+
+mimetype counts:
+
+ SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30;
+
+ mimetype | count
+ ----------------------------+-----------
+ application/pdf | 157465613
+ warc/revisit | 11337336
+ text/html | 1137208
+ application/octet-stream | 950380
+ text/xml | 528965
+ unk | 253294
+ application/postscript | 81130
+ application/save | 81069
+ binary/octet-stream | 68942
+ application/x-download | 42717
+ application/download | 40628
+ image/pdf | 39904
+ text/plain | 36445
+ application/force-download | 24148
+ multipart/form-data | 10972
+ application | 5409
+ application/x-octetstream | 5192
+ application/x-msdownload | 3854
+ .pdf | 3518
+ application/x-pdf | 3061
+ application/octet | 1792
+ pdf | 1757
+ application/binary | 1399
+ file | 1373
+ file/unknown | 1345
+ application/pdf' | 1196
+ application/octetstream | 1087
+ application/unknown | 1005
+ 0 | 773
+ text/pdf | 729
+ (30 rows)
+
+## GROBID
+
+Counts:
+
+ SELECT COUNT(*) AS total_files FROM grobid;
+
+ total_files
+ -------------
+ 129001717
+ (1 row)
+
+Status?
+
+ SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25;
+
+ status_code | count
+ -------------+-----------
+ 200 | 120797098
+ 500 | 8198783
+ -4 | 5802
+ 503 | 36
+ (4 rows)
+
+What version used?
+
+ SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25;
+
+ grobid_version | count
+ ----------------------+----------
+ 0.7.0-131-gdd0251d9f | 60469462
+ 0.5.5-fatcat | 47472904
+ | 12665498
+ 0.7.0-104-gbeebd9a6b | 189243
+ (4 rows)
+
+
+## Petabox
+
+Counts:
+
+ SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox;
+
+ unique_sha1 | total
+ -------------+---------
+ 2868825 | 2887834
+ (1 row)
+
+## Ingests
+
+Requests by source:
+
+ SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25;
+
+ ingest_type | link_source | count
+ -------------+-----------------+----------
+ pdf | oai | 51185088
+ pdf | unpaywall | 43932525
+ pdf | doi | 43852308
+ pdf | mag | 43701948
+ pdf | doaj | 6534341
+ html | doaj | 3987669
+ pdf | arxiv | 2784589
+ pdf | pmc | 2439181
+ pdf | dblp | 631716
+ html | doi | 126699
+ xml | doaj | 23066
+ pdf | cnki_covid19 | 2034
+ pdf | spn | 1026
+ pdf | wanfang_covid19 | 975
+ html | spn | 65
+ xml | spn | 2
+ xml | doi | 1
+ (17 rows)
+
+ SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | ingest_request_source | count
+ -------------+-----------------+-------------------------+----------
+ pdf | oai | metha-bulk | 51185088
+ pdf | unpaywall | unpaywall | 43932525
+ pdf | mag | mag-corpus | 43701948
+ pdf | doi | fatcat-changelog | 24742500
+ pdf | doi | fatcat-ingest | 15592121
+ pdf | doaj | doaj | 6484737
+ html | doaj | doaj | 3987468
+ pdf | doi | fatcat-ingest-container | 3515873
+ pdf | pmc | fatcat-ingest-container | 2028825
+ pdf | arxiv | fatcat-ingest | 1984766
+ pdf | arxiv | fatcat-changelog | 799793
+ pdf | dblp | dblp | 631716
+ pdf | pmc | fatcat-ingest | 297980
+ html | doi | fatcat-ingest | 121508
+ pdf | pmc | fatcat-changelog | 112376
+ pdf | doaj | fatcat-changelog | 47181
+ xml | doaj | doaj | 23066
+ html | doi | fatcat-changelog | 5129
+ pdf | doaj | fatcat-ingest | 2423
+ pdf | cnki_covid19 | scrape-covid19 | 2034
+ pdf | doi | savepapernow-web | 1814
+ pdf | spn | savepapernow-web | 1026
+ pdf | wanfang_covid19 | scrape-covid19 | 975
+ html | doaj | fatcat-ingest | 201
+ html | spn | savepapernow-web | 65
+ html | doi | savepapernow-web | 62
+ pdf | arxiv | fatcat-ingest-container | 26
+ pdf | arxiv | savepapernow-web | 4
+ xml | spn | savepapernow-web | 2
+ xml | doi | savepapernow-web | 1
+ (30 rows)
+
+Uncrawled requests by source:
+
+ # TODO: verify this?
+ SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*)
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ WHERE ingest_file_result.base_url IS NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35;
+
+
+ ingest_type | link_source | count
+ -------------+-------------+--------
+ pdf | mag | 167653
+ pdf | doaj | 81517
+ pdf | oai | 15282
+ html | doaj | 1791
+ pdf | unpaywall | 270
+ pdf | doi | 22
+ (6 rows)
+
+Results by source:
+
+ SELECT
+ ingest_request.ingest_type,
+ ingest_request.link_source,
+ COUNT(*) as attempts,
+ COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,
+ ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction
+ FROM ingest_request
+ LEFT JOIN ingest_file_result
+ ON ingest_request.base_url = ingest_file_result.base_url
+ AND ingest_request.ingest_type = ingest_file_result.ingest_type
+ AND ingest_file_result.ingest_type IS NOT NULL
+ GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35;
+
+
+ ingest_type | link_source | attempts | hits | fraction
+ -------------+-----------------+----------+----------+----------
+ pdf | oai | 51185088 | 16024068 | 0.313
+ pdf | unpaywall | 43932525 | 36045446 | 0.820
+ pdf | doi | 43852308 | 14956080 | 0.341
+ pdf | mag | 43701948 | 32768484 | 0.750
+ pdf | doaj | 6534341 | 4704066 | 0.720
+ html | doaj | 3987669 | 778165 | 0.195
+ pdf | arxiv | 2784589 | 2419941 | 0.869
+ pdf | pmc | 2439181 | 1897671 | 0.778
+ pdf | dblp | 631716 | 305142 | 0.483
+ html | doi | 126699 | 75754 | 0.598
+ xml | doaj | 23066 | 10381 | 0.450
+ pdf | cnki_covid19 | 2034 | 0 | 0.000
+ pdf | spn | 1026 | 778 | 0.758
+ pdf | wanfang_covid19 | 975 | 764 | 0.784
+ html | spn | 65 | 13 | 0.200
+ xml | spn | 2 | 1 | 0.500
+ xml | doi | 1 | 0 | 0.000
+ (17 rows)
+
+Ingest result by status:
+
+ SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | status | count
+ -------------+-------------------------------+----------
+ pdf | success | 94887295
+ pdf | no-pdf-link | 33960080
+ pdf | no-capture | 20893916
+ pdf | terminal-bad-status | 6973765
+ pdf | redirect-loop | 5775175
+ pdf | link-loop | 4095424
+ pdf | skip-url-blocklist | 4037518
+ pdf | blocked-cookie | 3508762
+ html | wrong-scope | 1783694
+ pdf | wrong-mimetype | 1379673
+ html | success | 853762
+ pdf | gateway-timeout | 635170
+ html | no-capture | 381283
+ pdf | wayback-content-error | 356694
+ pdf | cdx-error | 347700
+ pdf | null-body | 336166
+ html | unknown-scope | 321874
+ html | html-resource-no-capture | 294294
+ pdf | forbidden | 291127
+ pdf | not-found | 274343
+ pdf | too-many-redirects | 264494
+ component | wrong-mimetype | 196680
+ component | spn2-cdx-lookup-failure | 173615
+ component | spn2-backoff | 115840
+ html | terminal-bad-status | 106264
+ html | null-body | 100296
+ pdf | wayback-error | 94748
+ html | blocked-cookie | 88537
+ component | no-capture | 75278
+ pdf | empty-blob | 61157
+ pdf | bad-redirect | 58680
+ pdf | skip-wall | 57751
+ pdf | spn2-error:too-many-redirects | 52873
+ html | spn2-backoff | 50577
+ pdf | remote-server-error | 41282
+ pdf | invalid-host-resolution | 38864
+ pdf | read-timeout | 37071
+ pdf | spn2-cdx-lookup-failure | 34229
+ html | wrong-mimetype | 33643
+ pdf | spn2-backoff | 32437
+ pdf | petabox-error | 31006
+ html | wayback-content-error | 28034
+ component | spn2-error | 27044
+ pdf | spn2-error:unknown | 25810
+ component | gateway-timeout | 25215
+ pdf | body-too-large | 21721
+ html | petabox-error | 18313
+ html | empty-blob | 14393
+ html | redirect-loop | 13404
+ component | blocked-cookie | 12287
+ (50 rows)
+
+Failed ingest by terminal status code:
+
+ SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50;
+
+ ingest_type | terminal_status_code | count
+ -------------+----------------------+----------
+ pdf | 200 | 45052391
+ pdf | | 26117481
+ pdf | 301 | 4814786
+ html | 200 | 2684821
+ pdf | 403 | 1871088
+ pdf | 404 | 1254259
+ pdf | 302 | 898728
+ pdf | 503 | 867548
+ pdf | 401 | 851205
+ pdf | 429 | 741869
+ pdf | 400 | 624519
+ component | | 456915
+ html | | 442051
+ pdf | 500 | 283700
+ component | 200 | 197510
+ pdf | 410 | 120647
+ pdf | 303 | 107947
+ html | 404 | 80114
+ pdf | 420 | 26722
+ pdf | 502 | 19500
+ pdf | 409 | 15499
+ html | 429 | 15208
+ pdf | 509 | 15167
+ pdf | 999 | 12186
+ pdf | 202 | 11535
+ html | 301 | 10213
+ xml | | 10018
+ pdf | 307 | 8657
+ pdf | 402 | 8338
+ pdf | 412 | 8064
+ pdf | 308 | 6479
+ html | 500 | 4746
+ xml | 200 | 2668
+ pdf | 520 | 2496
+ html | 302 | 2289
+ pdf | 521 | 2257
+ html | 202 | 2177
+ pdf | 206 | 1961
+ html | 403 | 1775
+ pdf | 504 | 1187
+ pdf | 421 | 1148
+ html | 303 | 1112
+ pdf | 406 | 1109
+ pdf | 204 | 772
+ pdf | 432 | 745
+ pdf | 405 | 633
+ html | 400 | 632
+ pdf | 426 | 515
+ pdf | 508 | 503
+ pdf | 505 | 469
+ (50 rows)
diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt
new file mode 100644
index 0000000..0a6254a
--- /dev/null
+++ b/sql/stats/2022-11-23_table_sizes.txt
@@ -0,0 +1,21 @@
+PostgreSQL 13.2 - wbgrp-svc506.us.archive.org
+Size: 1.13T
+
+ table_name | table_size | indexes_size | total_size
+------------------------------------+------------+--------------+------------
+ "public"."crossref" | 459 GB | 10 GB | 470 GB
+ "public"."grobid" | 98 GB | 13 GB | 112 GB
+ "public"."cdx" | 63 GB | 45 GB | 108 GB
+ "public"."ingest_request" | 53 GB | 52 GB | 105 GB
+ "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB
+ "public"."file_meta" | 39 GB | 40 GB | 79 GB
+ "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB
+ "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB
+ "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB
+ "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB
+ "public"."shadow" | 9517 MB | 8026 MB | 17 GB
+ "public"."html_meta" | 7879 MB | 68 MB | 7947 MB
+ "public"."petabox" | 403 MB | 461 MB | 864 MB
+ "public"."pdftrio" | 550 MB | 297 MB | 847 MB
+ "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB
+ "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes