diff options
Diffstat (limited to 'sql')
37 files changed, 3265 insertions, 192 deletions
diff --git a/sql/Makefile b/sql/Makefile new file mode 100644 index 0000000..860addb --- /dev/null +++ b/sql/Makefile @@ -0,0 +1,35 @@ + +SHELL=/bin/bash -euo pipefail +TODAY ?= $(shell date --iso --utc) +DATADIR ?= /srv/sandcrawler/tasks/$(TODAY) +DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) +DATABASE_URL ?= sandcrawler + +.PHONY: help +help: ## Print info about all commands + @echo "Commands:" + @echo + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: create_datadir +create_datadir: + mkdir -p $(DATADIR)/ + sudo chmod a+rw $(DATADIR)/ + +$(DATADIR)/.DB_DUMP: + sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip + mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump + touch $@ + +.PHONY: database-snapshot +database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot + @echo + +$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP + ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))" + ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump + touch $@ + +.PHONY: upload-database-snapshot +upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org + @echo diff --git a/sql/README.md b/sql/README.md index 42dba31..e488006 100644 --- a/sql/README.md +++ b/sql/README.md @@ -139,10 +139,30 @@ Questions we might want to answer http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 -## Full Database Dumps -Run a dump in compressed, postgres custom format: +## Full SQL Database Dumps + +Run a dump in compressed, postgres custom format, not including `crossref` table (which is large and redundant): export DATESLUG="`date +%Y-%m-%d.%H%M%S`" - time sudo -u postgres pg_dump --verbose --format=custom sandcrawler > /sandcrawler-db/snapshots/sandcrawler_full_dbdump_${DATESLUG}.pgdump + time sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump + +As of 2021-12-03, this process runs for about 6 hours and the compressed +snapshot is 102 GBytes (compared with 940GB database disk consumption, +including crossref). + +Then, upload to petabox as a backup: + + ia upload sandcrawler_full_dbdump_YYYY-MM-DD -m mediatype:data -m collection:webgroup-internal-backups -m title:"Sandcrawler SQL Dump (YYYY-MM-DD)" sandcrawler_full_dbdump_${DATESLUG}.pgdump + + +## SQL Database Restore + +To restore a dump (which will delete local database content, if any): + + sudo su postgres + createuser --no-login web_anon + createuser -s sandcrawler + time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump +Took about 2.5 hours. diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md index f1a5f86..4a56065 100644 --- a/sql/backfill/backfill.md +++ b/sql/backfill/backfill.md @@ -76,6 +76,19 @@ In psql: COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); # => COPY 24727350 +In 2021-11-26: + + zcat file_export.json.gz \ + | pv -l \ + | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \ + | sort -S 8G \ + | uniq -w 40 \ + | pigz \ + > fatcat_file.2021-11-26.tsv.gz + + COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 112086814 + ## `file_meta` zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql index 1028c13..a7d6c2b 100644 --- a/sql/dump_file_meta.sql +++ b/sql/dump_file_meta.sql @@ -6,7 +6,7 @@ COPY ( FROM file_meta ORDER BY sha1hex ASC ) -TO '/grande/snapshots/file_meta_dump.tsv' +TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql index 3ca8085..e7c48f3 100644 --- a/sql/dump_regrobid_pdf_petabox.sql +++ b/sql/dump_regrobid_pdf_petabox.sql @@ -9,7 +9,7 @@ COPY ( SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) ) -TO '/grande/snapshots/dump_regrobid_pdf_petabox.2020-02-03.json' +TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql new file mode 100644 index 0000000..698db7a --- /dev/null +++ b/sql/dump_reingest_bulk.sql @@ -0,0 +1,31 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '24 hour'::INTERVAL + AND ingest_request.created > NOW() - '181 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status like 'cdx-error' + OR ingest_file_result.status like 'petabox-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql new file mode 100644 index 0000000..7473420 --- /dev/null +++ b/sql/dump_reingest_old.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_file_result.hit = false + AND ingest_request.created < NOW() - '6 day'::INTERVAL + -- AND ingest_request.created > NOW() - '181 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container' + OR ingest_request.ingest_request_source = 'unpaywall' + OR ingest_request.ingest_request_source = 'arxiv' + OR ingest_request.ingest_request_source = 'pmc' + OR ingest_request.ingest_request_source = 'doaj' + OR ingest_request.ingest_request_source = 'dblp') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status like 'no-capture' + -- OR ingest_file_result.status like 'cdx-error' + -- OR ingest_file_result.status like 'petabox-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql index 303824b..dbeb199 100644 --- a/sql/dump_reingest_quarterly.sql +++ b/sql/dump_reingest_quarterly.sql @@ -1,78 +1,47 @@ -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '91 day'::INTERVAL - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'spn2-%' - AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' - AND ingest_file_result.status != 'spn2-error:filesize-limit' - AND ingest_file_result.status != 'spn2-wayback-error' -) TO '/grande/snapshots/reingest_quarterly_spn2-error_current.rows.json'; - -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'cdx-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '91 day'::INTERVAL - AND (ingest_request.ingest_request_source = 'fatcat-changelog' - OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_cdx-error_current.rows.json'; - -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'cdx-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '91 day'::INTERVAL - AND (ingest_request.ingest_request_source != 'fatcat-changelog' - AND ingest_request.ingest_request_source != 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json'; - -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'wayback-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '91 day'::INTERVAL -) TO '/grande/snapshots/reingest_quarterly_wayback-error_current.rows.json'; +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') AND ingest_file_result.hit = false - AND ingest_file_result.status like 'gateway-timeout' AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '91 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' - OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_gateway-timeout.rows.json'; + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container' + OR ingest_request.ingest_request_source = 'unpaywall' + OR ingest_request.ingest_request_source = 'arxiv' + OR ingest_request.ingest_request_source = 'pmc' + OR ingest_request.ingest_request_source = 'doaj' + OR ingest_request.ingest_request_source = 'dblp') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_quarterly_current.rows.json'; -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'petabox-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '91 day'::INTERVAL - AND (ingest_request.ingest_request_source = 'fatcat-changelog' - OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_quarterly_petabox-error_current.rows.json'; +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') +ROLLBACK; diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql new file mode 100644 index 0000000..a83125c --- /dev/null +++ b/sql/dump_reingest_spn.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created > NOW() - '180 day'::INTERVAL + AND ingest_request.ingest_request_source = 'savepapernow-web' + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status = 'cdx-error' + -- OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + -- OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql new file mode 100644 index 0000000..b72a096 --- /dev/null +++ b/sql/dump_reingest_terminalstatus.sql @@ -0,0 +1,34 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_file_result.hit = false + AND ingest_request.created < NOW() - '72 hour'::INTERVAL + AND ingest_request.created > NOW() - '10 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 404 + ) + AND ( + ingest_request.base_url LIKE 'https://doi.org/10.3390/%' + OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%' + OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%' + ) +) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql index 28547a4..a019938 100644 --- a/sql/dump_reingest_weekly.sql +++ b/sql/dump_reingest_weekly.sql @@ -1,78 +1,42 @@ -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '8 day'::INTERVAL - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'spn2-%' - AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' - AND ingest_file_result.status != 'spn2-error:filesize-limit' - AND ingest_file_result.status != 'spn2-wayback-error' -) TO '/grande/snapshots/reingest_weekly_spn2-error_current.rows.json'; - -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'cdx-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '8 day'::INTERVAL - AND (ingest_request.ingest_request_source = 'fatcat-changelog' - OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_cdx-error_current.rows.json'; - -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'cdx-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '8 day'::INTERVAL - AND (ingest_request.ingest_request_source != 'fatcat-changelog' - AND ingest_request.ingest_request_source != 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json'; - -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'wayback-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '8 day'::INTERVAL -) TO '/grande/snapshots/reingest_weekly_wayback-error_current.rows.json'; +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') AND ingest_file_result.hit = false - AND ingest_file_result.status like 'gateway-timeout' AND ingest_request.created < NOW() - '8 hour'::INTERVAL AND ingest_request.created > NOW() - '8 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' - OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_gateway-timeout.rows.json'; + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status = 'cdx-error' + -- OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + -- OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; -COPY ( - SELECT row_to_json(ingest_request.*) FROM ingest_request - LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.ingest_type = 'pdf' - AND ingest_file_result.ingest_type = 'pdf' - AND ingest_file_result.hit = false - AND ingest_file_result.status like 'petabox-error' - AND ingest_request.created < NOW() - '8 hour'::INTERVAL - AND ingest_request.created > NOW() - '8 day'::INTERVAL - AND (ingest_request.ingest_request_source = 'fatcat-changelog' - OR ingest_request.ingest_request_source = 'fatcat-ingest') -) TO '/grande/snapshots/reingest_weekly_petabox-error_current.rows.json'; +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') +ROLLBACK; diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql index fb4b0af..a7fb920 100644 --- a/sql/dump_unextracted_pdf.sql +++ b/sql/dump_unextracted_pdf.sql @@ -16,7 +16,7 @@ COPY ( AND ingest_file_result.terminal_sha1hex IS NOT NULL AND pdf_meta.sha1hex IS NULL ) -TO '/grande/snapshots/dump_unextracted_pdf.ingest.2020-10-21.json' +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql index 7db34fb..bb9f162 100644 --- a/sql/dump_unextracted_pdf_petabox.sql +++ b/sql/dump_unextracted_pdf_petabox.sql @@ -12,7 +12,7 @@ COPY ( WHERE petabox.sha1hex IS NOT NULL AND pdf_meta.sha1hex IS NULL ) -TO '/grande/snapshots/dump_unextracted_pdf_petabox.2020-07-22.json' +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql index e65edd5..81caf18 100644 --- a/sql/dump_ungrobid_pdf.sql +++ b/sql/dump_ungrobid_pdf.sql @@ -12,7 +12,7 @@ COPY ( -- uncomment/comment this to control whether only fatcat files are included --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) ) -TO '/grande/snapshots/dump_ungrobided_pdf.fatcat.2020-08-04.json' +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql index f758ec2..b7a1db2 100644 --- a/sql/dump_ungrobid_pdf_petabox.sql +++ b/sql/dump_ungrobid_pdf_petabox.sql @@ -11,7 +11,7 @@ COPY ( -- uncomment/comment this to control whether only fatcat files are included AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) ) -TO '/grande/snapshots/dump_ungrobided_pdf_petabox.2020-08-04.json' +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql index d089c7e..333ff7b 100644 --- a/sql/dump_unmatched_glutton_pdf.sql +++ b/sql/dump_unmatched_glutton_pdf.sql @@ -12,7 +12,7 @@ COPY ( AND grobid.fatcat_release IS NOT NULL LIMIT 1000 ) -TO '/grande/snapshots/dump_unmatched_glutton_pdf.2020-06-30.json'; +TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; --TO STDOUT --WITH NULL ''; diff --git a/sql/ingest_again.md b/sql/ingest_again.md index 3b4b990..b749557 100644 --- a/sql/ingest_again.md +++ b/sql/ingest_again.md @@ -12,7 +12,7 @@ AND ingest_file_result.status like 'spn2-%' AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' - ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -25,7 +25,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -38,7 +38,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source != 'fatcat-changelog' AND ingest_request.ingest_request_source != 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -49,7 +49,7 @@ AND ingest_file_result.status like 'wayback-error' AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL - ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -62,7 +62,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -75,16 +75,16 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json'; Transform: - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json Push to kafka (shuffled): @@ -122,10 +122,10 @@ Push to kafka (not shuffled): AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' AND ingest_request.ingest_request_source = 'fatcat-ingest' - ) TO '/grande/snapshots/reingest_fatcat_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json'; # note: shuf - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 73bd7f1..33dba66 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -42,7 +42,9 @@ CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); CREATE TABLE IF NOT EXISTS fatcat_file ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), file_ident TEXT CHECK (octet_length(file_ident) = 26), - first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26), + any_url BOOLEAN, + content_scope TEXT CHECK (octet_length(content_scope) >= 1) ); CREATE TABLE IF NOT EXISTS petabox ( @@ -147,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request ( PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) ); CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); +CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created); CREATE TABLE IF NOT EXISTS ingest_file_result ( ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), @@ -154,7 +157,7 @@ CREATE TABLE IF NOT EXISTS ingest_file_result ( updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, hit BOOLEAN NOT NULL, - status TEXT CHECK (octet_length(terminal_url) >= 1), + status TEXT CHECK (octet_length(status) >= 1), terminal_url TEXT CHECK (octet_length(terminal_url) >= 1), terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14), terminal_status_code INT, @@ -165,6 +168,43 @@ CREATE TABLE IF NOT EXISTS ingest_file_result ( CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); +CREATE TABLE IF NOT EXISTS ingest_fileset_platform ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + + platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1), + platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1), + platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1), + ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1), + total_size BIGINT, + file_count BIGINT, + archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1), + + archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1), + web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1), + web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14), + + manifest JSONB, + -- list, similar to fatcat fileset manifest, plus extra: + -- status (str) + -- path (str) + -- size (int) + -- md5 (str) + -- sha1 (str) + -- sha256 (str) + -- mimetype (str) + -- extra (dict) + -- platform_url (str) + -- terminal_url (str) + -- terminal_dt (str) + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id); + CREATE TABLE IF NOT EXISTS shadow ( shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), @@ -175,3 +215,31 @@ CREATE TABLE IF NOT EXISTS shadow ( PRIMARY KEY(shadow_corpus, shadow_id) ); CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); + +CREATE TABLE IF NOT EXISTS crossref ( + doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)), + indexed TIMESTAMP WITH TIME ZONE NOT NULL, + record JSON NOT NULL, + PRIMARY KEY(doi) +); + +CREATE TABLE IF NOT EXISTS grobid_refs ( + source TEXT NOT NULL CHECK (octet_length(source) >= 1), + source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1), + source_ts TIMESTAMP WITH TIME ZONE, + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + refs_json JSON NOT NULL, + PRIMARY KEY(source, source_id) +); + +CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS + SELECT + crossref.doi as doi, + crossref.indexed as indexed, + crossref.record as record, + grobid_refs.source_ts as source_ts, + grobid_refs.refs_json as refs_json + FROM crossref + LEFT JOIN grobid_refs ON + grobid_refs.source_id = crossref.doi + AND grobid_refs.source = 'crossref'; diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md index cf3b190..0859e79 100644 --- a/sql/monitoring_queries.md +++ b/sql/monitoring_queries.md @@ -168,3 +168,35 @@ Overall status, updated requests past 3 days: GROUP BY ingest_request.ingest_type, ingest_file_result.status ORDER BY COUNT(*) DESC; +## savepapernow and fatcat-ingest recent status + +Specific recent ingests (for debugging): + + -- for record layout: \x + SELECT + ingest_file_result.status as status, + ingest_request.ingest_type as ingest_type, + ingest_request.ingest_request_source as source, + ingest_request.link_source_id as source_id, + ingest_request.base_url as base_url, + ingest_file_result.terminal_dt as dt, + ingest_file_result.terminal_status_code as status_code, + ingest_file_result.terminal_sha1hex as sha1hex, + grobid.status as grobid_status + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid + ON ingest_file_result.terminal_sha1hex = grobid.sha1hex + WHERE + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + -- AND ingest_request.ingest_type = 'pdf' + -- AND ingest_request.ingest_type = 'html' + AND ( + ingest_request.ingest_request_source = 'savepapernow-web' + -- OR ingest_request.ingest_request_source = 'fatcat-ingest' + ) + ORDER BY ingest_file_result.updated DESC + LIMIT 100; + diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh new file mode 100755 index 0000000..d39a171 --- /dev/null +++ b/sql/reingest_bulk.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_bulk_current.json + +cat /srv/sandcrawler/tasks/reingest_bulk_current.json \ + | shuf \ + | head -n1000000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh new file mode 100755 index 0000000..96e5416 --- /dev/null +++ b/sql/reingest_old.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_old.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_old_current.json + +cat /srv/sandcrawler/tasks/reingest_old_current.json \ + | shuf \ + | head -n1000000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh index 44a22b3..8a2996c 100755 --- a/sql/reingest_quarterly.sh +++ b/sql/reingest_quarterly.sh @@ -7,14 +7,13 @@ set -o pipefail # fail if part of a '|' command fails sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql cd ../python -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_spn2-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_cdx-error_bulk_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_wayback-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_quarterly_gateway-timeout.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_quarterly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_quarterly_petabox-error_current.json +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_quarterly_current.json -cat /grande/snapshots/reingest_quarterly_spn2-error_current.json /grande/snapshots/reingest_quarterly_cdx-error_current.json /grande/snapshots/reingest_quarterly_wayback-error_current.json /grande/snapshots/reingest_quarterly_petabox-error_current.json /grande/snapshots/reingest_quarterly_gateway-timeout.json | shuf | head -n100000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 - -cat /grande/snapshots/reingest_quarterly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \ + | shuf \ + | head -n120000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh new file mode 100755 index 0000000..c693a64 --- /dev/null +++ b/sql/reingest_spn.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_spn.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \ + > /srv/sandcrawler/tasks/reingest_spn.json + +cat /srv/sandcrawler/tasks/reingest_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh new file mode 100755 index 0000000..5cb6d51 --- /dev/null +++ b/sql/reingest_terminalstatus_forcerecrawl.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json + +cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \ + | shuf \ + | head -n100000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh index dfd4869..d2e2444 100755 --- a/sql/reingest_weekly.sh +++ b/sql/reingest_weekly.sh @@ -7,14 +7,13 @@ set -o pipefail # fail if part of a '|' command fails sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql cd ../python -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_spn2-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_spn2-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_cdx-error_bulk_current.rows.json | shuf > /grande/snapshots/reingest_weekly_cdx-error_bulk_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_wayback-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_wayback-error_current.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_gateway-timeout.rows.json | shuf > /grande/snapshots/reingest_weekly_gateway-timeout.json -pipenv run ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_weekly_petabox-error_current.rows.json | shuf > /grande/snapshots/reingest_weekly_petabox-error_current.json +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_weekly_current.json -cat /grande/snapshots/reingest_weekly_spn2-error_current.json /grande/snapshots/reingest_weekly_cdx-error_current.json /grande/snapshots/reingest_weekly_wayback-error_current.json /grande/snapshots/reingest_weekly_petabox-error_current.json /grande/snapshots/reingest_weekly_gateway-timeout.json | shuf | head -n40000 | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 - -cat /grande/snapshots/reingest_weekly_cdx-error_bulk.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 +cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ + | shuf \ + | head -n80000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt new file mode 100644 index 0000000..fca76b9 --- /dev/null +++ b/sql/stats/2021-04-07_stats.txt @@ -0,0 +1,430 @@ + +## SQL Table Sizes + + Size: 551.34G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 50 GB | 100 GB + "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB + "public"."ingest_request" | 39 GB | 45 GB | 83 GB + "public"."grobid" | 70 GB | 8613 MB | 78 GB + "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB + "public"."file_meta" | 35 GB | 31 GB | 66 GB + "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB + "public"."shadow" | 9517 MB | 10 GB | 20 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."html_meta" | 1172 MB | 10 MB | 1182 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (12 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 174200807 | 234313766162033 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 173816433 + application/octet-stream | 155534 + text/html | 115821 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 721 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 + application/x-compress | 272 + application/zip | 131 + application/CDFV2-unknown | 99 + image/png | 88 + application/mac-binhex40 | 79 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + application/vnd.ms-excel | 28 + message/rfc822 | 26 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62271 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 113880640 | 141793694 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 131346703 + warc/revisit | 8394443 + text/xml | 525481 + application/octet-stream | 502400 + text/html | 417579 + unk | 186703 + application/postscript | 81095 + application/save | 80915 + binary/octet-stream | 66698 + application/x-download | 35771 + text/plain | 35606 + image/pdf | 33904 + application/download | 29701 + application/force-download | 16726 + multipart/form-data | 6878 + application/x-msdownload | 3843 + application | 3724 + application/x-octetstream | 3550 + .pdf | 3138 + application/x-pdf | 2780 + application/binary | 1332 + pdf | 1247 + file/unknown | 1200 + application/pdf' | 1192 + file | 1108 + application/unknown | 978 + application/octetstream | 856 + application/blob | 673 + text/pdf | 672 + 0 | 546 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 105594307 | 19594878 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+---------- + 200 | 97714631 + 500 | 7875192 + -4 | 4772 + 503 | 520 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 84822508 + | 12892147 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 31772942 + pdf | doi | 23528817 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2277417 + pdf | arxiv | 2143549 + xml | doaj | 9442 + html | doi | 3022 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 469 + html | spn | 9 + (14 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 31772942 + pdf | doi | fatcat-changelog | 11010764 + pdf | doi | fatcat-ingest | 9002119 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767703 + pdf | arxiv | fatcat-changelog | 375818 + pdf | pmc | fatcat-ingest | 211264 + pdf | pmc | fatcat-changelog | 37328 + xml | doaj | doaj | 9442 + html | doi | fatcat-ingest | 3018 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 469 + pdf | doi | savepapernow-web | 74 + pdf | arxiv | fatcat-ingest-container | 26 + html | spn | savepapernow-web | 9 + html | doi | savepapernow-web | 4 + pdf | arxiv | savepapernow-web | 2 + (23 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 168462 + pdf | oai | 15286 + pdf | doaj | 2068 + html | doaj | 620 + pdf | unpaywall | 13 + (5 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14163500 | 0.277 + pdf | mag | 35015357 | 24818176 | 0.709 + pdf | unpaywall | 31772942 | 25018501 | 0.787 + pdf | doi | 23529041 | 5773728 | 0.245 + pdf | doaj | 4264610 | 2851328 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2277417 | 1736491 | 0.762 + pdf | arxiv | 2143549 | 2011378 | 0.938 + xml | doaj | 9442 | 6897 | 0.730 + html | doi | 3022 | 957 | 0.317 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 469 | 328 | 0.699 + html | spn | 9 | 2 | 0.222 + (14 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+--------------------------------+---------- + pdf | success | 66487928 + pdf | no-pdf-link | 29279677 + pdf | no-capture | 22765431 + pdf | redirect-loop | 9155767 + pdf | terminal-bad-status | 3549665 + pdf | link-loop | 2592983 + html | wrong-scope | 1088793 + pdf | wrong-mimetype | 792563 + pdf | gateway-timeout | 478181 + html | no-capture | 423917 + pdf | wayback-content-error | 355828 + pdf | cdx-error | 343862 + pdf | null-body | 328774 + pdf | forbidden | 286647 + pdf | spn2-cdx-lookup-failure | 276769 + pdf | spn2-wayback-error | 276080 + pdf | skip-url-blocklist | 265473 + html | redirect-loop | 212916 + pdf | not-found | 204367 + html | unknown-scope | 204112 + html | html-resource-no-capture | 166034 + pdf | blocked-cookie | 160336 + pdf | too-many-redirects | 152984 + html | success | 123896 + pdf | wayback-error | 114388 + html | null-body | 100296 + pdf | spn2-error:too-many-redirects | 58336 + html | wayback-content-error | 53926 + pdf | invalid-host-resolution | 37226 + pdf | petabox-error | 37177 + pdf | remote-server-error | 36439 + pdf | spn2-error | 27556 + pdf | spn2-error:proxy-error | 25486 + pdf | read-timeout | 20745 + html | wrong-mimetype | 18928 + html | terminal-bad-status | 14059 + html | petabox-error | 13533 + pdf | bad-redirect | 7535 + xml | success | 6897 + html | cdx-error | 6823 + pdf | spn2-error:bad-request | 4664 + pdf | spn2-error:unauthorized | 4391 + pdf | spn-remote-error | 4206 + pdf | spn2-error:service-unavailable | 2614 + pdf | spn2-error:job-failed | 2562 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1905 + html | spn2-cdx-lookup-failure | 1018 + pdf | redirects-exceeded | 1015 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36515867 + pdf | | 22909334 + pdf | 301 | 7969702 + html | 200 | 1653303 + pdf | 503 | 928507 + pdf | 403 | 823755 + pdf | 302 | 792842 + pdf | 400 | 462108 + html | | 426474 + pdf | 404 | 422163 + pdf | 401 | 270611 + pdf | 500 | 248675 + html | 301 | 211713 + pdf | 303 | 109686 + pdf | 410 | 50648 + pdf | 502 | 37663 + pdf | 429 | 31982 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14835 + html | 404 | 9573 + pdf | 999 | 9296 + pdf | 307 | 3972 + pdf | 308 | 3914 + html | 500 | 3625 + pdf | 202 | 3515 + xml | 200 | 2537 + pdf | 520 | 2072 + pdf | 206 | 1665 + pdf | 521 | 1075 + html | 302 | 1072 + pdf | 504 | 1000 + pdf | 412 | 476 + pdf | 300 | 434 + pdf | 505 | 429 + pdf | 406 | 393 + html | 403 | 382 + html | 503 | 378 + pdf | 421 | 298 + html | 303 | 268 + pdf | 508 | 195 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 408 | 50 + pdf | 204 | 34 + pdf | 416 | 29 + pdf | 501 | 29 + pdf | 530 | 27 + pdf | 507 | 21 + (50 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 8514315 | 6401104 + (1 row) diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt new file mode 100644 index 0000000..a8a9cd5 --- /dev/null +++ b/sql/stats/2021-04-08_table_sizes.txt @@ -0,0 +1,40 @@ + +## SQL Table Sizes + + Size: 467.23G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 26 GB | 76 GB + "public"."grobid" | 69 GB | 6834 MB | 75 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."ingest_request" | 39 GB | 32 GB | 70 GB + "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB + "public"."file_meta" | 32 GB | 21 GB | 53 GB + "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + (12 rows) + diff --git a/sql/stats/2021-04-12_ingest_domain_summary_30d.txt b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt new file mode 100644 index 0000000..6811b54 --- /dev/null +++ b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt @@ -0,0 +1,345 @@ + domain | status | count +---------------------------------------+-------------------------+-------- + academic.oup.com | | 4105 + academic.oup.com | spn2-wayback-error | 1393 + academic.oup.com | link-loop | 1025 + academic.oup.com | no-pdf-link | 1020 + academic.oup.com | spn2-cdx-lookup-failure | 512 + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + acp.copernicus.org | | 620 + acp.copernicus.org | success | 537 + aip.scitation.org | | 1310 + aip.scitation.org | blocked-cookie | 1192 + alustath.uobaghdad.edu.iq | | 697 + alustath.uobaghdad.edu.iq | success | 550 + apex.ipk-gatersleben.de | | 1253 + apex.ipk-gatersleben.de | no-pdf-link | 1132 + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + arxiv.org | | 14990 + arxiv.org | success | 12899 + arxiv.org | spn2-wayback-error | 1592 + ashpublications.org | | 563 + asmedigitalcollection.asme.org | | 3990 + asmedigitalcollection.asme.org | spn2-cdx-lookup-failure | 1570 + asmedigitalcollection.asme.org | no-pdf-link | 1449 + asmedigitalcollection.asme.org | link-loop | 734 + assets.researchsquare.com | | 8217 + assets.researchsquare.com | success | 7116 + assets.researchsquare.com | spn2-wayback-error | 946 + av.tib.eu | | 526 + bioone.org | | 588 + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + boris.unibe.ch | | 1420 + boris.unibe.ch | success | 743 + brill.com | | 1773 + brill.com | link-loop | 879 + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + classiques-garnier.com | | 1072 + classiques-garnier.com | success | 807 + content.iospress.com | | 793 + content.iospress.com | link-loop | 568 + cyberdoi.ru | | 775 + cyberdoi.ru | redirect-loop | 775 + cyberleninka.ru | | 1453 + cyberleninka.ru | success | 1092 + d197for5662m48.cloudfront.net | | 632 + d197for5662m48.cloudfront.net | success | 544 + dergipark.org.tr | | 3070 + dergipark.org.tr | success | 1251 + dergipark.org.tr | no-pdf-link | 843 + dergipark.org.tr | spn2-wayback-error | 677 + digi.ub.uni-heidelberg.de | | 502 + dione.lib.unipi.gr | | 783 + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + dl.acm.org | | 1692 + dl.acm.org | blocked-cookie | 1558 + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + doi.nrct.go.th | | 566 + doi.org | | 10408 + doi.org | spn2-cdx-lookup-failure | 9593 + doi.org | terminal-bad-status | 741 + downloads.hindawi.com | | 2137 + downloads.hindawi.com | success | 1787 + dram.journals.ekb.eg | | 541 + elib.spbstu.ru | | 1243 + elib.spbstu.ru | redirect-loop | 1214 + elibrary.vdi-verlag.de | | 1542 + elibrary.vdi-verlag.de | spn2-wayback-error | 721 + elifesciences.org | | 689 + elifesciences.org | success | 521 + epos.myesr.org | | 705 + epos.myesr.org | spn2-wayback-error | 604 + europepmc.org | | 6996 + europepmc.org | success | 6031 + europepmc.org | spn2-wayback-error | 756 + figshare.com | | 1168 + figshare.com | no-pdf-link | 726 + files.osf.io | | 1526 + files.osf.io | success | 1078 + fjfsdata01prod.blob.core.windows.net | | 5410 + fjfsdata01prod.blob.core.windows.net | success | 4581 + fjfsdata01prod.blob.core.windows.net | spn2-wayback-error | 587 + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + hcommons.org | | 1593 + hcommons.org | success | 1333 + hkvalidate.perfdrive.com | | 1322 + hkvalidate.perfdrive.com | no-pdf-link | 1083 + ieeexplore.ieee.org | | 20997 + ieeexplore.ieee.org | too-many-redirects | 15383 + ieeexplore.ieee.org | spn2-wayback-error | 2555 + ieeexplore.ieee.org | success | 2165 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 747 + jamanetwork.com | | 712 + journals.aps.org | | 1698 + journals.aps.org | not-found | 1469 + journals.library.ualberta.ca | | 733 + journals.library.ualberta.ca | success | 594 + journals.lww.com | | 6606 + journals.lww.com | link-loop | 3102 + journals.lww.com | spn2-wayback-error | 1645 + journals.lww.com | terminal-bad-status | 965 + journals.lww.com | spn2-cdx-lookup-failure | 552 + journals.openedition.org | | 4594 + journals.openedition.org | success | 1441 + journals.openedition.org | redirect-loop | 1316 + journals.openedition.org | spn2-wayback-error | 1197 + journals.ub.uni-heidelberg.de | | 1039 + journals.ub.uni-heidelberg.de | success | 728 + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + library.iated.org | | 1560 + library.iated.org | redirect-loop | 1148 + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + mr.crossref.org | | 542 + nsuworks.nova.edu | | 843 + nsuworks.nova.edu | success | 746 + ojs.cvut.cz | | 805 + ojs.cvut.cz | success | 764 + ojs.ugent.be | | 867 + ojs.ugent.be | success | 643 + onepetro.org | | 603 + onlinelibrary.wiley.com | | 1203 + onlinelibrary.wiley.com | blocked-cookie | 758 + open.library.ubc.ca | | 559 + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + oxford.universitypressscholarship.com | | 3556 + oxford.universitypressscholarship.com | link-loop | 2373 + oxford.universitypressscholarship.com | spn2-wayback-error | 562 + painphysicianjournal.com | | 804 + painphysicianjournal.com | success | 668 + papers.ssrn.com | | 6367 + papers.ssrn.com | link-loop | 3865 + papers.ssrn.com | spn2-wayback-error | 1106 + papers.ssrn.com | spn2-cdx-lookup-failure | 1015 + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + pos.sissa.it | | 1455 + pos.sissa.it | success | 1153 + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + publikationen.uni-tuebingen.de | | 659 + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + pubs.acs.org | | 5860 + pubs.acs.org | blocked-cookie | 5185 + pubs.rsc.org | | 2269 + pubs.rsc.org | link-loop | 1384 + res.mdpi.com | | 15776 + res.mdpi.com | success | 13710 + res.mdpi.com | spn2-wayback-error | 1424 + res.mdpi.com | spn2-cdx-lookup-failure | 641 + rrs.scholasticahq.com | | 1078 + rrs.scholasticahq.com | success | 803 + rsdjournal.org | | 755 + rsdjournal.org | success | 524 + s3-eu-west-1.amazonaws.com | | 3343 + s3-eu-west-1.amazonaws.com | success | 2893 + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + sage.figshare.com | | 725 + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + scholarworks.umass.edu | | 1196 + scholarworks.umass.edu | success | 713 + secure.jbs.elsevierhealth.com | | 4202 + secure.jbs.elsevierhealth.com | blocked-cookie | 4169 + storage.googleapis.com | | 1720 + storage.googleapis.com | success | 1466 + tandf.figshare.com | | 789 + tandf.figshare.com | no-pdf-link | 640 + tind-customer-agecon.s3.amazonaws.com | | 584 + turcomat.org | | 1196 + turcomat.org | spn2-wayback-error | 997 + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + utpjournals.press | | 669 + utpjournals.press | blocked-cookie | 616 + watermark.silverchair.com | | 3560 + watermark.silverchair.com | success | 2788 + watermark.silverchair.com | spn2-wayback-error | 685 + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + www.ahajournals.org | | 802 + www.ahajournals.org | blocked-cookie | 597 + www.ajol.info | | 830 + www.ajol.info | success | 575 + www.ams.org | | 868 + www.ams.org | terminal-bad-status | 666 + www.atlantis-press.com | | 1579 + www.atlantis-press.com | success | 1071 + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + www.brazilianjournals.com | | 1385 + www.brazilianjournals.com | success | 1107 + www.cairn.info | | 2479 + www.cairn.info | no-pdf-link | 818 + www.cairn.info | link-loop | 790 + www.cambridge.org | | 6801 + www.cambridge.org | no-pdf-link | 2990 + www.cambridge.org | spn2-wayback-error | 1475 + www.cambridge.org | link-loop | 940 + www.cambridge.org | success | 863 + www.cureus.com | | 538 + www.dbpia.co.kr | | 2958 + www.dbpia.co.kr | redirect-loop | 2953 + www.degruyter.com | | 58612 + www.degruyter.com | no-pdf-link | 41065 + www.degruyter.com | spn2-wayback-error | 7426 + www.degruyter.com | success | 6628 + www.degruyter.com | spn2-cdx-lookup-failure | 1624 + www.degruyter.com | terminal-bad-status | 1565 + www.dovepress.com | | 869 + www.dovepress.com | success | 597 + www.e-manuscripta.ch | | 1047 + www.e3s-conferences.org | | 817 + www.e3s-conferences.org | success | 606 + www.elgaronline.com | | 535 + www.elibrary.ru | | 1244 + www.elibrary.ru | no-pdf-link | 1159 + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + www.eurekaselect.com | | 540 + www.eurosurveillance.org | | 786 + www.eurosurveillance.org | success | 710 + www.finersistemas.com | | 1220 + www.finersistemas.com | success | 1214 + www.frontiersin.org | | 915 + www.frontiersin.org | spn2-wayback-error | 602 + www.hanspub.org | | 618 + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + www.ijcmas.com | | 513 + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + www.indianjournals.com | | 904 + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + www.journals.uchicago.edu | | 6055 + www.journals.uchicago.edu | blocked-cookie | 5927 + www.journals.vu.lt | | 791 + www.journals.vu.lt | success | 545 + www.jstage.jst.go.jp | | 1490 + www.jstage.jst.go.jp | remote-server-error | 1023 + www.jstor.org | | 1103 + www.jstor.org | redirect-loop | 553 + www.karger.com | | 733 + www.liebertpub.com | | 804 + www.liebertpub.com | blocked-cookie | 714 + www.liverpooluniversitypress.co.uk | | 620 + www.liverpooluniversitypress.co.uk | too-many-redirects | 529 + www.mdpi.com | | 3880 + www.mdpi.com | spn2-wayback-error | 1651 + www.mdpi.com | forbidden | 1282 + www.mdpi.com | spn2-cdx-lookup-failure | 714 + www.nepjol.info | | 596 + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + www.osti.gov | | 1147 + www.osti.gov | link-loop | 902 + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + www.preprints.org | | 783 + www.preprints.org | success | 595 + www.repository.cam.ac.uk | | 1146 + www.research-collection.ethz.ch | | 704 + www.research-collection.ethz.ch | terminal-bad-status | 684 + www.researchsquare.com | | 853 + www.researchsquare.com | spn2-wayback-error | 515 + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + www.scielo.br | | 1777 + www.scielo.br | success | 1167 + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + www.scilook.eu | | 812 + www.scilook.eu | success | 563 + www.scirp.org | | 749 + www.tandfonline.com | | 11038 + www.tandfonline.com | blocked-cookie | 9994 + www.tandfonline.com | no-pdf-link | 663 + www.taylorfrancis.com | | 71514 + www.taylorfrancis.com | spn2-wayback-error | 36663 + www.taylorfrancis.com | no-pdf-link | 15098 + www.taylorfrancis.com | forbidden | 8699 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 6894 + www.taylorfrancis.com | link-loop | 3661 + www.thieme-connect.de | | 3687 + www.thieme-connect.de | redirect-loop | 1187 + www.thieme-connect.de | not-found | 945 + www.thieme-connect.de | no-pdf-link | 941 + www.worldscientific.com | | 1476 + www.worldscientific.com | blocked-cookie | 1323 + www.zora.uzh.ch | | 1118 + zenodo.org | | 43010 + zenodo.org | no-pdf-link | 22015 + zenodo.org | success | 12747 + zenodo.org | spn2-wayback-error | 4608 + zenodo.org | spn2-cdx-lookup-failure | 3215 + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 +(341 rows) + diff --git a/sql/stats/2021-11-01_table_sizes.txt b/sql/stats/2021-11-01_table_sizes.txt new file mode 100644 index 0000000..57f7e57 --- /dev/null +++ b/sql/stats/2021-11-01_table_sizes.txt @@ -0,0 +1,19 @@ + +Size: 832.66G + + table_name | table_size | indexes_size | total_size +-------------------------------+------------+--------------+------------ + "public"."crossref" | 311 GB | 9812 MB | 320 GB + "public"."ingest_request" | 44 GB | 40 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6952 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 54 GB + "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB +(13 rows) diff --git a/sql/stats/2021-11-26_stats.txt b/sql/stats/2021-11-26_stats.txt new file mode 100644 index 0000000..3a0e561 --- /dev/null +++ b/sql/stats/2021-11-26_stats.txt @@ -0,0 +1,424 @@ + +Date: Sat 27 Nov 2021 03:33:30 AM UTC + +## SQL Table Sizes + + Size: 937.28G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 393 GB | 10127 MB | 403 GB + "public"."ingest_request" | 44 GB | 41 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6963 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5869 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1690 MB | 21 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 179761501 | 244453538203113 + + # 179m files, 244 TB + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 179376819 + application/octet-stream | 155379 + text/html | 116102 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 794 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 303 + application/x-compress | 272 + application/zip | 131 + image/png | 121 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 57 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + video/mp4 | 29 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62196 + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 119049962 | 149169240 + + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 137271670 + warc/revisit | 9709493 + application/octet-stream | 590443 + text/xml | 525481 + text/html | 421030 + unk | 207442 + application/postscript | 81123 + application/save | 80988 + binary/octet-stream | 67476 + image/pdf | 39419 + application/x-download | 38278 + text/plain | 36159 + application/download | 34328 + application/force-download | 19729 + multipart/form-data | 9105 + application | 5299 + application/x-msdownload | 3851 + application/x-octetstream | 3649 + .pdf | 3318 + application/x-pdf | 2992 + pdf | 1484 + file | 1364 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1029 + application/unknown | 1000 + 0 | 764 + text/pdf | 704 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 111236904 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 102962304 + 500 | 8269129 + -4 | 5013 + 503 | 548 + +TODO: how many failed, by mimetype? to check if we are (or have) run non-PDF +files through by mistake + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.5.5-fatcat | 89983404 + | 12892161 + 0.7.0-104-gbeebd9a6b | 86739 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 43701948 + pdf | unpaywall | 37802895 + pdf | doi | 28736398 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2383398 + pdf | arxiv | 2330054 + html | doi | 39725 + xml | doaj | 9442 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 689 + html | spn | 48 + xml | spn | 1 + (15 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 43701948 + pdf | unpaywall | unpaywall | 37802895 + pdf | doi | fatcat-changelog | 16207728 + pdf | doi | fatcat-ingest | 9012282 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767705 + pdf | arxiv | fatcat-changelog | 562320 + pdf | pmc | fatcat-ingest | 297527 + pdf | pmc | fatcat-changelog | 57046 + html | doi | fatcat-ingest | 37788 + xml | doaj | doaj | 9442 + pdf | cnki_covid19 | scrape-covid19 | 2034 + html | doi | fatcat-changelog | 1897 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 689 + pdf | doi | savepapernow-web | 613 + html | spn | savepapernow-web | 48 + html | doi | savepapernow-web | 40 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | spn | savepapernow-web | 1 + (25 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 169076 + pdf | oai | 15283 + pdf | doaj | 2063 + html | doaj | 620 + pdf | doi | 22 + pdf | unpaywall | 17 + + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14554221 | 0.284 + pdf | mag | 43701948 | 32643175 | 0.747 + pdf | unpaywall | 37802895 | 29989257 | 0.793 + pdf | doi | 28736547 | 7690393 | 0.268 + pdf | doaj | 4264610 | 2851601 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2383398 | 1821071 | 0.764 + pdf | arxiv | 2330054 | 2159738 | 0.927 + html | doi | 39725 | 1235 | 0.031 + xml | doaj | 9442 | 6897 | 0.730 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 689 | 503 | 0.730 + html | spn | 48 | 5 | 0.104 + xml | spn | 1 | 0 | 0.000 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 78944243 + pdf | no-pdf-link | 26270027 + pdf | no-capture | 23267156 + pdf | redirect-loop | 9837466 + pdf | terminal-bad-status | 4147454 + pdf | skip-url-blocklist | 3088907 + pdf | link-loop | 2953891 + pdf | blocked-cookie | 1855541 + html | wrong-scope | 1106171 + pdf | wrong-mimetype | 859941 + pdf | gateway-timeout | 729771 + pdf | spn2-cdx-lookup-failure | 584856 + html | no-capture | 423917 + pdf | forbidden | 390804 + pdf | cdx-error | 363091 + pdf | wayback-content-error | 354894 + pdf | null-body | 341698 + pdf | too-many-redirects | 307096 + pdf | not-found | 294592 + html | redirect-loop | 213032 + html | unknown-scope | 207923 + pdf | spn2-error | 192046 + html | html-resource-no-capture | 166119 + html | success | 124177 + pdf | wayback-error | 105385 + html | null-body | 100296 + pdf | spn2-wayback-error | 73176 + pdf | remote-server-error | 60908 + pdf | spn2-error:too-many-redirects | 58076 + pdf | skip-wall | 57744 + html | wayback-content-error | 53928 + pdf | read-timeout | 42465 + pdf | invalid-host-resolution | 37221 + pdf | petabox-error | 28765 + pdf | spn2-error:unknown | 23885 + html | wrong-mimetype | 18930 + pdf | bad-redirect | 14708 + html | terminal-bad-status | 14070 + html | petabox-error | 13770 + html | spn2-cdx-lookup-failure | 13002 + pdf | spn2-error:job-failed | 9721 + html | cdx-error | 7167 + xml | success | 6897 + pdf | spn2-error:bad-request | 4433 + pdf | spn-remote-error | 4206 + pdf | body-too-large | 3019 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1900 + pdf | spn2-error:proxy-error | 1850 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36821458 + pdf | | 26058729 + pdf | 301 | 8466302 + html | 200 | 1676730 + pdf | 503 | 1028504 + pdf | 302 | 949465 + pdf | 403 | 936737 + pdf | 404 | 687661 + pdf | 400 | 507303 + html | | 439356 + pdf | 401 | 288994 + pdf | 500 | 263775 + html | 301 | 211796 + pdf | 303 | 130719 + pdf | 410 | 66495 + pdf | 502 | 41760 + pdf | 429 | 35266 + pdf | 420 | 26722 + pdf | 409 | 15204 + pdf | 509 | 15113 + pdf | 999 | 11409 + html | 404 | 9578 + pdf | 307 | 8404 + pdf | 308 | 5514 + pdf | 202 | 4724 + html | 500 | 3628 + xml | 200 | 2537 + pdf | 520 | 2199 + pdf | 206 | 1694 + html | 302 | 1138 + pdf | 504 | 1124 + pdf | 521 | 1085 + pdf | 412 | 921 + pdf | 421 | 714 + pdf | 300 | 461 + pdf | 505 | 436 + pdf | 406 | 427 + pdf | 508 | 408 + html | 403 | 382 + html | 503 | 378 + html | 303 | 268 + pdf | 204 | 252 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 523 | 55 + pdf | 408 | 53 + pdf | 432 | 45 + pdf | 530 | 31 + pdf | 416 | 31 + (50 rows) diff --git a/sql/stats/2021-12-02_table_sizes.txt b/sql/stats/2021-12-02_table_sizes.txt new file mode 100644 index 0000000..b03c370 --- /dev/null +++ b/sql/stats/2021-12-02_table_sizes.txt @@ -0,0 +1,22 @@ + +Size: 940.66G + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 394 GB | 10138 MB | 404 GB + "public"."ingest_request" | 44 GB | 41 GB | 85 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6978 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1752 MB | 21 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes +(16 rows) diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt new file mode 100644 index 0000000..bd20c5c --- /dev/null +++ b/sql/stats/2022-04-26_stats.txt @@ -0,0 +1,432 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 416 GB | 10 GB | 426 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 58 GB | 41 GB | 99 GB + "public"."ingest_request" | 50 GB | 48 GB | 98 GB + "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 37 GB | 34 GB | 71 GB + "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB + "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 3015 MB | 31 MB | 3046 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 192402128 | 271919997557597 + (1 row) + + # 271,919,997,557,597 -> ~272 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 191760695 + text/html | 330351 + application/octet-stream | 186696 + application/xml | 42170 + application/xhtml+xml | 31470 + text/plain | 16449 + application/jats+xml | 6902 + application/gzip | 6681 + | 6033 + application/postscript | 4916 + image/jpeg | 2901 + application/vnd.ms-powerpoint | 1672 + application/msword | 934 + application/x-bzip2 | 891 + image/png | 476 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294 + application/x-compress | 274 + video/mp4 | 150 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12831 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 130732381 | 162760251 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 149749828 + warc/revisit | 10437210 + application/octet-stream | 733161 + text/html | 642992 + text/xml | 525483 + unk | 217642 + application/postscript | 81127 + application/save | 81023 + binary/octet-stream | 67938 + application/x-download | 41137 + image/pdf | 39712 + application/download | 37153 + text/plain | 36342 + application/force-download | 21496 + multipart/form-data | 9792 + application | 5366 + application/x-octetstream | 5166 + application/x-msdownload | 3851 + .pdf | 3445 + application/x-pdf | 3018 + pdf | 1618 + file | 1370 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1047 + application/unknown | 1001 + 0 | 773 + text/pdf | 729 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 123669603 + (1 row) + + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 115668412 + 500 | 7995428 + -4 | 5745 + 503 | 18 + (4 rows) + + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 54780825 + 0.5.5-fatcat | 48003940 + | 12694404 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | mag | 43701948 + pdf | doi | 40044585 + pdf | doaj | 6016771 + html | doaj | 3648181 + pdf | arxiv | 2676200 + pdf | pmc | 2402453 + html | doi | 41492 + xml | doaj | 20638 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 829 + html | spn | 52 + xml | doi | 1 + xml | spn | 1 + (16 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 20936949 + pdf | doi | fatcat-ingest | 15590201 + pdf | doaj | doaj | 6016771 + html | doaj | doaj | 3648181 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 691405 + pdf | pmc | fatcat-ingest | 297646 + pdf | pmc | fatcat-changelog | 75982 + html | doi | fatcat-ingest | 37904 + xml | doaj | doaj | 20638 + html | doi | fatcat-changelog | 3534 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1562 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 829 + html | doi | savepapernow-web | 54 + html | spn | savepapernow-web | 52 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | doi | savepapernow-web | 1 + xml | spn | savepapernow-web | 1 + (26 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doaj | 1619621 + html | doaj | 1208412 + pdf | mag | 167653 + pdf | oai | 15282 + xml | doaj | 11196 + pdf | unpaywall | 270 + pdf | doi | 22 + (7 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 15968290 | 0.312 + pdf | unpaywall | 43932525 | 32618045 | 0.742 + pdf | mag | 43701948 | 32662926 | 0.747 + pdf | doi | 40044738 | 10925369 | 0.273 + pdf | doaj | 6016771 | 3042569 | 0.506 + html | doaj | 3648181 | 344208 | 0.094 + pdf | arxiv | 2676206 | 2269708 | 0.848 + pdf | pmc | 2402453 | 1855679 | 0.772 + html | doi | 41492 | 1739 | 0.042 + xml | doaj | 20638 | 6899 | 0.334 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 829 | 616 | 0.743 + html | spn | 52 | 7 | 0.135 + xml | doi | 1 | 0 | 0.000 + xml | spn | 1 | 0 | 0.000 + (16 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+---------------------------------+---------- + pdf | success | 85709322 + pdf | no-pdf-link | 29713304 + pdf | no-capture | 26632191 + pdf | redirect-loop | 10979145 + pdf | terminal-bad-status | 4977000 + pdf | link-loop | 3434877 + pdf | skip-url-blocklist | 3114258 + pdf | blocked-cookie | 2156835 + html | wrong-scope | 1126911 + pdf | wrong-mimetype | 980546 + pdf | gateway-timeout | 651562 + pdf | spn2-cdx-lookup-failure | 484016 + pdf | spn2-backoff | 399382 + pdf | cdx-error | 373964 + pdf | wayback-content-error | 354370 + html | success | 345860 + pdf | null-body | 336182 + pdf | spn2-error:500 | 309755 + pdf | forbidden | 291175 + pdf | not-found | 275560 + pdf | too-many-redirects | 262312 + html | unknown-scope | 230352 + html | redirect-loop | 226596 + html | html-resource-no-capture | 205646 + html | no-capture | 164014 + component | spn2-cdx-lookup-failure | 148825 + component | wrong-mimetype | 130344 + html | null-body | 100296 + pdf | wayback-error | 94286 + pdf | spn2-wayback-error | 81365 + component | no-capture | 75278 + pdf | spn2-error | 69830 + pdf | skip-wall | 57744 + pdf | spn2-error:too-many-redirects | 53808 + pdf | remote-server-error | 41286 + pdf | petabox-error | 38800 + pdf | invalid-host-resolution | 37337 + pdf | read-timeout | 36872 + component | spn2-backoff | 33217 + pdf | empty-blob | 27946 + component | spn2-error | 24078 + pdf | spn2-error:unknown | 23697 + component | gateway-timeout | 23139 + html | wrong-mimetype | 22731 + html | wayback-content-error | 20507 + pdf | spn2-error:host-crawling-paused | 19900 + pdf | bad-redirect | 19183 + html | terminal-bad-status | 13354 + component | blocked-cookie | 12287 + component | spn2-error:500 | 11271 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 38144779 + pdf | | 32762240 + pdf | 301 | 9433087 + html | 200 | 1716127 + pdf | 403 | 1416632 + pdf | 302 | 1134668 + pdf | 404 | 888853 + pdf | 401 | 746311 + pdf | 503 | 655894 + pdf | 400 | 531479 + component | | 337603 + pdf | 500 | 247944 + html | 301 | 224237 + html | | 167194 + pdf | 303 | 135048 + component | 200 | 130663 + pdf | 429 | 93489 + pdf | 410 | 67392 + pdf | 420 | 26722 + pdf | 502 | 18770 + pdf | 409 | 15152 + pdf | 509 | 15113 + pdf | 999 | 11747 + html | 404 | 9879 + pdf | 307 | 8895 + pdf | 412 | 7053 + pdf | 308 | 6627 + pdf | 202 | 5289 + xml | 200 | 2540 + html | 500 | 2480 + pdf | 520 | 2220 + pdf | 521 | 1844 + pdf | 206 | 1739 + html | 302 | 1407 + pdf | 504 | 1146 + html | 303 | 1123 + pdf | 421 | 986 + pdf | 406 | 938 + pdf | 204 | 498 + pdf | 505 | 468 + pdf | 300 | 436 + pdf | 508 | 422 + pdf | 426 | 405 + html | 429 | 402 + html | 403 | 398 + pdf | 432 | 366 + component | 301 | 294 + pdf | 405 | 210 + pdf | 226 | 166 + component | 302 | 128 + (50 rows) + diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt new file mode 100644 index 0000000..864abd4 --- /dev/null +++ b/sql/stats/2022-04-27_crawl_changelog.txt @@ -0,0 +1,191 @@ + domain | status | count +--------------------------------------+-------------------------+-------- + academic.oup.com | | 1243 + academic.oup.com | spn2-cdx-lookup-failure | 990 + aip.scitation.org | | 313 + aip.scitation.org | spn2-cdx-lookup-failure | 224 + ajps.uomustansiriyah.edu.iq | | 235 + apps.crossref.org | | 1329 + apps.crossref.org | spn2-cdx-lookup-failure | 942 + apps.crossref.org | no-pdf-link | 387 + archaeologydataservice.ac.uk | | 422 + archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289 + arxiv.org | | 3512 + arxiv.org | spn2-cdx-lookup-failure | 2319 + arxiv.org | success | 1177 + assets.researchsquare.com | | 571 + assets.researchsquare.com | spn2-cdx-lookup-failure | 322 + assets.researchsquare.com | success | 249 + brill.com | | 397 + brill.com | spn2-cdx-lookup-failure | 265 + cla.berkeley.edu | | 239 + classiques-garnier.com | | 249 + cyberleninka.ru | | 340 + cyberleninka.ru | spn2-cdx-lookup-failure | 244 + dergipark.org.tr | | 468 + dergipark.org.tr | spn2-cdx-lookup-failure | 333 + dl.acm.org | | 592 + dl.acm.org | spn2-cdx-lookup-failure | 470 + doi.ala.org.au | | 288 + doi.ala.org.au | spn2-cdx-lookup-failure | 220 + doi.org | | 1107 + doi.org | terminal-bad-status | 679 + doi.org | spn2-cdx-lookup-failure | 415 + downloads.hindawi.com | | 279 + downloads.hindawi.com | success | 267 + edbs.uomustansiriyah.edu.iq | | 294 + edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209 + elibrary.kdpu.edu.ua | | 320 + elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233 + elibrary.ru | | 722 + elibrary.ru | spn2-cdx-lookup-failure | 505 + europepmc.org | | 986 + europepmc.org | spn2-cdx-lookup-failure | 681 + europepmc.org | success | 291 + figshare.com | | 377 + figshare.com | spn2-cdx-lookup-failure | 328 + fjfsdata01prod.blob.core.windows.net | | 255 + fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216 + hammer.purdue.edu | | 224 + ieeexplore.ieee.org | | 3904 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654 + ieeexplore.ieee.org | gateway-timeout | 792 + ieeexplore.ieee.org | spn2-backoff | 419 + journals.eco-vector.com | | 428 + journals.eco-vector.com | spn2-cdx-lookup-failure | 306 + journals.lww.com | | 727 + journals.lww.com | spn2-cdx-lookup-failure | 622 + journals.openedition.org | | 806 + journals.openedition.org | spn2-cdx-lookup-failure | 554 + journals.plos.org | | 348 + journals.plos.org | spn2-cdx-lookup-failure | 244 + kiss.kstudy.com | | 226 + kluwerlawonline.com | | 723 + kluwerlawonline.com | spn2-cdx-lookup-failure | 489 + kluwerlawonline.com | link-loop | 203 + linkinghub.elsevier.com | | 401 + linkinghub.elsevier.com | spn2-backoff | 342 + mdpi-res.com | | 1463 + mdpi-res.com | success | 1337 + muse.jhu.edu | | 346 + muse.jhu.edu | spn2-cdx-lookup-failure | 253 + onepetro.org | | 363 + onepetro.org | spn2-cdx-lookup-failure | 284 + online.ucpress.edu | | 1620 + online.ucpress.edu | spn2-cdx-lookup-failure | 1511 + onlinelibrary.wiley.com | | 2913 + onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109 + onlinelibrary.wiley.com | terminal-bad-status | 787 + opendata.uni-halle.de | | 519 + opendata.uni-halle.de | spn2-cdx-lookup-failure | 343 + osf.io | | 1554 + osf.io | spn2-cdx-lookup-failure | 1350 + papers.ssrn.com | | 2207 + papers.ssrn.com | spn2-cdx-lookup-failure | 1727 + papers.ssrn.com | link-loop | 457 + psycharchives.org | | 384 + psycharchives.org | spn2-cdx-lookup-failure | 283 + publons.com | | 493 + publons.com | spn2-cdx-lookup-failure | 348 + pubs.acs.org | | 1240 + pubs.acs.org | spn2-cdx-lookup-failure | 881 + pubs.acs.org | terminal-bad-status | 298 + pubs.rsc.org | | 603 + pubs.rsc.org | spn2-cdx-lookup-failure | 460 + repositories.lib.utexas.edu | | 1861 + repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288 + repositories.lib.utexas.edu | terminal-bad-status | 523 + s3-eu-west-1.amazonaws.com | | 216 + sage.figshare.com | | 374 + sage.figshare.com | spn2-cdx-lookup-failure | 309 + scholar.dkyobobook.co.kr | | 220 + scholarworks.gsu.edu | | 749 + scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577 + tandf.figshare.com | | 214 + www.atlantis-press.com | | 338 + www.atlantis-press.com | spn2-cdx-lookup-failure | 214 + www.cairn.info | | 782 + www.cairn.info | spn2-cdx-lookup-failure | 541 + www.cambridge.org | | 2325 + www.cambridge.org | spn2-cdx-lookup-failure | 1787 + www.cambridge.org | no-pdf-link | 300 + www.cell.com | | 213 + www.concrete.org | | 476 + www.concrete.org | spn2-cdx-lookup-failure | 340 + www.dbpia.co.kr | | 375 + www.dbpia.co.kr | spn2-cdx-lookup-failure | 275 + www.degruyter.com | | 3849 + www.degruyter.com | spn2-cdx-lookup-failure | 2969 + www.degruyter.com | no-pdf-link | 712 + www.dib.ie | | 1100 + www.dib.ie | spn2-cdx-lookup-failure | 1038 + www.e-periodica.ch | | 821 + www.e-periodica.ch | spn2-cdx-lookup-failure | 620 + www.e-periodica.ch | no-pdf-link | 201 + www.elibrary.ru | | 401 + www.elibrary.ru | spn2-cdx-lookup-failure | 281 + www.emerald.com | | 390 + www.emerald.com | spn2-cdx-lookup-failure | 275 + www.eurekaselect.com | | 275 + www.frontiersin.org | | 1266 + www.frontiersin.org | spn2-cdx-lookup-failure | 1025 + www.hanspub.org | | 229 + www.hindawi.com | | 604 + www.hindawi.com | spn2-cdx-lookup-failure | 594 + www.inderscience.com | | 201 + www.jstage.jst.go.jp | | 1094 + www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807 + www.jstage.jst.go.jp | success | 206 + www.mdpi.com | | 4340 + www.mdpi.com | spn2-cdx-lookup-failure | 4258 + www.nomos-elibrary.de | | 2749 + www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909 + www.nomos-elibrary.de | redirect-loop | 819 + www.osti.gov | | 275 + www.oxfordhandbooks.com | | 248 + www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224 + www.pdcnet.org | | 217 + www.researchsquare.com | | 483 + www.researchsquare.com | spn2-cdx-lookup-failure | 317 + www.scielo.br | | 319 + www.scielo.br | spn2-cdx-lookup-failure | 222 + www.sciencedirect.com | | 3384 + www.sciencedirect.com | spn2-cdx-lookup-failure | 3267 + www.spiedigitallibrary.org | | 441 + www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327 + www.tandfonline.com | | 2401 + www.tandfonline.com | spn2-cdx-lookup-failure | 1552 + www.tandfonline.com | no-pdf-link | 303 + www.tandfonline.com | blocked-cookie | 250 + www.taylorfrancis.com | | 1232 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 908 + www.thieme-connect.de | | 520 + www.thieme-connect.de | spn2-cdx-lookup-failure | 366 + www.worldscientific.com | | 383 + www.worldscientific.com | spn2-cdx-lookup-failure | 276 + zenodo.org | | 10625 + zenodo.org | spn2-cdx-lookup-failure | 7777 + zenodo.org | success | 1574 + zenodo.org | no-pdf-link | 1160 + zivahub.uct.ac.za | | 3428 + zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845 + zivahub.uct.ac.za | no-pdf-link | 583 + | | 130491 + | spn2-cdx-lookup-failure | 95169 + | success | 13354 + | no-pdf-link | 9621 + | terminal-bad-status | 3385 + | spn2-backoff | 2396 + | redirect-loop | 2216 + | link-loop | 1850 + | gateway-timeout | 1061 + | spn2-error:blocked-url | 428 + | blocked-cookie | 415 + | spn2-error | 246 +(182 rows) + +---- + +The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a +week or two, when crawling and retries are running smoothly, and see what +things look like then. diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt new file mode 100644 index 0000000..8d98217 --- /dev/null +++ b/sql/stats/2022-05-11_crawl_changelog.txt @@ -0,0 +1,410 @@ + domain | status | count +-----------------------------------------------------------------+-------------------------+-------- + academic.oup.com | | 2210 + academic.oup.com | no-pdf-link | 1350 + academic.oup.com | bad-redirect | 510 + academiccommons.columbia.edu | | 379 + academiccommons.columbia.edu | success | 339 + aip.scitation.org | | 762 + aip.scitation.org | terminal-bad-status | 430 + apps.crossref.org | | 9894 + apps.crossref.org | no-pdf-link | 9886 + apps.euskadi.eus | | 242 + apps.euskadi.eus | no-pdf-link | 240 + arxiv.org | | 44889 + arxiv.org | success | 28781 + arxiv.org | spn2-backoff | 7975 + arxiv.org | terminal-bad-status | 4508 + arxiv.org | spn2-cdx-lookup-failure | 2010 + arxiv.org | redirect-loop | 619 + arxiv.org | no-pdf-link | 242 + arxiv.org | spn2-error | 236 + asa.scitation.org | | 356 + asa.scitation.org | terminal-bad-status | 299 + asmedigitalcollection.asme.org | | 240 + assets.cureus.com | | 336 + assets.cureus.com | success | 335 + assets.researchsquare.com | | 1042 + assets.researchsquare.com | success | 993 + av.tib.eu | | 205 + av.tib.eu | no-pdf-link | 203 + bibliographie.uni-tuebingen.de | | 213 + bibliographie.uni-tuebingen.de | no-pdf-link | 211 + biorxiv.org | redirect-loop | 217 + biorxiv.org | | 217 + books.openedition.org | | 691 + books.openedition.org | no-pdf-link | 687 + boris.unibe.ch | | 525 + boris.unibe.ch | success | 466 + bridges.monash.edu | | 663 + bridges.monash.edu | no-pdf-link | 647 + brill.com | | 860 + brill.com | success | 434 + chemrxiv.org | | 201 + classiques-garnier.com | | 242 + content.iospress.com | | 325 + content.iospress.com | link-loop | 247 + core.tdar.org | | 216 + core.tdar.org | no-pdf-link | 211 + cyberleninka.ru | | 646 + cyberleninka.ru | success | 620 + d197for5662m48.cloudfront.net | | 263 + d197for5662m48.cloudfront.net | success | 262 + dergipark.org.tr | | 891 + dergipark.org.tr | success | 526 + dergipark.org.tr | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 427 + digi.ub.uni-heidelberg.de | no-pdf-link | 427 + direct.mit.edu | | 268 + direct.mit.edu | no-pdf-link | 208 + dl.acm.org | | 1719 + dl.acm.org | success | 829 + dl.acm.org | no-pdf-link | 546 + dl.acm.org | terminal-bad-status | 205 + dlc.library.columbia.edu | | 385 + dlc.library.columbia.edu | terminal-bad-status | 319 + doi.ala.org.au | | 724 + doi.ala.org.au | no-pdf-link | 721 + doi.apa.org | | 214 + doi.org | | 3390 + doi.org | terminal-bad-status | 2938 + doi.org | redirect-loop | 233 + doi.org | spn2-wayback-error | 208 + doi.usp.org | | 325 + doi.usp.org | no-pdf-link | 324 + downloads.hindawi.com | | 1439 + downloads.hindawi.com | success | 1436 + du.diva-portal.org | | 589 + du.diva-portal.org | success | 586 + econtents.bc.unicamp.br | | 310 + econtents.bc.unicamp.br | success | 310 + ediss.uni-goettingen.de | | 728 + ediss.uni-goettingen.de | success | 425 + elibrary.kdpu.edu.ua | | 907 + elibrary.kdpu.edu.ua | bad-redirect | 712 + elibrary.ru | | 925 + elibrary.ru | terminal-bad-status | 492 + elibrary.ru | bad-redirect | 230 + elibrary.vdi-verlag.de | | 393 + elifesciences.org | | 296 + elifesciences.org | success | 276 + europepmc.org | | 3024 + europepmc.org | success | 2541 + europepmc.org | terminal-bad-status | 463 + figshare.com | | 493 + figshare.com | no-pdf-link | 440 + files.osf.io | | 883 + files.osf.io | success | 686 + fjfsdata01prod.blob.core.windows.net | | 3869 + fjfsdata01prod.blob.core.windows.net | success | 3818 + ieeexplore.ieee.org | | 10854 + ieeexplore.ieee.org | gateway-timeout | 5495 + ieeexplore.ieee.org | spn2-backoff | 1662 + ieeexplore.ieee.org | no-pdf-link | 1417 + ieeexplore.ieee.org | success | 1410 + ieeexplore.ieee.org | redirect-loop | 768 + iiif.crossasia.org | | 7608 + iiif.crossasia.org | no-pdf-link | 7568 + ikee.lib.auth.gr | | 450 + ikee.lib.auth.gr | success | 332 + ins.journals.ekb.eg | | 212 + iopscience.iop.org | | 268 + jamanetwork.com | | 333 + journals.aps.org | | 414 + journals.asm.org | | 242 + journals.flvc.org | | 245 + journals.flvc.org | success | 242 + journals.healio.com | | 755 + journals.healio.com | terminal-bad-status | 668 + journals.lincoln.ac.nz | | 244 + journals.lincoln.ac.nz | success | 239 + journals.lww.com | | 1772 + journals.lww.com | link-loop | 1425 + journals.lww.com | spn2-backoff | 209 + journals.openedition.org | | 1192 + journals.openedition.org | redirect-loop | 467 + journals.openedition.org | success | 451 + journals.plos.org | | 771 + journals.plos.org | success | 750 + journals.ub.uni-heidelberg.de | | 787 + journals.ub.uni-heidelberg.de | success | 741 + kazanmedjournal.ru | | 240 + kazanmedjournal.ru | success | 231 + kiss.kstudy.com | | 219 + kiss.kstudy.com | no-pdf-link | 218 + kluwerlawonline.com | | 444 + kluwerlawonline.com | link-loop | 402 + libraetd.lib.virginia.edu | | 362 + libraetd.lib.virginia.edu | no-pdf-link | 361 + link.springer.com | | 305 + linkinghub.elsevier.com | | 568 + linkinghub.elsevier.com | spn2-backoff | 545 + ltu-figshare-repo.s3.aarnet.edu.au | | 269 + ltu-figshare-repo.s3.aarnet.edu.au | success | 268 + mausamjournal.imd.gov.in | | 202 + mdpi-res.com | | 8892 + mdpi-res.com | success | 8863 + mededpublish.org | | 1900 + mededpublish.org | no-pdf-link | 1900 + meetingorganizer.copernicus.org | | 276 + meetingorganizer.copernicus.org | no-pdf-link | 271 + muse.jhu.edu | | 1047 + muse.jhu.edu | terminal-bad-status | 755 + muse.jhu.edu | link-loop | 203 + online.ucpress.edu | | 358 + online.ucpress.edu | link-loop | 212 + onlinelibrary.wiley.com | | 5813 + onlinelibrary.wiley.com | terminal-bad-status | 4587 + onlinelibrary.wiley.com | spn2-wayback-error | 614 + onlinelibrary.wiley.com | blocked-cookie | 381 + open.library.ubc.ca | | 206 + opendata.uni-halle.de | | 1768 + opendata.uni-halle.de | success | 1215 + opendata.uni-halle.de | wrong-mimetype | 260 + opendata2.uni-halle.de | | 206 + opg.optica.org | | 205 + osf.io | | 2949 + osf.io | no-pdf-link | 2404 + osf.io | spn2-backoff | 299 + papers.ssrn.com | | 3962 + papers.ssrn.com | link-loop | 3800 + peerj.com | | 273 + preprints.jmir.org | | 275 + preprints.jmir.org | cdx-error | 255 + publikationen.bibliothek.kit.edu | | 213 + publons.com | | 593 + publons.com | no-pdf-link | 590 + pubs.acs.org | | 2288 + pubs.acs.org | terminal-bad-status | 1841 + pubs.acs.org | spn2-wayback-error | 210 + pubs.rsc.org | | 1698 + pubs.rsc.org | bad-redirect | 811 + pubs.rsc.org | link-loop | 352 + pubs.rsc.org | success | 307 + radiopaedia.org | | 220 + read.dukeupress.edu | | 303 + repositories.lib.utexas.edu | | 1570 + repositories.lib.utexas.edu | bad-redirect | 513 + repositories.lib.utexas.edu | spn2-backoff | 383 + repositories.lib.utexas.edu | gateway-timeout | 379 + repositories.lib.utexas.edu | terminal-bad-status | 282 + repository.uj.ac.za | | 489 + repository.uj.ac.za | no-pdf-link | 365 + repository.unsworks.unsw.edu.au | | 397 + repository.urosario.edu.co | | 2429 + repository.urosario.edu.co | success | 1648 + repository.urosario.edu.co | bad-redirect | 613 + rex.libraries.wsu.edu | no-pdf-link | 241 + rex.libraries.wsu.edu | | 241 + rsdjournal.org | | 208 + rsdjournal.org | success | 208 + s3-ap-southeast-2.amazonaws.com | | 282 + s3-ap-southeast-2.amazonaws.com | success | 277 + s3-eu-west-1.amazonaws.com | | 4615 + s3-eu-west-1.amazonaws.com | success | 4593 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237 + sage.figshare.com | | 415 + sage.figshare.com | no-pdf-link | 385 + scholar.dkyobobook.co.kr | | 512 + scholar.dkyobobook.co.kr | no-pdf-link | 509 + scholarlypublishingcollective.org | | 287 + scholarworks.gsu.edu | | 1132 + scholarworks.gsu.edu | success | 1000 + scholarworks.iupui.edu | | 205 + scholarworks.umass.edu | | 417 + scholarworks.umass.edu | success | 400 + sciencescholar.us | | 404 + secure.jbs.elsevierhealth.com | | 727 + secure.jbs.elsevierhealth.com | terminal-bad-status | 722 + tandf.figshare.com | | 354 + tandf.figshare.com | no-pdf-link | 342 + unsworks.unsw.edu.au | | 408 + unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342 + valep.vc.univie.ac.at | no-pdf-link | 737 + valep.vc.univie.ac.at | | 737 + watermark.silverchair.com | | 1604 + watermark.silverchair.com | success | 1598 + wayf.switch.ch | | 215 + wayf.switch.ch | no-pdf-link | 213 + www.ahajournals.org | | 438 + www.ahajournals.org | no-pdf-link | 306 + www.ahbps.org | | 316 + www.ahbps.org | success | 312 + www.atenaeditora.com.br | | 390 + www.atenaeditora.com.br | terminal-bad-status | 333 + www.atlantis-press.com | | 914 + www.atlantis-press.com | success | 901 + www.atsjournals.org | | 1245 + www.atsjournals.org | success | 1189 + www.biorxiv.org | | 712 + www.biorxiv.org | success | 670 + www.bloomsburycollections.com | | 982 + www.bloomsburycollections.com | terminal-bad-status | 566 + www.cahiers-clsl.ch | | 305 + www.cahiers-clsl.ch | success | 298 + www.cairn.info | | 1799 + www.cairn.info | no-pdf-link | 662 + www.cairn.info | link-loop | 487 + www.cairn.info | success | 355 + www.cairn.info | terminal-bad-status | 267 + www.cambridge.org | | 3258 + www.cambridge.org | no-pdf-link | 1682 + www.cambridge.org | success | 682 + www.cambridge.org | bad-redirect | 404 + www.cambridge.org | link-loop | 302 + www.dbpia.co.kr | | 763 + www.dbpia.co.kr | no-pdf-link | 443 + www.dbpia.co.kr | redirect-loop | 287 + www.degruyter.com | | 12655 + www.degruyter.com | no-pdf-link | 9112 + www.degruyter.com | success | 2898 + www.degruyter.com | spn2-backoff | 507 + www.dib.ie | | 1381 + www.dib.ie | no-pdf-link | 1378 + www.dovepress.com | | 231 + www.dovepress.com | success | 216 + www.e-manuscripta.ch | | 767 + www.e-manuscripta.ch | success | 399 + www.e-periodica.ch | | 1406 + www.e-periodica.ch | no-pdf-link | 1402 + www.e-rara.ch | no-pdf-link | 251 + www.e-rara.ch | | 251 + www.editoracientifica.org | no-pdf-link | 205 + www.editoracientifica.org | | 205 + www.elgaronline.com | | 427 + www.elibrary.ru | | 616 + www.elibrary.ru | terminal-bad-status | 364 + www.elibrary.ru | no-pdf-link | 216 + www.emerald.com | | 862 + www.emerald.com | no-pdf-link | 724 + www.endocrine-abstracts.org | | 1907 + www.endocrine-abstracts.org | no-pdf-link | 1905 + www.eurekaselect.com | | 285 + www.eurekaselect.com | link-loop | 246 + www.even3.com.br | | 233 + www.frontiersin.org | | 585 + www.frontiersin.org | spn2-backoff | 436 + www.humankineticslibrary.com | no-pdf-link | 207 + www.humankineticslibrary.com | | 207 + www.igi-global.com | | 1600 + www.igi-global.com | no-pdf-link | 1199 + www.igi-global.com | bad-redirect | 258 + www.inderscience.com | | 385 + www.inderscience.com | no-pdf-link | 365 + www.inderscienceonline.com | | 202 + www.ingentaconnect.com | | 450 + www.ingentaconnect.com | no-pdf-link | 260 + www.jstage.jst.go.jp | | 1248 + www.jstage.jst.go.jp | success | 870 + www.karger.com | | 313 + www.liebertpub.com | | 271 + www.liebertpub.com | no-pdf-link | 241 + www.nicecjournal.co.uk | | 274 + www.nicecjournal.co.uk | success | 274 + www.nomos-elibrary.de | | 1771 + www.nomos-elibrary.de | no-pdf-link | 788 + www.nomos-elibrary.de | redirect-loop | 506 + www.nomos-elibrary.de | bad-redirect | 207 + www.osti.gov | | 381 + www.osti.gov | link-loop | 326 + www.persee.fr | | 277 + www.preprints.org | | 225 + www.preprints.org | success | 225 + www.protocols.io | | 770 + www.protocols.io | success | 485 + www.repository.cam.ac.uk | | 510 + www.repository.cam.ac.uk | bad-redirect | 213 + www.research-collection.ethz.ch | | 416 + www.research-collection.ethz.ch | bad-redirect | 249 + www.researchsquare.com | | 1121 + www.researchsquare.com | bad-redirect | 985 + www.scielo.br | | 828 + www.scielo.br | success | 641 + www.sciencedirect.com | | 8567 + www.sciencedirect.com | terminal-bad-status | 5773 + www.sciencedirect.com | spn2-wayback-error | 1590 + www.sciencedirect.com | no-pdf-link | 576 + www.sciencedirect.com | spn2-backoff | 479 + www.sciendo.com | | 257 + www.sciendo.com | success | 222 + www.scitepress.org | | 381 + www.scitepress.org | no-pdf-link | 377 + www.spiedigitallibrary.org | | 1061 + www.spiedigitallibrary.org | bad-redirect | 571 + www.spiedigitallibrary.org | gateway-timeout | 233 + www.tandfonline.com | | 4934 + www.tandfonline.com | no-pdf-link | 2088 + www.tandfonline.com | terminal-bad-status | 1282 + www.tandfonline.com | blocked-cookie | 757 + www.tandfonline.com | redirect-loop | 488 + www.tandfonline.com | spn2-wayback-error | 202 + www.taylorfrancis.com | | 3979 + www.taylorfrancis.com | link-loop | 1928 + www.taylorfrancis.com | no-pdf-link | 1840 + www.techniques-ingenieur.fr | | 354 + www.techniques-ingenieur.fr | no-pdf-link | 353 + www.thieme-connect.de | | 1987 + www.thieme-connect.de | no-pdf-link | 949 + www.thieme-connect.de | link-loop | 869 + www.tib.eu | no-pdf-link | 315 + www.tib.eu | | 315 + www.un-ilibrary.org | no-pdf-link | 352 + www.un-ilibrary.org | | 352 + www.worldscientific.com | | 668 + www.worldscientific.com | no-pdf-link | 629 + www.zora.uzh.ch | | 318 + zenodo.org | | 46585 + zenodo.org | no-pdf-link | 29519 + zenodo.org | success | 14768 + zenodo.org | terminal-bad-status | 810 + zenodo.org | wrong-mimetype | 691 + zenodo.org | spn2-cdx-lookup-failure | 395 + zenodo.org | spn2-backoff | 294 + zivahub.uct.ac.za | | 1909 + zivahub.uct.ac.za | no-pdf-link | 1880 + zop.zb.uzh.ch | | 228 + zop.zb.uzh.ch | success | 217 + | | 365582 + | success | 141497 38.7% + | no-pdf-link | 120852 33.0% + | terminal-bad-status | 31900 8.7% + | spn2-backoff | 16979 4.6% + | link-loop | 13624 3.7% + | bad-redirect | 8736 + | redirect-loop | 7405 + | gateway-timeout | 6997 + | spn2-cdx-lookup-failure | 5146 + | spn2-wayback-error | 3708 + | wrong-mimetype | 2158 + | blocked-cookie | 1942 + | spn2-error:blocked-url | 1733 + | wayback-error | 1063 + | spn2-error | 647 + | spn2-error:500 | 265 + | cdx-error | 257 +(383 rows) + +---- + +365k in 7 days is about 52k a day, which is about expected. Around 5-7% need +retries. + +important changes: +- biorxiv.org: needs fix and then retries +- academic.oup.com: should probably skip +- apps.crossref.org: need to handle this in code +- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/` +- doi.org: investigate redirect-loop and terminal-bad-status +- osf.io: not getting PDFs +- papers.ssrn.com: why are these attempted? +- publons.com: not getting PDFs; special case these? +- www.sciencedirect.com: not working at all? + +smaller: +- bridges.monash.edu: fix, then retry? +- dl.acm.org: some broader retries? +- figshare.com: still some attempts, but almost all no-pdf-link +- onlinelibrary.wiley.com: getting blocked broadly? +- www.endocrine-abstracts.org: HTML content? +- www.igi-global.com: no-pdf-link diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt new file mode 100644 index 0000000..be2b30c --- /dev/null +++ b/sql/stats/2022-09-06_stats.txt @@ -0,0 +1,438 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 62 GB | 44 GB | 106 GB + "public"."ingest_request" | 51 GB | 50 GB | 101 GB + "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB + "public"."file_meta" | 39 GB | 39 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7469 MB | 66 MB | 7535 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 198175106 | 282695671015403 + (1 row) + + 198 million files, 282 TBytes. + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 197021437 + text/html | 830331 + application/octet-stream | 186669 + application/xml | 42170 + application/xhtml+xml | 38207 + text/plain | 16471 + application/jats+xml | 10385 + application/gzip | 6681 + | 6032 + application/postscript | 4916 + image/jpeg | 4522 + application/vnd.ms-powerpoint | 1672 + application/msword | 946 + application/x-bzip2 | 891 + image/png | 659 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382 + application/x-compress | 274 + video/mp4 | 218 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12800 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 137283420 | 172140506 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 157465613 + warc/revisit | 11337336 + text/html | 1137208 + application/octet-stream | 950380 + text/xml | 528965 + unk | 253294 + application/postscript | 81130 + application/save | 81069 + binary/octet-stream | 68942 + application/x-download | 42717 + application/download | 40628 + image/pdf | 39904 + text/plain | 36445 + application/force-download | 24148 + multipart/form-data | 10972 + application | 5409 + application/x-octetstream | 5192 + application/x-msdownload | 3854 + .pdf | 3518 + application/x-pdf | 3061 + application/octet | 1792 + pdf | 1757 + application/binary | 1399 + file | 1373 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1087 + application/unknown | 1005 + 0 | 773 + text/pdf | 729 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 129001717 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 120797098 + 500 | 8198783 + -4 | 5802 + 503 | 36 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 60469462 + 0.5.5-fatcat | 47472904 + | 12665498 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | doi | 43852308 + pdf | mag | 43701948 + pdf | doaj | 6534341 + html | doaj | 3987669 + pdf | arxiv | 2784589 + pdf | pmc | 2439181 + pdf | dblp | 631716 + html | doi | 126699 + xml | doaj | 23066 + pdf | cnki_covid19 | 2034 + pdf | spn | 1026 + pdf | wanfang_covid19 | 975 + html | spn | 65 + xml | spn | 2 + xml | doi | 1 + (17 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 24742500 + pdf | doi | fatcat-ingest | 15592121 + pdf | doaj | doaj | 6484737 + html | doaj | doaj | 3987468 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 799793 + pdf | dblp | dblp | 631716 + pdf | pmc | fatcat-ingest | 297980 + html | doi | fatcat-ingest | 121508 + pdf | pmc | fatcat-changelog | 112376 + pdf | doaj | fatcat-changelog | 47181 + xml | doaj | doaj | 23066 + html | doi | fatcat-changelog | 5129 + pdf | doaj | fatcat-ingest | 2423 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1814 + pdf | spn | savepapernow-web | 1026 + pdf | wanfang_covid19 | scrape-covid19 | 975 + html | doaj | fatcat-ingest | 201 + html | spn | savepapernow-web | 65 + html | doi | savepapernow-web | 62 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 4 + xml | spn | savepapernow-web | 2 + xml | doi | savepapernow-web | 1 + (30 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 167653 + pdf | doaj | 81517 + pdf | oai | 15282 + html | doaj | 1791 + pdf | unpaywall | 270 + pdf | doi | 22 + (6 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 16024068 | 0.313 + pdf | unpaywall | 43932525 | 36045446 | 0.820 + pdf | doi | 43852308 | 14956080 | 0.341 + pdf | mag | 43701948 | 32768484 | 0.750 + pdf | doaj | 6534341 | 4704066 | 0.720 + html | doaj | 3987669 | 778165 | 0.195 + pdf | arxiv | 2784589 | 2419941 | 0.869 + pdf | pmc | 2439181 | 1897671 | 0.778 + pdf | dblp | 631716 | 305142 | 0.483 + html | doi | 126699 | 75754 | 0.598 + xml | doaj | 23066 | 10381 | 0.450 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | spn | 1026 | 778 | 0.758 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + html | spn | 65 | 13 | 0.200 + xml | spn | 2 | 1 | 0.500 + xml | doi | 1 | 0 | 0.000 + (17 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 94887295 + pdf | no-pdf-link | 33960080 + pdf | no-capture | 20893916 + pdf | terminal-bad-status | 6973765 + pdf | redirect-loop | 5775175 + pdf | link-loop | 4095424 + pdf | skip-url-blocklist | 4037518 + pdf | blocked-cookie | 3508762 + html | wrong-scope | 1783694 + pdf | wrong-mimetype | 1379673 + html | success | 853762 + pdf | gateway-timeout | 635170 + html | no-capture | 381283 + pdf | wayback-content-error | 356694 + pdf | cdx-error | 347700 + pdf | null-body | 336166 + html | unknown-scope | 321874 + html | html-resource-no-capture | 294294 + pdf | forbidden | 291127 + pdf | not-found | 274343 + pdf | too-many-redirects | 264494 + component | wrong-mimetype | 196680 + component | spn2-cdx-lookup-failure | 173615 + component | spn2-backoff | 115840 + html | terminal-bad-status | 106264 + html | null-body | 100296 + pdf | wayback-error | 94748 + html | blocked-cookie | 88537 + component | no-capture | 75278 + pdf | empty-blob | 61157 + pdf | bad-redirect | 58680 + pdf | skip-wall | 57751 + pdf | spn2-error:too-many-redirects | 52873 + html | spn2-backoff | 50577 + pdf | remote-server-error | 41282 + pdf | invalid-host-resolution | 38864 + pdf | read-timeout | 37071 + pdf | spn2-cdx-lookup-failure | 34229 + html | wrong-mimetype | 33643 + pdf | spn2-backoff | 32437 + pdf | petabox-error | 31006 + html | wayback-content-error | 28034 + component | spn2-error | 27044 + pdf | spn2-error:unknown | 25810 + component | gateway-timeout | 25215 + pdf | body-too-large | 21721 + html | petabox-error | 18313 + html | empty-blob | 14393 + html | redirect-loop | 13404 + component | blocked-cookie | 12287 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 45052391 + pdf | | 26117481 + pdf | 301 | 4814786 + html | 200 | 2684821 + pdf | 403 | 1871088 + pdf | 404 | 1254259 + pdf | 302 | 898728 + pdf | 503 | 867548 + pdf | 401 | 851205 + pdf | 429 | 741869 + pdf | 400 | 624519 + component | | 456915 + html | | 442051 + pdf | 500 | 283700 + component | 200 | 197510 + pdf | 410 | 120647 + pdf | 303 | 107947 + html | 404 | 80114 + pdf | 420 | 26722 + pdf | 502 | 19500 + pdf | 409 | 15499 + html | 429 | 15208 + pdf | 509 | 15167 + pdf | 999 | 12186 + pdf | 202 | 11535 + html | 301 | 10213 + xml | | 10018 + pdf | 307 | 8657 + pdf | 402 | 8338 + pdf | 412 | 8064 + pdf | 308 | 6479 + html | 500 | 4746 + xml | 200 | 2668 + pdf | 520 | 2496 + html | 302 | 2289 + pdf | 521 | 2257 + html | 202 | 2177 + pdf | 206 | 1961 + html | 403 | 1775 + pdf | 504 | 1187 + pdf | 421 | 1148 + html | 303 | 1112 + pdf | 406 | 1109 + pdf | 204 | 772 + pdf | 432 | 745 + pdf | 405 | 633 + html | 400 | 632 + pdf | 426 | 515 + pdf | 508 | 503 + pdf | 505 | 469 + (50 rows) diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt new file mode 100644 index 0000000..0a6254a --- /dev/null +++ b/sql/stats/2022-11-23_table_sizes.txt @@ -0,0 +1,21 @@ +PostgreSQL 13.2 - wbgrp-svc506.us.archive.org +Size: 1.13T + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 63 GB | 45 GB | 108 GB + "public"."ingest_request" | 53 GB | 52 GB | 105 GB + "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB + "public"."file_meta" | 39 GB | 40 GB | 79 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7879 MB | 68 MB | 7947 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes diff --git a/sql/stats/README.md b/sql/stats/README.md index 62e213c..3161514 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -49,7 +49,7 @@ mimetype counts: Counts: - SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + SELECT COUNT(*) AS total_files FROM grobid; Status? @@ -107,14 +107,3 @@ Failed ingest by terminal status code: SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; -## Fatcat Files - -Count of PDF files that GROBID processed and matched to a release (via -glutton), but no PDF in `fatcat_file`: - - SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count - FROM grobid - LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex - WHERE fatcat_file.sha1hex IS NULL - AND grobid.fatcat_release IS NOT NULL; - |