diff options
Diffstat (limited to 'sql')
41 files changed, 4787 insertions, 56 deletions
diff --git a/sql/Makefile b/sql/Makefile new file mode 100644 index 0000000..860addb --- /dev/null +++ b/sql/Makefile @@ -0,0 +1,35 @@ + +SHELL=/bin/bash -euo pipefail +TODAY ?= $(shell date --iso --utc) +DATADIR ?= /srv/sandcrawler/tasks/$(TODAY) +DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) +DATABASE_URL ?= sandcrawler + +.PHONY: help +help: ## Print info about all commands + @echo "Commands:" + @echo + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: create_datadir +create_datadir: + mkdir -p $(DATADIR)/ + sudo chmod a+rw $(DATADIR)/ + +$(DATADIR)/.DB_DUMP: + sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip + mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump + touch $@ + +.PHONY: database-snapshot +database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot + @echo + +$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP + ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))" + ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump + touch $@ + +.PHONY: upload-database-snapshot +upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org + @echo diff --git a/sql/README.md b/sql/README.md index 42dba31..e488006 100644 --- a/sql/README.md +++ b/sql/README.md @@ -139,10 +139,30 @@ Questions we might want to answer http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 -## Full Database Dumps -Run a dump in compressed, postgres custom format: +## Full SQL Database Dumps + +Run a dump in compressed, postgres custom format, not including `crossref` table (which is large and redundant): export DATESLUG="`date +%Y-%m-%d.%H%M%S`" - time sudo -u postgres pg_dump --verbose --format=custom sandcrawler > /sandcrawler-db/snapshots/sandcrawler_full_dbdump_${DATESLUG}.pgdump + time sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump + +As of 2021-12-03, this process runs for about 6 hours and the compressed +snapshot is 102 GBytes (compared with 940GB database disk consumption, +including crossref). + +Then, upload to petabox as a backup: + + ia upload sandcrawler_full_dbdump_YYYY-MM-DD -m mediatype:data -m collection:webgroup-internal-backups -m title:"Sandcrawler SQL Dump (YYYY-MM-DD)" sandcrawler_full_dbdump_${DATESLUG}.pgdump + + +## SQL Database Restore + +To restore a dump (which will delete local database content, if any): + + sudo su postgres + createuser --no-login web_anon + createuser -s sandcrawler + time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump +Took about 2.5 hours. diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md index f1a5f86..4a56065 100644 --- a/sql/backfill/backfill.md +++ b/sql/backfill/backfill.md @@ -76,6 +76,19 @@ In psql: COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); # => COPY 24727350 +In 2021-11-26: + + zcat file_export.json.gz \ + | pv -l \ + | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \ + | sort -S 8G \ + | uniq -w 40 \ + | pigz \ + > fatcat_file.2021-11-26.tsv.gz + + COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 112086814 + ## `file_meta` zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql new file mode 100644 index 0000000..a7d6c2b --- /dev/null +++ b/sql/dump_file_meta.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT sha1hex, row_to_json(file_meta) + FROM file_meta + ORDER BY sha1hex ASC +) +TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql index 3ca8085..e7c48f3 100644 --- a/sql/dump_regrobid_pdf_petabox.sql +++ b/sql/dump_regrobid_pdf_petabox.sql @@ -9,7 +9,7 @@ COPY ( SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) ) -TO '/grande/snapshots/dump_regrobid_pdf_petabox.2020-02-03.json' +TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql new file mode 100644 index 0000000..698db7a --- /dev/null +++ b/sql/dump_reingest_bulk.sql @@ -0,0 +1,31 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '24 hour'::INTERVAL + AND ingest_request.created > NOW() - '181 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status like 'cdx-error' + OR ingest_file_result.status like 'petabox-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql new file mode 100644 index 0000000..7473420 --- /dev/null +++ b/sql/dump_reingest_old.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_file_result.hit = false + AND ingest_request.created < NOW() - '6 day'::INTERVAL + -- AND ingest_request.created > NOW() - '181 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container' + OR ingest_request.ingest_request_source = 'unpaywall' + OR ingest_request.ingest_request_source = 'arxiv' + OR ingest_request.ingest_request_source = 'pmc' + OR ingest_request.ingest_request_source = 'doaj' + OR ingest_request.ingest_request_source = 'dblp') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status like 'no-capture' + -- OR ingest_file_result.status like 'cdx-error' + -- OR ingest_file_result.status like 'petabox-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql new file mode 100644 index 0000000..dbeb199 --- /dev/null +++ b/sql/dump_reingest_quarterly.sql @@ -0,0 +1,47 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container' + OR ingest_request.ingest_request_source = 'unpaywall' + OR ingest_request.ingest_request_source = 'arxiv' + OR ingest_request.ingest_request_source = 'pmc' + OR ingest_request.ingest_request_source = 'doaj' + OR ingest_request.ingest_request_source = 'dblp') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_quarterly_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql new file mode 100644 index 0000000..a83125c --- /dev/null +++ b/sql/dump_reingest_spn.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created > NOW() - '180 day'::INTERVAL + AND ingest_request.ingest_request_source = 'savepapernow-web' + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status = 'cdx-error' + -- OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + -- OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql new file mode 100644 index 0000000..b72a096 --- /dev/null +++ b/sql/dump_reingest_terminalstatus.sql @@ -0,0 +1,34 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_file_result.hit = false + AND ingest_request.created < NOW() - '72 hour'::INTERVAL + AND ingest_request.created > NOW() - '10 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 404 + ) + AND ( + ingest_request.base_url LIKE 'https://doi.org/10.3390/%' + OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%' + OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%' + ) +) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql new file mode 100644 index 0000000..a019938 --- /dev/null +++ b/sql/dump_reingest_weekly.sql @@ -0,0 +1,42 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '8 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status = 'cdx-error' + -- OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + -- OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql index 4d264b9..a7fb920 100644 --- a/sql/dump_unextracted_pdf.sql +++ b/sql/dump_unextracted_pdf.sql @@ -1,16 +1,22 @@ -- Run like: --- psql sandcrawler < dump_unextracted_pdf.sql > dump_unextracted_pdf.2019-09-23.json +-- psql sandcrawler < dump_unextracted_pdf.sql BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; COPY ( - SELECT row_to_json(cdx) + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) FROM grobid LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL ) --- TO '/grande/snapshots/dump_unextracted_pdf.2020-06-25.json'; -TO STDOUT +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql new file mode 100644 index 0000000..bb9f162 --- /dev/null +++ b/sql/dump_unextracted_pdf_petabox.sql @@ -0,0 +1,18 @@ + +-- Run like: +-- psql sandcrawler < dump_unextracted_pdf_petabox.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM grobid + LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE petabox.sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL +) +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql index 3e6d782..81caf18 100644 --- a/sql/dump_ungrobid_pdf.sql +++ b/sql/dump_ungrobid_pdf.sql @@ -1,15 +1,18 @@ -- Run like: --- psql sandcrawler < dump_ungrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_ungrobid_pdf.2019-09-23.json +-- psql sandcrawler < dump_ungrobid_pdf.sql BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; COPY ( - SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx - WHERE cdx.mimetype = 'application/pdf' - AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex) + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) + -- uncomment/comment this to control whether only fatcat files are included + --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) ) -TO STDOUT +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' WITH NULL ''; ROLLBACK; diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql new file mode 100644 index 0000000..b7a1db2 --- /dev/null +++ b/sql/dump_ungrobid_pdf_petabox.sql @@ -0,0 +1,17 @@ + +-- Run like: +-- psql sandcrawler < dump_ungrobid_pdf_petabox.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM petabox + WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) + -- uncomment/comment this to control whether only fatcat files are included + AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) +) +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql new file mode 100644 index 0000000..333ff7b --- /dev/null +++ b/sql/dump_unmatched_glutton_pdf.sql @@ -0,0 +1,19 @@ + +-- Run like: +-- psql sandcrawler < THING.sql > THING.2019-09-23.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(grobid) + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL + LIMIT 1000 +) +TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; +--TO STDOUT +--WITH NULL ''; + +ROLLBACK; diff --git a/sql/ingest_again.md b/sql/ingest_again.md index 3b4b990..b749557 100644 --- a/sql/ingest_again.md +++ b/sql/ingest_again.md @@ -12,7 +12,7 @@ AND ingest_file_result.status like 'spn2-%' AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' - ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -25,7 +25,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -38,7 +38,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source != 'fatcat-changelog' AND ingest_request.ingest_request_source != 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -49,7 +49,7 @@ AND ingest_file_result.status like 'wayback-error' AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL - ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -62,7 +62,7 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json'; COPY ( SELECT row_to_json(ingest_request.*) FROM ingest_request @@ -75,16 +75,16 @@ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL AND (ingest_request.ingest_request_source = 'fatcat-changelog' OR ingest_request.ingest_request_source = 'fatcat-ingest') - ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json'; Transform: - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json Push to kafka (shuffled): @@ -122,10 +122,10 @@ Push to kafka (not shuffled): AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' AND ingest_request.ingest_request_source = 'fatcat-ingest' - ) TO '/grande/snapshots/reingest_fatcat_current.rows.json'; + ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json'; # note: shuf - ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt new file mode 100644 index 0000000..b684400 --- /dev/null +++ b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt @@ -0,0 +1,326 @@ + doi_prefix | status | count +------------+-------------------------------+-------- + 10.1001 | | 230 + 10.1002 | | 3914 + 10.1002 | terminal-bad-status | 1540 + 10.1002 | forbidden | 1072 + 10.1002 | redirect-loop | 995 + 10.1002 | no-pdf-link | 210 + 10.1016 | | 7976 + 10.1016 | no-pdf-link | 4648 + 10.1016 | terminal-bad-status | 1778 + 10.1016 | forbidden | 622 + 10.1016 | spn2-error:too-many-redirects | 344 + 10.1016 | redirect-loop | 225 + 10.1017 | | 2040 + 10.1017 | no-pdf-link | 720 + 10.1017 | success | 441 + 10.1017 | link-loop | 371 + 10.1017 | bad-redirect | 227 + 10.1021 | | 1722 + 10.1021 | blocked-cookie | 1552 + 10.1029 | | 248 + 10.1039 | | 1160 + 10.1039 | redirect-loop | 486 + 10.1039 | spn2-error:too-many-redirects | 395 + 10.1039 | spn2-wayback-error | 213 + 10.1051 | | 695 + 10.1051 | success | 557 + 10.1055 | | 541 + 10.1055 | not-found | 295 + 10.1055 | redirect-loop | 213 + 10.1057 | | 2835 + 10.1057 | redirect-loop | 2617 + 10.1061 | | 550 + 10.1061 | spn2-error:too-many-redirects | 425 + 10.1063 | | 600 + 10.1063 | spn2-error:too-many-redirects | 328 + 10.1080 | | 3801 + 10.1080 | blocked-cookie | 2431 + 10.1080 | terminal-bad-status | 711 + 10.1080 | forbidden | 341 + 10.1081 | | 299 + 10.1081 | link-loop | 222 + 10.1089 | | 236 + 10.1089 | blocked-cookie | 228 + 10.1093 | | 12805 + 10.1093 | link-loop | 8627 + 10.1093 | redirect-loop | 1659 + 10.1093 | no-pdf-link | 1475 + 10.1093 | bad-redirect | 428 + 10.1093 | success | 391 + 10.1097 | | 1497 + 10.1097 | no-pdf-link | 503 + 10.1097 | link-loop | 346 + 10.1097 | spn2-error:too-many-redirects | 259 + 10.1097 | terminal-bad-status | 202 + 10.1101 | | 1859 + 10.1101 | redirect-loop | 993 + 10.1101 | forbidden | 703 + 10.1103 | | 597 + 10.1103 | not-found | 534 + 10.1108 | | 1055 + 10.1108 | no-pdf-link | 945 + 10.1109 | | 7067 + 10.1109 | spn2-error:too-many-redirects | 6299 + 10.1109 | success | 667 + 10.1111 | | 2099 + 10.1111 | redirect-loop | 1331 + 10.1111 | terminal-bad-status | 313 + 10.1111 | forbidden | 226 + 10.1115 | | 1278 + 10.1115 | bad-redirect | 707 + 10.1117 | | 561 + 10.1117 | spn2-error:too-many-redirects | 501 + 10.1126 | | 214 + 10.1136 | | 1989 + 10.1136 | success | 1463 + 10.1136 | link-loop | 294 + 10.1142 | | 300 + 10.1142 | blocked-cookie | 237 + 10.1145 | | 440 + 10.1145 | blocked-cookie | 354 + 10.1155 | | 480 + 10.1155 | success | 474 + 10.11588 | | 506 + 10.11588 | no-pdf-link | 264 + 10.11588 | success | 236 + 10.1159 | | 226 + 10.11606 | | 304 + 10.1161 | | 1142 + 10.1161 | blocked-cookie | 1011 + 10.1163 | | 2261 + 10.1163 | link-loop | 1767 + 10.1163 | success | 348 + 10.11648 | | 405 + 10.11648 | success | 404 + 10.1182 | | 2125 + 10.1182 | no-pdf-link | 2024 + 10.1183 | | 987 + 10.1183 | redirect-loop | 838 + 10.1186 | | 1481 + 10.1186 | success | 1412 + 10.1201 | | 7649 + 10.1201 | link-loop | 5383 + 10.1201 | forbidden | 1504 + 10.1201 | no-pdf-link | 312 + 10.1299 | | 264 + 10.1299 | no-pdf-link | 209 + 10.13134 | | 201 + 10.1353 | | 549 + 10.1353 | terminal-bad-status | 443 + 10.1371 | | 552 + 10.1371 | success | 542 + 10.14201 | | 656 + 10.14201 | success | 366 + 10.14361 | | 647 + 10.14361 | link-loop | 585 + 10.14746 | | 260 + 10.14746 | success | 232 + 10.1504 | | 527 + 10.1504 | no-pdf-link | 501 + 10.15122 | | 246 + 10.15122 | success | 243 + 10.1515 | | 16240 + 10.1515 | link-loop | 12589 + 10.1515 | success | 1941 + 10.1515 | no-pdf-link | 1008 + 10.1515 | not-found | 283 + 10.15405 | | 229 + 10.15405 | success | 218 + 10.1553 | | 418 + 10.1553 | no-pdf-link | 396 + 10.1590 | | 655 + 10.1590 | success | 623 + 10.17104 | | 1202 + 10.17104 | no-pdf-link | 953 + 10.17104 | bad-redirect | 249 + 10.17605 | | 368 + 10.17605 | not-found | 337 + 10.17615 | | 9401 + 10.17615 | redirect-loop | 5720 + 10.17615 | spn2-wayback-error | 3099 + 10.17615 | spn2-cdx-lookup-failure | 201 + 10.17863 | | 438 + 10.18148 | | 465 + 10.18148 | success | 462 + 10.18720 | | 210 + 10.18821 | | 476 + 10.18821 | redirect-loop | 366 + 10.20345 | | 222 + 10.20345 | terminal-bad-status | 215 + 10.20546 | | 244 + 10.20546 | no-pdf-link | 241 + 10.21037 | | 232 + 10.2118 | | 903 + 10.2118 | redirect-loop | 853 + 10.21203 | | 1824 + 10.21203 | success | 1545 + 10.2139 | | 1493 + 10.2139 | link-loop | 1145 + 10.2147 | | 318 + 10.2147 | success | 267 + 10.2172 | | 282 + 10.2174 | | 363 + 10.2174 | no-pdf-link | 320 + 10.2196 | | 265 + 10.2208 | | 299 + 10.22215 | | 218 + 10.22215 | success | 217 + 10.22323 | | 289 + 10.22323 | success | 262 + 10.22533 | | 395 + 10.22533 | success | 393 + 10.22541 | | 291 + 10.22541 | success | 275 + 10.23919 | | 426 + 10.23919 | spn2-error:too-many-redirects | 403 + 10.24034 | | 319 + 10.24034 | spn2-error | 203 + 10.24355 | | 15360 + 10.24355 | no-pdf-link | 15228 + 10.24411 | | 1506 + 10.24411 | forbidden | 823 + 10.24411 | redirect-loop | 647 + 10.25335 | | 550 + 10.25335 | no-pdf-link | 550 + 10.25365 | | 429 + 10.25365 | success | 424 + 10.25384 | | 338 + 10.25384 | success | 249 + 10.25646 | | 239 + 10.26197 | no-pdf-link | 303 + 10.26197 | | 303 + 10.26226 | | 272 + 10.26278 | | 1291 + 10.26278 | redirect-loop | 756 + 10.26278 | spn2-error:too-many-redirects | 509 + 10.29327 | | 232 + 10.2991 | | 307 + 10.2991 | spn2-wayback-error | 227 + 10.30965 | | 722 + 10.30965 | link-loop | 709 + 10.3109 | | 801 + 10.3109 | link-loop | 572 + 10.3109 | forbidden | 228 + 10.31219 | | 951 + 10.31219 | redirect-loop | 518 + 10.31219 | spn2-wayback-error | 356 + 10.31274 | | 296 + 10.31743 | | 403 + 10.31743 | success | 294 + 10.31857 | | 209 + 10.3233 | | 471 + 10.33448 | | 213 + 10.33448 | success | 212 + 10.3389 | | 1459 + 10.3389 | success | 1417 + 10.3390 | | 4511 + 10.3390 | success | 3577 + 10.3390 | terminal-bad-status | 485 + 10.3390 | forbidden | 379 + 10.3406 | | 243 + 10.3406 | terminal-bad-status | 213 + 10.34944 | | 527 + 10.34944 | success | 459 + 10.35016 | | 688 + 10.35016 | no-pdf-link | 687 + 10.36347 | success | 213 + 10.36347 | | 213 + 10.37747 | | 213 + 10.37747 | no-pdf-link | 213 + 10.37904 | | 227 + 10.37904 | no-pdf-link | 226 + 10.3917 | | 347 + 10.3917 | redirect-loop | 208 + 10.3923 | | 356 + 10.3923 | redirect-loop | 254 + 10.3929 | | 317 + 10.3929 | terminal-bad-status | 310 + 10.3931 | | 279 + 10.3931 | no-pdf-link | 279 + 10.4000 | | 7828 + 10.4000 | success | 3485 + 10.4000 | spn2-wayback-error | 2142 + 10.4000 | redirect-loop | 2106 + 10.4018 | | 249 + 10.4018 | not-found | 240 + 10.4103 | | 726 + 10.4103 | remote-server-error | 343 + 10.4103 | redirect-loop | 324 + 10.4159 | | 286 + 10.4159 | link-loop | 238 + 10.4324 | | 19398 + 10.4324 | link-loop | 12471 + 10.4324 | forbidden | 3632 + 10.4324 | not-found | 2283 + 10.4324 | terminal-bad-status | 645 + 10.4324 | success | 208 + 10.47295 | | 456 + 10.47295 | success | 449 + 10.47513 | | 218 + 10.47513 | no-pdf-link | 203 + 10.48084 | success | 538 + 10.48084 | | 538 + 10.5040 | | 375 + 10.5040 | no-pdf-link | 365 + 10.5167 | | 290 + 10.5167 | redirect-loop | 278 + 10.5169 | | 360 + 10.5169 | no-pdf-link | 355 + 10.5194 | | 917 + 10.5194 | success | 887 + 10.5216 | | 213 + 10.5220 | no-pdf-link | 397 + 10.5220 | | 397 + 10.5281 | | 22551 + 10.5281 | terminal-bad-status | 12158 + 10.5281 | success | 4901 + 10.5281 | no-pdf-link | 4754 + 10.5281 | spn2-error:unknown | 360 + 10.5282 | | 228 + 10.5451 | | 2068 + 10.5451 | success | 1071 + 10.5451 | terminal-bad-status | 817 + 10.5753 | | 268 + 10.5753 | success | 264 + 10.5771 | | 941 + 10.5771 | no-pdf-link | 397 + 10.5771 | bad-redirect | 269 + 10.5771 | link-loop | 238 + 10.6068 | | 441 + 10.6068 | no-pdf-link | 384 + 10.6084 | | 917 + 10.6084 | no-pdf-link | 520 + 10.6084 | success | 368 + 10.7287 | | 234 + 10.7287 | no-pdf-link | 212 + 10.7312 | | 382 + 10.7312 | link-loop | 291 + 10.7554 | | 205 + 10.7891 | | 380 + 10.7891 | no-pdf-link | 376 + 10.7916 | | 331 + 10.7916 | no-pdf-link | 201 + 10.7939 | | 535 + 10.7939 | no-pdf-link | 527 + | | 272831 + | success | 62298 + | no-pdf-link | 60737 + | link-loop | 48558 + | redirect-loop | 26842 + | terminal-bad-status | 22685 + | spn2-error:too-many-redirects | 11174 + | forbidden | 10900 + | spn2-wayback-error | 7796 + | blocked-cookie | 6961 + | not-found | 5468 + | bad-redirect | 2666 + | spn2-error | 2398 + | spn2-cdx-lookup-failure | 1374 + | petabox-error | 678 + | remote-server-error | 461 + | wrong-mimetype | 443 + | spn2-error:proxy-error | 420 + | spn2-error:unknown | 360 +(323 rows) diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt new file mode 100644 index 0000000..28dd0d0 --- /dev/null +++ b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt @@ -0,0 +1,307 @@ + domain | status | count +-------------------------------------------------------------------+-------------------------------+-------- + 202.148.31.178 | | 298 + academic.oup.com | | 1624 + academic.oup.com | no-pdf-link | 673 + academic.oup.com | bad-redirect | 444 + academic.oup.com | link-loop | 358 + aip.scitation.org | | 257 + apps.crossref.org | | 1414 + apps.crossref.org | no-pdf-link | 1410 + article.sciencepublishinggroup.com | | 404 + article.sciencepublishinggroup.com | success | 404 + arxiv.org | | 24340 + arxiv.org | success | 22381 + arxiv.org | terminal-bad-status | 1260 + arxiv.org | no-pdf-link | 412 + arxiv.org | no-capture | 262 + ashpublications.org | | 2049 + ashpublications.org | no-pdf-link | 2024 + asmedigitalcollection.asme.org | | 1245 + asmedigitalcollection.asme.org | bad-redirect | 707 + assets.researchsquare.com | | 1549 + assets.researchsquare.com | success | 1546 + bioone.org | | 201 + biorxiv.org | redirect-loop | 702 + biorxiv.org | | 702 + blogs.ethz.ch | | 687 + blogs.ethz.ch | no-pdf-link | 686 + books.openedition.org | | 446 + books.openedition.org | redirect-loop | 382 + brill.com | | 2203 + brill.com | link-loop | 1779 + brill.com | success | 359 + catalog.paradisec.org.au | | 770 + catalog.paradisec.org.au | redirect-loop | 756 + cdr.lib.unc.edu | | 9432 + cdr.lib.unc.edu | redirect-loop | 5720 + cdr.lib.unc.edu | spn2-wayback-error | 3187 + cdr.lib.unc.edu | spn2-cdx-lookup-failure | 201 + classiques-garnier.com | | 246 + classiques-garnier.com | success | 243 + content.iospress.com | | 242 + content.taylorfrancis.com | | 309 + content.taylorfrancis.com | terminal-bad-status | 309 + curve.carleton.ca | success | 201 + curve.carleton.ca | | 201 + cyberdoi.ru | redirect-loop | 647 + cyberdoi.ru | | 647 + czasopisma.kul.pl | | 402 + czasopisma.kul.pl | success | 294 + d.lib.msu.edu | | 550 + d.lib.msu.edu | no-pdf-link | 550 + d197for5662m48.cloudfront.net | success | 276 + d197for5662m48.cloudfront.net | | 276 + dergipark.org.tr | | 674 + dergipark.org.tr | no-pdf-link | 255 + dergipark.org.tr | success | 248 + digi.ub.uni-heidelberg.de | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 261 + dl.acm.org | | 441 + dl.acm.org | blocked-cookie | 361 + dlc.library.columbia.edu | | 201 + dlc.library.columbia.edu | no-pdf-link | 201 + doi.ala.org.au | | 308 + doi.ala.org.au | no-pdf-link | 308 + doi.org | | 474 + doi.org | terminal-bad-status | 344 + downloads.hindawi.com | | 479 + downloads.hindawi.com | success | 478 + edoc.rki.de | | 238 + edoc.unibas.ch | | 2018 + edoc.unibas.ch | success | 1067 + edoc.unibas.ch | terminal-bad-status | 817 + elib.spbstu.ru | | 205 + elifesciences.org | | 204 + era.library.ualberta.ca | | 531 + era.library.ualberta.ca | no-pdf-link | 527 + erj.ersjournals.com | | 951 + erj.ersjournals.com | redirect-loop | 829 + europepmc.org | | 289 + europepmc.org | success | 283 + figshare.com | | 233 + figshare.com | no-pdf-link | 208 + fjfsdata01prod.blob.core.windows.net | | 1430 + fjfsdata01prod.blob.core.windows.net | success | 1418 + hw.oeaw.ac.at | | 283 + hw.oeaw.ac.at | no-pdf-link | 283 + idb.ub.uni-tuebingen.de | | 216 + idb.ub.uni-tuebingen.de | terminal-bad-status | 215 + ieeexplore.ieee.org | | 7561 + ieeexplore.ieee.org | spn2-error:too-many-redirects | 6732 + ieeexplore.ieee.org | success | 683 + ijgc.bmj.com | | 411 + ijgc.bmj.com | success | 399 + jamanetwork.com | | 229 + jitc.bmj.com | | 849 + jitc.bmj.com | success | 773 + journals.aps.org | | 539 + journals.aps.org | not-found | 534 + journals.lww.com | | 1124 + journals.lww.com | no-pdf-link | 547 + journals.lww.com | link-loop | 399 + journals.openedition.org | | 7366 + journals.openedition.org | success | 3484 + journals.openedition.org | spn2-wayback-error | 2120 + journals.openedition.org | redirect-loop | 1720 + journals.plos.org | | 552 + journals.plos.org | success | 542 + kiss.kstudy.com | | 306 + kiss.kstudy.com | no-pdf-link | 292 + lib.dr.iastate.edu | | 297 + link.springer.com | | 2830 + link.springer.com | redirect-loop | 2625 + linkinghub.elsevier.com | | 970 + linkinghub.elsevier.com | forbidden | 415 + linkinghub.elsevier.com | spn2-error:too-many-redirects | 357 + medrxiv.org | | 287 + medrxiv.org | redirect-loop | 287 + muse.jhu.edu | | 470 + muse.jhu.edu | terminal-bad-status | 443 + ojs.ub.uni-konstanz.de | | 463 + ojs.ub.uni-konstanz.de | success | 462 + onlinelibrary.wiley.com | | 2064 + onlinelibrary.wiley.com | terminal-bad-status | 1973 + osf.io | | 1394 + osf.io | redirect-loop | 589 + osf.io | spn2-wayback-error | 425 + osf.io | not-found | 342 + othes.univie.ac.at | | 424 + othes.univie.ac.at | success | 424 + oxford.universitypressscholarship.com | | 8999 + oxford.universitypressscholarship.com | link-loop | 8282 + oxford.universitypressscholarship.com | no-pdf-link | 695 + oxfordhandbooks.com | redirect-loop | 460 + oxfordhandbooks.com | | 460 + papers.ssrn.com | | 1313 + papers.ssrn.com | link-loop | 1145 + peerj.com | | 313 + peerj.com | no-pdf-link | 212 + periodicos.urca.br | | 446 + periodicos.urca.br | success | 439 + pos.sissa.it | | 277 + pos.sissa.it | success | 262 + preprints.jmir.org | | 242 + pressto.amu.edu.pl | | 260 + pressto.amu.edu.pl | success | 232 + publikationsserver.tu-braunschweig.de | | 15358 + publikationsserver.tu-braunschweig.de | no-pdf-link | 15228 + publons.com | | 2810 + publons.com | redirect-loop | 2359 + publons.com | no-pdf-link | 444 + pubs.acs.org | | 1647 + pubs.acs.org | blocked-cookie | 1553 + pubs.rsc.org | | 765 + pubs.rsc.org | redirect-loop | 486 + pubs.rsc.org | spn2-wayback-error | 214 + res.mdpi.com | | 3620 + res.mdpi.com | success | 3591 + revistas.usal.es | | 580 + revistas.usal.es | success | 298 + revues.imist.ma | | 229 + rsdjournal.org | | 213 + rsdjournal.org | success | 212 + s3-eu-west-1.amazonaws.com | | 764 + s3-eu-west-1.amazonaws.com | success | 763 + s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | | 324 + s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | success | 324 + saspublishers.com | | 213 + saspublishers.com | success | 213 + scholarshare.temple.edu | | 524 + scholarshare.temple.edu | success | 464 + sol.sbc.org.br | | 268 + sol.sbc.org.br | success | 264 + statisticaldatasets.data-planet.com | | 442 + statisticaldatasets.data-planet.com | no-pdf-link | 390 + watermark.silverchair.com | | 521 + watermark.silverchair.com | success | 514 + www.ahajournals.org | | 1061 + www.ahajournals.org | blocked-cookie | 1011 + www.atlantis-press.com | | 308 + www.atlantis-press.com | spn2-wayback-error | 228 + www.beck-elibrary.de | | 1202 + www.beck-elibrary.de | no-pdf-link | 953 + www.beck-elibrary.de | bad-redirect | 249 + www.cairn.info | | 255 + www.cairn.info | redirect-loop | 208 + www.cambridge.org | | 2061 + www.cambridge.org | no-pdf-link | 727 + www.cambridge.org | success | 485 + www.cambridge.org | link-loop | 388 + www.cambridge.org | bad-redirect | 252 + www.confer.cz | | 227 + www.confer.cz | no-pdf-link | 226 + www.dbpia.co.kr | | 773 + www.dbpia.co.kr | no-pdf-link | 679 + www.degruyter.com | | 17046 + www.degruyter.com | link-loop | 14202 + www.degruyter.com | success | 2201 + www.degruyter.com | not-found | 235 + www.dovepress.com | | 316 + www.dovepress.com | success | 267 + www.e-manuscripta.ch | | 384 + www.e-manuscripta.ch | no-pdf-link | 383 + www.e-periodica.ch | | 358 + www.e-periodica.ch | no-pdf-link | 355 + www.e-rara.ch | no-pdf-link | 279 + www.e-rara.ch | | 279 + www.e3s-conferences.org | | 426 + www.e3s-conferences.org | success | 419 + www.elibrary.ru | | 303 + www.elibrary.ru | no-pdf-link | 301 + www.emerald.com | | 943 + www.emerald.com | no-pdf-link | 933 + www.etasr.com | | 466 + www.etasr.com | success | 466 + www.eurekaselect.com | | 345 + www.eurekaselect.com | no-pdf-link | 321 + www.europeanproceedings.com | | 218 + www.europeanproceedings.com | success | 218 + www.finersistemas.com | success | 397 + www.finersistemas.com | | 397 + www.humankineticslibrary.com | no-pdf-link | 321 + www.humankineticslibrary.com | | 321 + www.ijcmas.com | | 251 + www.ijcmas.com | no-pdf-link | 248 + www.inderscience.com | | 524 + www.inderscience.com | no-pdf-link | 501 + www.ingentaconnect.com | | 366 + www.ingentaconnect.com | no-pdf-link | 349 + www.jstage.jst.go.jp | | 1591 + www.jstage.jst.go.jp | success | 862 + www.jstage.jst.go.jp | no-pdf-link | 567 + www.jstor.org | | 351 + www.karger.com | | 224 + www.liebertpub.com | | 236 + www.liebertpub.com | blocked-cookie | 228 + www.mdpi.com | | 694 + www.mdpi.com | terminal-bad-status | 480 + www.medlit.ru | | 458 + www.medlit.ru | redirect-loop | 366 + www.morressier.com | | 285 + www.morressier.com | no-pdf-link | 253 + www.njca.info | | 223 + www.njca.info | remote-server-error | 222 + www.nomos-elibrary.de | | 913 + www.nomos-elibrary.de | no-pdf-link | 379 + www.nomos-elibrary.de | bad-redirect | 265 + www.nomos-elibrary.de | link-loop | 236 + www.onepetro.org | | 895 + www.onepetro.org | redirect-loop | 853 + www.osti.gov | | 212 + www.persee.fr | | 232 + www.persee.fr | terminal-bad-status | 213 + www.repository.cam.ac.uk | | 439 + www.research-collection.ethz.ch | | 312 + www.research-collection.ethz.ch | terminal-bad-status | 310 + www.revistas.ufg.br | | 212 + www.schoeningh.de | | 371 + www.schoeningh.de | link-loop | 366 + www.scialert.net | | 276 + www.scialert.net | redirect-loop | 254 + www.scielo.br | | 644 + www.scielo.br | success | 624 + www.sciencedirect.com | | 6523 + www.sciencedirect.com | no-pdf-link | 4668 + www.sciencedirect.com | terminal-bad-status | 1737 + www.scitepress.org | no-pdf-link | 397 + www.scitepress.org | | 397 + www.tandfonline.com | | 3448 + www.tandfonline.com | blocked-cookie | 2446 + www.tandfonline.com | terminal-bad-status | 714 + www.taylorfrancis.com | | 21292 + www.taylorfrancis.com | link-loop | 18648 + www.taylorfrancis.com | forbidden | 2022 + www.taylorfrancis.com | terminal-bad-status | 518 + www.thieme-connect.de | | 513 + www.thieme-connect.de | not-found | 292 + www.thieme-connect.de | redirect-loop | 213 + www.whateveryoneneedstoknow.com | | 1174 + www.whateveryoneneedstoknow.com | redirect-loop | 1163 + www.worldscientific.com | | 293 + www.worldscientific.com | blocked-cookie | 240 + www.zora.uzh.ch | | 290 + www.zora.uzh.ch | redirect-loop | 278 + zenodo.org | | 22202 + zenodo.org | terminal-bad-status | 12158 + zenodo.org | success | 4923 + zenodo.org | no-pdf-link | 4788 + | | 280719 + | success | 85143 + | no-pdf-link | 61335 + | link-loop | 48566 + | redirect-loop | 26845 + | terminal-bad-status | 23955 + | spn2-wayback-error | 7920 + | spn2-error:too-many-redirects | 7175 + | blocked-cookie | 6980 + | forbidden | 2912 + | bad-redirect | 2666 + | spn2-error | 1943 + | not-found | 1762 + | spn2-cdx-lookup-failure | 1376 + | wrong-mimetype | 467 + | remote-server-error | 388 + | spn2-error:proxy-error | 295 + | no-capture | 262 +(304 rows) diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 59423dd..33dba66 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -42,7 +42,9 @@ CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); CREATE TABLE IF NOT EXISTS fatcat_file ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), file_ident TEXT CHECK (octet_length(file_ident) = 26), - first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26), + any_url BOOLEAN, + content_scope TEXT CHECK (octet_length(content_scope) >= 1) ); CREATE TABLE IF NOT EXISTS petabox ( @@ -114,6 +116,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta ( -- encrypted ); +CREATE TABLE IF NOT EXISTS html_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + scope TEXT CHECK (octet_length(status) >= 1), + has_teixml BOOLEAN NOT NULL, + has_thumbnail BOOLEAN NOT NULL, + word_count INT CHECK (word_count >= 0), + biblio JSONB, + resources JSONB + -- biblio JSON fields are similar to fatcat release schema + -- resources JSON object is a list of objects with keys like webcapture CDX schema +); + CREATE TABLE IF NOT EXISTS ingest_request ( link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), @@ -128,10 +144,12 @@ CREATE TABLE IF NOT EXISTS ingest_request ( -- ext_ids (source/source_id sometimes enough) -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN) -- edit_extra + -- ingest type can be: pdf, xml, html PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) ); CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); +CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created); CREATE TABLE IF NOT EXISTS ingest_file_result ( ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), @@ -139,7 +157,7 @@ CREATE TABLE IF NOT EXISTS ingest_file_result ( updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, hit BOOLEAN NOT NULL, - status TEXT CHECK (octet_length(terminal_url) >= 1), + status TEXT CHECK (octet_length(status) >= 1), terminal_url TEXT CHECK (octet_length(terminal_url) >= 1), terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14), terminal_status_code INT, @@ -150,6 +168,43 @@ CREATE TABLE IF NOT EXISTS ingest_file_result ( CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); +CREATE TABLE IF NOT EXISTS ingest_fileset_platform ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + + platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1), + platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1), + platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1), + ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1), + total_size BIGINT, + file_count BIGINT, + archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1), + + archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1), + web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1), + web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14), + + manifest JSONB, + -- list, similar to fatcat fileset manifest, plus extra: + -- status (str) + -- path (str) + -- size (int) + -- md5 (str) + -- sha1 (str) + -- sha256 (str) + -- mimetype (str) + -- extra (dict) + -- platform_url (str) + -- terminal_url (str) + -- terminal_dt (str) + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id); + CREATE TABLE IF NOT EXISTS shadow ( shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), @@ -160,3 +215,31 @@ CREATE TABLE IF NOT EXISTS shadow ( PRIMARY KEY(shadow_corpus, shadow_id) ); CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); + +CREATE TABLE IF NOT EXISTS crossref ( + doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)), + indexed TIMESTAMP WITH TIME ZONE NOT NULL, + record JSON NOT NULL, + PRIMARY KEY(doi) +); + +CREATE TABLE IF NOT EXISTS grobid_refs ( + source TEXT NOT NULL CHECK (octet_length(source) >= 1), + source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1), + source_ts TIMESTAMP WITH TIME ZONE, + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + refs_json JSON NOT NULL, + PRIMARY KEY(source, source_id) +); + +CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS + SELECT + crossref.doi as doi, + crossref.indexed as indexed, + crossref.record as record, + grobid_refs.source_ts as source_ts, + grobid_refs.refs_json as refs_json + FROM crossref + LEFT JOIN grobid_refs ON + grobid_refs.source_id = crossref.doi + AND grobid_refs.source = 'crossref'; diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md index fd0ce51..0859e79 100644 --- a/sql/monitoring_queries.md +++ b/sql/monitoring_queries.md @@ -1,21 +1,21 @@ ## fatcat-changelog pipeline -Overall ingest status, past 3 days: +Overall ingest status, past 30 days: SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) FROM ingest_file_result LEFT JOIN ingest_request ON ingest_file_result.ingest_type = ingest_request.ingest_type AND ingest_file_result.base_url = ingest_request.base_url - WHERE ingest_request.created >= NOW() - '3 day'::INTERVAL + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL AND ingest_request.ingest_type = 'pdf' AND ingest_request.ingest_request_source = 'fatcat-changelog' GROUP BY ingest_file_result.ingest_type, ingest_file_result.status ORDER BY COUNT DESC LIMIT 20; -Broken domains, past 3 days: +Broken domains, past 30 days: SELECT domain, status, COUNT((domain, status)) FROM ( @@ -29,7 +29,7 @@ Broken domains, past 3 days: AND ingest_file_result.base_url = ingest_request.base_url WHERE -- ingest_request.created >= NOW() - '3 day'::INTERVAL - ingest_file_result.updated >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL AND ingest_request.ingest_type = 'pdf' AND ingest_request.ingest_request_source = 'fatcat-changelog' ) t1 @@ -39,7 +39,59 @@ Broken domains, past 3 days: ORDER BY COUNT DESC LIMIT 25; -Throughput per day, and success, for past month: +Summary of significant domains and status, past 7 days: + + SELECT domain, status, count + FROM ( + SELECT domain, status, COUNT((domain, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + GROUP BY CUBE (domain, status) + ) t2 + WHERE count > 200 + ORDER BY domain ASC , count DESC; + +Summary of DOI prefix and status, past 7 days: + + SELECT doi_prefix, status, count + FROM ( + SELECT doi_prefix, status, COUNT((doi_prefix, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_request.link_source_id FROM '(10\.[^/]*)/.*') AS doi_prefix + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_request.link_source = 'doi' + ) t1 + WHERE t1.doi_prefix != '' + GROUP BY CUBE (doi_prefix, status) + ) t2 + WHERE count > 200 + ORDER BY doi_prefix ASC , count DESC; + + +Throughput per day, and success, for past 30 days: SELECT ingest_request.ingest_type, date(ingest_request.created), @@ -116,3 +168,35 @@ Overall status, updated requests past 3 days: GROUP BY ingest_request.ingest_type, ingest_file_result.status ORDER BY COUNT(*) DESC; +## savepapernow and fatcat-ingest recent status + +Specific recent ingests (for debugging): + + -- for record layout: \x + SELECT + ingest_file_result.status as status, + ingest_request.ingest_type as ingest_type, + ingest_request.ingest_request_source as source, + ingest_request.link_source_id as source_id, + ingest_request.base_url as base_url, + ingest_file_result.terminal_dt as dt, + ingest_file_result.terminal_status_code as status_code, + ingest_file_result.terminal_sha1hex as sha1hex, + grobid.status as grobid_status + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid + ON ingest_file_result.terminal_sha1hex = grobid.sha1hex + WHERE + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + -- AND ingest_request.ingest_type = 'pdf' + -- AND ingest_request.ingest_type = 'html' + AND ( + ingest_request.ingest_request_source = 'savepapernow-web' + -- OR ingest_request.ingest_request_source = 'fatcat-ingest' + ) + ORDER BY ingest_file_result.updated DESC + LIMIT 100; + diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh new file mode 100755 index 0000000..d39a171 --- /dev/null +++ b/sql/reingest_bulk.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_bulk_current.json + +cat /srv/sandcrawler/tasks/reingest_bulk_current.json \ + | shuf \ + | head -n1000000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh new file mode 100755 index 0000000..96e5416 --- /dev/null +++ b/sql/reingest_old.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_old.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_old_current.json + +cat /srv/sandcrawler/tasks/reingest_old_current.json \ + | shuf \ + | head -n1000000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh new file mode 100755 index 0000000..8a2996c --- /dev/null +++ b/sql/reingest_quarterly.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_quarterly_current.json + +cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \ + | shuf \ + | head -n120000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh new file mode 100755 index 0000000..c693a64 --- /dev/null +++ b/sql/reingest_spn.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_spn.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \ + > /srv/sandcrawler/tasks/reingest_spn.json + +cat /srv/sandcrawler/tasks/reingest_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh new file mode 100755 index 0000000..5cb6d51 --- /dev/null +++ b/sql/reingest_terminalstatus_forcerecrawl.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json + +cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \ + | shuf \ + | head -n100000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh new file mode 100755 index 0000000..d2e2444 --- /dev/null +++ b/sql/reingest_weekly.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_weekly_current.json + +cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ + | shuf \ + | head -n80000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt new file mode 100644 index 0000000..d1993fc --- /dev/null +++ b/sql/stats/2020-07-23_stats.txt @@ -0,0 +1,347 @@ + +Summary: + +- very many more PDFs have been grobid-ed vs. pdf_meta-ed +- about 1 million file_meta still have partial metadata (eg, no sha256) +- database size still under 0.5 TByte +- there are about a million CDX error ingest requests, and hundreds of + thousands of SPN errors which could be re-run + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 42 GB | 84 GB + "public"."ingest_request" | 34 GB | 39 GB | 73 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 61 GB | 7742 MB | 69 GB + "public"."file_meta" | 32 GB | 29 GB | 61 GB + "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + Size: 466.91G + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 161944425 | 204,402,677,360,189 + (1 row) + + # 161.9 mil; 204 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 161691608 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1015337 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96537611 | 116281981 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------+----------- + application/pdf | 108706978 + warc/revisit | 5912013 + text/xml | 519042 + application/octet-stream | 307782 + text/html | 295634 + unk | 156937 + application/postscript | 81079 + application/save | 80871 + binary/octet-stream | 61263 + text/plain | 31495 + application/x-download | 30511 + application/download | 26716 + image/pdf | 26357 + application/force-download | 10541 + multipart/form-data | 5551 + application/x-msdownload | 3724 + application/x-octetstream | 3216 + application | 3171 + .pdf | 2728 + application/x-pdf | 2563 + application/binary | 1306 + application/pdf' | 1192 + pdf | 1180 + [...] + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + + total_files | unique_releases + -------------+----------------- + 95557413 | 18020570 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + + status_code | count + -------------+---------- + 200 | 88450610 + 500 | 7101098 + -4 | 4133 + 503 | 110 + + SELECT status, COUNT(*) FROM grobid GROUP BY ORDER BY COUNT DESC LIMIT 10; + + status | count + ----------------+---------- + success | 73814297 + | 14638412 + error | 7101308 + error-timeout | 4133 + bad-grobid-xml | 6 + (5 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 73813427 + | 14638425 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 16589669 + pdf | pmc | 2231113 + pdf | arxiv | 794693 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 148 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8320832 + pdf | doi | fatcat-changelog | 4752956 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630750 + pdf | pmc | fatcat-ingest | 194781 + pdf | arxiv | fatcat-changelog | 163924 + pdf | pmc | fatcat-changelog | 7507 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 148 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 19 + pdf | arxiv | savepapernow-web | 2 + +Uncrawled requests by source: + + # TODO: verify this? seems wrong + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | mag | 4097008 + pdf | oai | 15287 + pdf | unpaywall | 1 + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 5346057 | 0.104 + pdf | mag | 35015357 | 22199271 | 0.634 + pdf | unpaywall | 27653003 | 22067338 | 0.798 + pdf | doi | 16589700 | 3207661 | 0.193 + pdf | pmc | 2231113 | 1696976 | 0.761 + pdf | arxiv | 794727 | 645607 | 0.812 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 148 | 114 | 0.770 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 46465271 + pdf | no-capture | 46115869 + pdf | no-pdf-link | 13877460 + pdf | redirect-loop | 5943956 + pdf | terminal-bad-status | 1962754 + pdf | link-loop | 1630078 + pdf | cdx-error | 1014409 + pdf | gateway-timeout | 459340 + pdf | wrong-mimetype | 321774 + pdf | skip-url-blocklist | 220629 + pdf | wayback-error | 220453 + pdf | spn2-cdx-lookup-failure | 143963 + pdf | null-body | 113384 + pdf | spn-error | 101773 + pdf | invalid-host-resolution | 37367 + pdf | spn-remote-error | 28886 + pdf | petabox-error | 22997 + pdf | spn2-error | 16342 + pdf | spn2-error:job-failed | 5017 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 746 + pdf | spn2-error:soft-time-limit-exceeded | 632 + pdf | spn2-error:proxy-error | 437 + pdf | spn2-error:invalid-url-syntax | 417 + pdf | timeout | 417 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + + total_count | release_count + -------------+--------------- + 5862666 | 4728824 + (1 row) + +## PDF Meta + +Total rows: + + SELECT COUNT(*) as total_count FROM pdf_meta; + + + total_count + ------------- + 21961874 + +By status: + + SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC; + + status | count + ----------------+---------- + success | 21788507 + parse-error | 78196 + text-too-large | 60595 + not-pdf | 31679 + error-wayback | 2639 + bad-unicode | 251 + bad-pdf | 6 + empty-blob | 1 + (8 rows) + diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt new file mode 100644 index 0000000..3bc27b0 --- /dev/null +++ b/sql/stats/2020-09-14_stats.txt @@ -0,0 +1,340 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 44 GB | 45 GB | 89 GB + "public"."grobid" | 66 GB | 8127 MB | 74 GB + "public"."ingest_request" | 34 GB | 40 GB | 73 GB + "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."file_meta" | 33 GB | 30 GB | 63 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 167021210 | 221982345333674 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 166765214 + application/octet-stream | 155517 + application/xml | 42170 + text/html | 18708 + text/plain | 15990 + application/gzip | 6491 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62960 + (1 row) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 102123051 | 126550160 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ----------------------------+----------- + application/pdf | 116885565 + warc/revisit | 7951816 + text/xml | 519042 + application/octet-stream | 327639 + text/html | 295725 + unk | 172491 + application/postscript | 81095 + application/save | 80900 + binary/octet-stream | 61783 + text/plain | 33684 + image/pdf | 32856 + application/x-download | 32418 + application/download | 27672 + application/force-download | 10892 + multipart/form-data | 5750 + application/x-msdownload | 3832 + application/x-octetstream | 3516 + application | 3499 + .pdf | 3038 + application/x-pdf | 2701 + application/binary | 1322 + pdf | 1232 + file/unknown | 1199 + application/pdf' | 1192 + file | 979 + (25 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 101494314 | 18919012 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 93730358 + 500 | 7759103 + -4 | 4683 + 503 | 150 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 80838234 + | 12892145 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; +unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 17362763 + pdf | pmc | 2248854 + pdf | arxiv | 835400 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 197 + (9 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8399261 + pdf | doi | fatcat-changelog | 5449349 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 634665 + pdf | pmc | fatcat-ingest | 210453 + pdf | arxiv | fatcat-changelog | 200707 + pdf | pmc | fatcat-changelog | 9582 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 197 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 21 + pdf | arxiv | savepapernow-web | 2 + (17 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 170304 + pdf | oai | 15287 + pdf | unpaywall | 1 + (3 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14144314 | 0.276 + pdf | mag | 35015357 | 24811947 | 0.709 + pdf | unpaywall | 27653003 | 22302629 | 0.807 + pdf | doi | 17363369 | 3533568 | 0.204 + pdf | pmc | 2248860 | 1713197 | 0.762 + pdf | arxiv | 835400 | 685219 | 0.820 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 197 | 138 | 0.701 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 58265365 + pdf | no-pdf-link | 27216435 + pdf | no-capture | 21982611 + pdf | redirect-loop | 8457469 + pdf | terminal-bad-status | 2695023 + pdf | link-loop | 2209672 + pdf | wrong-mimetype | 767508 + pdf | gateway-timeout | 548870 + pdf | cdx-error | 391611 + pdf | skip-url-blocklist | 220661 + pdf | null-body | 182215 + pdf | wayback-error | 146869 + pdf | spn2-cdx-lookup-failure | 107229 + pdf | spn-error | 85128 + pdf | invalid-host-resolution | 37352 + pdf | petabox-error | 32490 + pdf | spn2-error | 29212 + pdf | spn-remote-error | 27927 + pdf | other-mimetype | 2305 + pdf | bad-redirect | 1524 + pdf | spn2-error:job-failed | 1521 + pdf | timeout | 842 + pdf | spn2-error:soft-time-limit-exceeded | 793 + pdf | redirects-exceeded | 748 + pdf | spn2-error:invalid-url-syntax | 417 + (25 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) + FROM ingest_file_result + WHERE hit = false + GROUP BY ingest_type, terminal_status_code + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 34064937 + pdf | | 20514531 + pdf | 301 | 7271700 + pdf | 302 | 720632 + pdf | 503 | 712697 + pdf | 400 | 444209 + pdf | 404 | 331495 + pdf | 403 | 323030 + pdf | 401 | 259327 + pdf | 500 | 236122 + pdf | 303 | 101609 + pdf | 429 | 47738 + pdf | 502 | 36183 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14790 + pdf | 999 | 8996 + pdf | 307 | 3769 + pdf | 308 | 3422 + pdf | 202 | 3228 + pdf | 520 | 2058 + pdf | 410 | 1734 + pdf | 521 | 1033 + pdf | 504 | 868 + pdf | 505 | 424 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 6600758 | 5213294 + (1 row) + diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt new file mode 100644 index 0000000..fca76b9 --- /dev/null +++ b/sql/stats/2021-04-07_stats.txt @@ -0,0 +1,430 @@ + +## SQL Table Sizes + + Size: 551.34G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 50 GB | 100 GB + "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB + "public"."ingest_request" | 39 GB | 45 GB | 83 GB + "public"."grobid" | 70 GB | 8613 MB | 78 GB + "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB + "public"."file_meta" | 35 GB | 31 GB | 66 GB + "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB + "public"."shadow" | 9517 MB | 10 GB | 20 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."html_meta" | 1172 MB | 10 MB | 1182 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (12 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 174200807 | 234313766162033 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 173816433 + application/octet-stream | 155534 + text/html | 115821 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 721 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 + application/x-compress | 272 + application/zip | 131 + application/CDFV2-unknown | 99 + image/png | 88 + application/mac-binhex40 | 79 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + application/vnd.ms-excel | 28 + message/rfc822 | 26 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62271 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 113880640 | 141793694 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 131346703 + warc/revisit | 8394443 + text/xml | 525481 + application/octet-stream | 502400 + text/html | 417579 + unk | 186703 + application/postscript | 81095 + application/save | 80915 + binary/octet-stream | 66698 + application/x-download | 35771 + text/plain | 35606 + image/pdf | 33904 + application/download | 29701 + application/force-download | 16726 + multipart/form-data | 6878 + application/x-msdownload | 3843 + application | 3724 + application/x-octetstream | 3550 + .pdf | 3138 + application/x-pdf | 2780 + application/binary | 1332 + pdf | 1247 + file/unknown | 1200 + application/pdf' | 1192 + file | 1108 + application/unknown | 978 + application/octetstream | 856 + application/blob | 673 + text/pdf | 672 + 0 | 546 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 105594307 | 19594878 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+---------- + 200 | 97714631 + 500 | 7875192 + -4 | 4772 + 503 | 520 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 84822508 + | 12892147 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 31772942 + pdf | doi | 23528817 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2277417 + pdf | arxiv | 2143549 + xml | doaj | 9442 + html | doi | 3022 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 469 + html | spn | 9 + (14 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 31772942 + pdf | doi | fatcat-changelog | 11010764 + pdf | doi | fatcat-ingest | 9002119 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767703 + pdf | arxiv | fatcat-changelog | 375818 + pdf | pmc | fatcat-ingest | 211264 + pdf | pmc | fatcat-changelog | 37328 + xml | doaj | doaj | 9442 + html | doi | fatcat-ingest | 3018 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 469 + pdf | doi | savepapernow-web | 74 + pdf | arxiv | fatcat-ingest-container | 26 + html | spn | savepapernow-web | 9 + html | doi | savepapernow-web | 4 + pdf | arxiv | savepapernow-web | 2 + (23 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 168462 + pdf | oai | 15286 + pdf | doaj | 2068 + html | doaj | 620 + pdf | unpaywall | 13 + (5 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14163500 | 0.277 + pdf | mag | 35015357 | 24818176 | 0.709 + pdf | unpaywall | 31772942 | 25018501 | 0.787 + pdf | doi | 23529041 | 5773728 | 0.245 + pdf | doaj | 4264610 | 2851328 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2277417 | 1736491 | 0.762 + pdf | arxiv | 2143549 | 2011378 | 0.938 + xml | doaj | 9442 | 6897 | 0.730 + html | doi | 3022 | 957 | 0.317 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 469 | 328 | 0.699 + html | spn | 9 | 2 | 0.222 + (14 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+--------------------------------+---------- + pdf | success | 66487928 + pdf | no-pdf-link | 29279677 + pdf | no-capture | 22765431 + pdf | redirect-loop | 9155767 + pdf | terminal-bad-status | 3549665 + pdf | link-loop | 2592983 + html | wrong-scope | 1088793 + pdf | wrong-mimetype | 792563 + pdf | gateway-timeout | 478181 + html | no-capture | 423917 + pdf | wayback-content-error | 355828 + pdf | cdx-error | 343862 + pdf | null-body | 328774 + pdf | forbidden | 286647 + pdf | spn2-cdx-lookup-failure | 276769 + pdf | spn2-wayback-error | 276080 + pdf | skip-url-blocklist | 265473 + html | redirect-loop | 212916 + pdf | not-found | 204367 + html | unknown-scope | 204112 + html | html-resource-no-capture | 166034 + pdf | blocked-cookie | 160336 + pdf | too-many-redirects | 152984 + html | success | 123896 + pdf | wayback-error | 114388 + html | null-body | 100296 + pdf | spn2-error:too-many-redirects | 58336 + html | wayback-content-error | 53926 + pdf | invalid-host-resolution | 37226 + pdf | petabox-error | 37177 + pdf | remote-server-error | 36439 + pdf | spn2-error | 27556 + pdf | spn2-error:proxy-error | 25486 + pdf | read-timeout | 20745 + html | wrong-mimetype | 18928 + html | terminal-bad-status | 14059 + html | petabox-error | 13533 + pdf | bad-redirect | 7535 + xml | success | 6897 + html | cdx-error | 6823 + pdf | spn2-error:bad-request | 4664 + pdf | spn2-error:unauthorized | 4391 + pdf | spn-remote-error | 4206 + pdf | spn2-error:service-unavailable | 2614 + pdf | spn2-error:job-failed | 2562 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1905 + html | spn2-cdx-lookup-failure | 1018 + pdf | redirects-exceeded | 1015 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36515867 + pdf | | 22909334 + pdf | 301 | 7969702 + html | 200 | 1653303 + pdf | 503 | 928507 + pdf | 403 | 823755 + pdf | 302 | 792842 + pdf | 400 | 462108 + html | | 426474 + pdf | 404 | 422163 + pdf | 401 | 270611 + pdf | 500 | 248675 + html | 301 | 211713 + pdf | 303 | 109686 + pdf | 410 | 50648 + pdf | 502 | 37663 + pdf | 429 | 31982 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14835 + html | 404 | 9573 + pdf | 999 | 9296 + pdf | 307 | 3972 + pdf | 308 | 3914 + html | 500 | 3625 + pdf | 202 | 3515 + xml | 200 | 2537 + pdf | 520 | 2072 + pdf | 206 | 1665 + pdf | 521 | 1075 + html | 302 | 1072 + pdf | 504 | 1000 + pdf | 412 | 476 + pdf | 300 | 434 + pdf | 505 | 429 + pdf | 406 | 393 + html | 403 | 382 + html | 503 | 378 + pdf | 421 | 298 + html | 303 | 268 + pdf | 508 | 195 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 408 | 50 + pdf | 204 | 34 + pdf | 416 | 29 + pdf | 501 | 29 + pdf | 530 | 27 + pdf | 507 | 21 + (50 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 8514315 | 6401104 + (1 row) diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt new file mode 100644 index 0000000..a8a9cd5 --- /dev/null +++ b/sql/stats/2021-04-08_table_sizes.txt @@ -0,0 +1,40 @@ + +## SQL Table Sizes + + Size: 467.23G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 26 GB | 76 GB + "public"."grobid" | 69 GB | 6834 MB | 75 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."ingest_request" | 39 GB | 32 GB | 70 GB + "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB + "public"."file_meta" | 32 GB | 21 GB | 53 GB + "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + (12 rows) + diff --git a/sql/stats/2021-04-12_ingest_domain_summary_30d.txt b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt new file mode 100644 index 0000000..6811b54 --- /dev/null +++ b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt @@ -0,0 +1,345 @@ + domain | status | count +---------------------------------------+-------------------------+-------- + academic.oup.com | | 4105 + academic.oup.com | spn2-wayback-error | 1393 + academic.oup.com | link-loop | 1025 + academic.oup.com | no-pdf-link | 1020 + academic.oup.com | spn2-cdx-lookup-failure | 512 + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + acp.copernicus.org | | 620 + acp.copernicus.org | success | 537 + aip.scitation.org | | 1310 + aip.scitation.org | blocked-cookie | 1192 + alustath.uobaghdad.edu.iq | | 697 + alustath.uobaghdad.edu.iq | success | 550 + apex.ipk-gatersleben.de | | 1253 + apex.ipk-gatersleben.de | no-pdf-link | 1132 + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + arxiv.org | | 14990 + arxiv.org | success | 12899 + arxiv.org | spn2-wayback-error | 1592 + ashpublications.org | | 563 + asmedigitalcollection.asme.org | | 3990 + asmedigitalcollection.asme.org | spn2-cdx-lookup-failure | 1570 + asmedigitalcollection.asme.org | no-pdf-link | 1449 + asmedigitalcollection.asme.org | link-loop | 734 + assets.researchsquare.com | | 8217 + assets.researchsquare.com | success | 7116 + assets.researchsquare.com | spn2-wayback-error | 946 + av.tib.eu | | 526 + bioone.org | | 588 + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + boris.unibe.ch | | 1420 + boris.unibe.ch | success | 743 + brill.com | | 1773 + brill.com | link-loop | 879 + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + classiques-garnier.com | | 1072 + classiques-garnier.com | success | 807 + content.iospress.com | | 793 + content.iospress.com | link-loop | 568 + cyberdoi.ru | | 775 + cyberdoi.ru | redirect-loop | 775 + cyberleninka.ru | | 1453 + cyberleninka.ru | success | 1092 + d197for5662m48.cloudfront.net | | 632 + d197for5662m48.cloudfront.net | success | 544 + dergipark.org.tr | | 3070 + dergipark.org.tr | success | 1251 + dergipark.org.tr | no-pdf-link | 843 + dergipark.org.tr | spn2-wayback-error | 677 + digi.ub.uni-heidelberg.de | | 502 + dione.lib.unipi.gr | | 783 + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + dl.acm.org | | 1692 + dl.acm.org | blocked-cookie | 1558 + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + doi.nrct.go.th | | 566 + doi.org | | 10408 + doi.org | spn2-cdx-lookup-failure | 9593 + doi.org | terminal-bad-status | 741 + downloads.hindawi.com | | 2137 + downloads.hindawi.com | success | 1787 + dram.journals.ekb.eg | | 541 + elib.spbstu.ru | | 1243 + elib.spbstu.ru | redirect-loop | 1214 + elibrary.vdi-verlag.de | | 1542 + elibrary.vdi-verlag.de | spn2-wayback-error | 721 + elifesciences.org | | 689 + elifesciences.org | success | 521 + epos.myesr.org | | 705 + epos.myesr.org | spn2-wayback-error | 604 + europepmc.org | | 6996 + europepmc.org | success | 6031 + europepmc.org | spn2-wayback-error | 756 + figshare.com | | 1168 + figshare.com | no-pdf-link | 726 + files.osf.io | | 1526 + files.osf.io | success | 1078 + fjfsdata01prod.blob.core.windows.net | | 5410 + fjfsdata01prod.blob.core.windows.net | success | 4581 + fjfsdata01prod.blob.core.windows.net | spn2-wayback-error | 587 + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + hcommons.org | | 1593 + hcommons.org | success | 1333 + hkvalidate.perfdrive.com | | 1322 + hkvalidate.perfdrive.com | no-pdf-link | 1083 + ieeexplore.ieee.org | | 20997 + ieeexplore.ieee.org | too-many-redirects | 15383 + ieeexplore.ieee.org | spn2-wayback-error | 2555 + ieeexplore.ieee.org | success | 2165 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 747 + jamanetwork.com | | 712 + journals.aps.org | | 1698 + journals.aps.org | not-found | 1469 + journals.library.ualberta.ca | | 733 + journals.library.ualberta.ca | success | 594 + journals.lww.com | | 6606 + journals.lww.com | link-loop | 3102 + journals.lww.com | spn2-wayback-error | 1645 + journals.lww.com | terminal-bad-status | 965 + journals.lww.com | spn2-cdx-lookup-failure | 552 + journals.openedition.org | | 4594 + journals.openedition.org | success | 1441 + journals.openedition.org | redirect-loop | 1316 + journals.openedition.org | spn2-wayback-error | 1197 + journals.ub.uni-heidelberg.de | | 1039 + journals.ub.uni-heidelberg.de | success | 728 + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + library.iated.org | | 1560 + library.iated.org | redirect-loop | 1148 + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + mr.crossref.org | | 542 + nsuworks.nova.edu | | 843 + nsuworks.nova.edu | success | 746 + ojs.cvut.cz | | 805 + ojs.cvut.cz | success | 764 + ojs.ugent.be | | 867 + ojs.ugent.be | success | 643 + onepetro.org | | 603 + onlinelibrary.wiley.com | | 1203 + onlinelibrary.wiley.com | blocked-cookie | 758 + open.library.ubc.ca | | 559 + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + oxford.universitypressscholarship.com | | 3556 + oxford.universitypressscholarship.com | link-loop | 2373 + oxford.universitypressscholarship.com | spn2-wayback-error | 562 + painphysicianjournal.com | | 804 + painphysicianjournal.com | success | 668 + papers.ssrn.com | | 6367 + papers.ssrn.com | link-loop | 3865 + papers.ssrn.com | spn2-wayback-error | 1106 + papers.ssrn.com | spn2-cdx-lookup-failure | 1015 + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + pos.sissa.it | | 1455 + pos.sissa.it | success | 1153 + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + publikationen.uni-tuebingen.de | | 659 + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + pubs.acs.org | | 5860 + pubs.acs.org | blocked-cookie | 5185 + pubs.rsc.org | | 2269 + pubs.rsc.org | link-loop | 1384 + res.mdpi.com | | 15776 + res.mdpi.com | success | 13710 + res.mdpi.com | spn2-wayback-error | 1424 + res.mdpi.com | spn2-cdx-lookup-failure | 641 + rrs.scholasticahq.com | | 1078 + rrs.scholasticahq.com | success | 803 + rsdjournal.org | | 755 + rsdjournal.org | success | 524 + s3-eu-west-1.amazonaws.com | | 3343 + s3-eu-west-1.amazonaws.com | success | 2893 + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + sage.figshare.com | | 725 + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + scholarworks.umass.edu | | 1196 + scholarworks.umass.edu | success | 713 + secure.jbs.elsevierhealth.com | | 4202 + secure.jbs.elsevierhealth.com | blocked-cookie | 4169 + storage.googleapis.com | | 1720 + storage.googleapis.com | success | 1466 + tandf.figshare.com | | 789 + tandf.figshare.com | no-pdf-link | 640 + tind-customer-agecon.s3.amazonaws.com | | 584 + turcomat.org | | 1196 + turcomat.org | spn2-wayback-error | 997 + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + utpjournals.press | | 669 + utpjournals.press | blocked-cookie | 616 + watermark.silverchair.com | | 3560 + watermark.silverchair.com | success | 2788 + watermark.silverchair.com | spn2-wayback-error | 685 + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + www.ahajournals.org | | 802 + www.ahajournals.org | blocked-cookie | 597 + www.ajol.info | | 830 + www.ajol.info | success | 575 + www.ams.org | | 868 + www.ams.org | terminal-bad-status | 666 + www.atlantis-press.com | | 1579 + www.atlantis-press.com | success | 1071 + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + www.brazilianjournals.com | | 1385 + www.brazilianjournals.com | success | 1107 + www.cairn.info | | 2479 + www.cairn.info | no-pdf-link | 818 + www.cairn.info | link-loop | 790 + www.cambridge.org | | 6801 + www.cambridge.org | no-pdf-link | 2990 + www.cambridge.org | spn2-wayback-error | 1475 + www.cambridge.org | link-loop | 940 + www.cambridge.org | success | 863 + www.cureus.com | | 538 + www.dbpia.co.kr | | 2958 + www.dbpia.co.kr | redirect-loop | 2953 + www.degruyter.com | | 58612 + www.degruyter.com | no-pdf-link | 41065 + www.degruyter.com | spn2-wayback-error | 7426 + www.degruyter.com | success | 6628 + www.degruyter.com | spn2-cdx-lookup-failure | 1624 + www.degruyter.com | terminal-bad-status | 1565 + www.dovepress.com | | 869 + www.dovepress.com | success | 597 + www.e-manuscripta.ch | | 1047 + www.e3s-conferences.org | | 817 + www.e3s-conferences.org | success | 606 + www.elgaronline.com | | 535 + www.elibrary.ru | | 1244 + www.elibrary.ru | no-pdf-link | 1159 + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + www.eurekaselect.com | | 540 + www.eurosurveillance.org | | 786 + www.eurosurveillance.org | success | 710 + www.finersistemas.com | | 1220 + www.finersistemas.com | success | 1214 + www.frontiersin.org | | 915 + www.frontiersin.org | spn2-wayback-error | 602 + www.hanspub.org | | 618 + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + www.ijcmas.com | | 513 + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + www.indianjournals.com | | 904 + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + www.journals.uchicago.edu | | 6055 + www.journals.uchicago.edu | blocked-cookie | 5927 + www.journals.vu.lt | | 791 + www.journals.vu.lt | success | 545 + www.jstage.jst.go.jp | | 1490 + www.jstage.jst.go.jp | remote-server-error | 1023 + www.jstor.org | | 1103 + www.jstor.org | redirect-loop | 553 + www.karger.com | | 733 + www.liebertpub.com | | 804 + www.liebertpub.com | blocked-cookie | 714 + www.liverpooluniversitypress.co.uk | | 620 + www.liverpooluniversitypress.co.uk | too-many-redirects | 529 + www.mdpi.com | | 3880 + www.mdpi.com | spn2-wayback-error | 1651 + www.mdpi.com | forbidden | 1282 + www.mdpi.com | spn2-cdx-lookup-failure | 714 + www.nepjol.info | | 596 + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + www.osti.gov | | 1147 + www.osti.gov | link-loop | 902 + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + www.preprints.org | | 783 + www.preprints.org | success | 595 + www.repository.cam.ac.uk | | 1146 + www.research-collection.ethz.ch | | 704 + www.research-collection.ethz.ch | terminal-bad-status | 684 + www.researchsquare.com | | 853 + www.researchsquare.com | spn2-wayback-error | 515 + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + www.scielo.br | | 1777 + www.scielo.br | success | 1167 + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + www.scilook.eu | | 812 + www.scilook.eu | success | 563 + www.scirp.org | | 749 + www.tandfonline.com | | 11038 + www.tandfonline.com | blocked-cookie | 9994 + www.tandfonline.com | no-pdf-link | 663 + www.taylorfrancis.com | | 71514 + www.taylorfrancis.com | spn2-wayback-error | 36663 + www.taylorfrancis.com | no-pdf-link | 15098 + www.taylorfrancis.com | forbidden | 8699 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 6894 + www.taylorfrancis.com | link-loop | 3661 + www.thieme-connect.de | | 3687 + www.thieme-connect.de | redirect-loop | 1187 + www.thieme-connect.de | not-found | 945 + www.thieme-connect.de | no-pdf-link | 941 + www.worldscientific.com | | 1476 + www.worldscientific.com | blocked-cookie | 1323 + www.zora.uzh.ch | | 1118 + zenodo.org | | 43010 + zenodo.org | no-pdf-link | 22015 + zenodo.org | success | 12747 + zenodo.org | spn2-wayback-error | 4608 + zenodo.org | spn2-cdx-lookup-failure | 3215 + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 +(341 rows) + diff --git a/sql/stats/2021-11-01_table_sizes.txt b/sql/stats/2021-11-01_table_sizes.txt new file mode 100644 index 0000000..57f7e57 --- /dev/null +++ b/sql/stats/2021-11-01_table_sizes.txt @@ -0,0 +1,19 @@ + +Size: 832.66G + + table_name | table_size | indexes_size | total_size +-------------------------------+------------+--------------+------------ + "public"."crossref" | 311 GB | 9812 MB | 320 GB + "public"."ingest_request" | 44 GB | 40 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6952 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 54 GB + "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB +(13 rows) diff --git a/sql/stats/2021-11-26_stats.txt b/sql/stats/2021-11-26_stats.txt new file mode 100644 index 0000000..3a0e561 --- /dev/null +++ b/sql/stats/2021-11-26_stats.txt @@ -0,0 +1,424 @@ + +Date: Sat 27 Nov 2021 03:33:30 AM UTC + +## SQL Table Sizes + + Size: 937.28G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 393 GB | 10127 MB | 403 GB + "public"."ingest_request" | 44 GB | 41 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6963 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5869 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1690 MB | 21 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 179761501 | 244453538203113 + + # 179m files, 244 TB + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 179376819 + application/octet-stream | 155379 + text/html | 116102 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 794 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 303 + application/x-compress | 272 + application/zip | 131 + image/png | 121 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 57 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + video/mp4 | 29 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62196 + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 119049962 | 149169240 + + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 137271670 + warc/revisit | 9709493 + application/octet-stream | 590443 + text/xml | 525481 + text/html | 421030 + unk | 207442 + application/postscript | 81123 + application/save | 80988 + binary/octet-stream | 67476 + image/pdf | 39419 + application/x-download | 38278 + text/plain | 36159 + application/download | 34328 + application/force-download | 19729 + multipart/form-data | 9105 + application | 5299 + application/x-msdownload | 3851 + application/x-octetstream | 3649 + .pdf | 3318 + application/x-pdf | 2992 + pdf | 1484 + file | 1364 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1029 + application/unknown | 1000 + 0 | 764 + text/pdf | 704 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 111236904 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 102962304 + 500 | 8269129 + -4 | 5013 + 503 | 548 + +TODO: how many failed, by mimetype? to check if we are (or have) run non-PDF +files through by mistake + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.5.5-fatcat | 89983404 + | 12892161 + 0.7.0-104-gbeebd9a6b | 86739 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 43701948 + pdf | unpaywall | 37802895 + pdf | doi | 28736398 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2383398 + pdf | arxiv | 2330054 + html | doi | 39725 + xml | doaj | 9442 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 689 + html | spn | 48 + xml | spn | 1 + (15 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 43701948 + pdf | unpaywall | unpaywall | 37802895 + pdf | doi | fatcat-changelog | 16207728 + pdf | doi | fatcat-ingest | 9012282 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767705 + pdf | arxiv | fatcat-changelog | 562320 + pdf | pmc | fatcat-ingest | 297527 + pdf | pmc | fatcat-changelog | 57046 + html | doi | fatcat-ingest | 37788 + xml | doaj | doaj | 9442 + pdf | cnki_covid19 | scrape-covid19 | 2034 + html | doi | fatcat-changelog | 1897 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 689 + pdf | doi | savepapernow-web | 613 + html | spn | savepapernow-web | 48 + html | doi | savepapernow-web | 40 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | spn | savepapernow-web | 1 + (25 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 169076 + pdf | oai | 15283 + pdf | doaj | 2063 + html | doaj | 620 + pdf | doi | 22 + pdf | unpaywall | 17 + + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14554221 | 0.284 + pdf | mag | 43701948 | 32643175 | 0.747 + pdf | unpaywall | 37802895 | 29989257 | 0.793 + pdf | doi | 28736547 | 7690393 | 0.268 + pdf | doaj | 4264610 | 2851601 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2383398 | 1821071 | 0.764 + pdf | arxiv | 2330054 | 2159738 | 0.927 + html | doi | 39725 | 1235 | 0.031 + xml | doaj | 9442 | 6897 | 0.730 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 689 | 503 | 0.730 + html | spn | 48 | 5 | 0.104 + xml | spn | 1 | 0 | 0.000 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 78944243 + pdf | no-pdf-link | 26270027 + pdf | no-capture | 23267156 + pdf | redirect-loop | 9837466 + pdf | terminal-bad-status | 4147454 + pdf | skip-url-blocklist | 3088907 + pdf | link-loop | 2953891 + pdf | blocked-cookie | 1855541 + html | wrong-scope | 1106171 + pdf | wrong-mimetype | 859941 + pdf | gateway-timeout | 729771 + pdf | spn2-cdx-lookup-failure | 584856 + html | no-capture | 423917 + pdf | forbidden | 390804 + pdf | cdx-error | 363091 + pdf | wayback-content-error | 354894 + pdf | null-body | 341698 + pdf | too-many-redirects | 307096 + pdf | not-found | 294592 + html | redirect-loop | 213032 + html | unknown-scope | 207923 + pdf | spn2-error | 192046 + html | html-resource-no-capture | 166119 + html | success | 124177 + pdf | wayback-error | 105385 + html | null-body | 100296 + pdf | spn2-wayback-error | 73176 + pdf | remote-server-error | 60908 + pdf | spn2-error:too-many-redirects | 58076 + pdf | skip-wall | 57744 + html | wayback-content-error | 53928 + pdf | read-timeout | 42465 + pdf | invalid-host-resolution | 37221 + pdf | petabox-error | 28765 + pdf | spn2-error:unknown | 23885 + html | wrong-mimetype | 18930 + pdf | bad-redirect | 14708 + html | terminal-bad-status | 14070 + html | petabox-error | 13770 + html | spn2-cdx-lookup-failure | 13002 + pdf | spn2-error:job-failed | 9721 + html | cdx-error | 7167 + xml | success | 6897 + pdf | spn2-error:bad-request | 4433 + pdf | spn-remote-error | 4206 + pdf | body-too-large | 3019 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1900 + pdf | spn2-error:proxy-error | 1850 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36821458 + pdf | | 26058729 + pdf | 301 | 8466302 + html | 200 | 1676730 + pdf | 503 | 1028504 + pdf | 302 | 949465 + pdf | 403 | 936737 + pdf | 404 | 687661 + pdf | 400 | 507303 + html | | 439356 + pdf | 401 | 288994 + pdf | 500 | 263775 + html | 301 | 211796 + pdf | 303 | 130719 + pdf | 410 | 66495 + pdf | 502 | 41760 + pdf | 429 | 35266 + pdf | 420 | 26722 + pdf | 409 | 15204 + pdf | 509 | 15113 + pdf | 999 | 11409 + html | 404 | 9578 + pdf | 307 | 8404 + pdf | 308 | 5514 + pdf | 202 | 4724 + html | 500 | 3628 + xml | 200 | 2537 + pdf | 520 | 2199 + pdf | 206 | 1694 + html | 302 | 1138 + pdf | 504 | 1124 + pdf | 521 | 1085 + pdf | 412 | 921 + pdf | 421 | 714 + pdf | 300 | 461 + pdf | 505 | 436 + pdf | 406 | 427 + pdf | 508 | 408 + html | 403 | 382 + html | 503 | 378 + html | 303 | 268 + pdf | 204 | 252 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 523 | 55 + pdf | 408 | 53 + pdf | 432 | 45 + pdf | 530 | 31 + pdf | 416 | 31 + (50 rows) diff --git a/sql/stats/2021-12-02_table_sizes.txt b/sql/stats/2021-12-02_table_sizes.txt new file mode 100644 index 0000000..b03c370 --- /dev/null +++ b/sql/stats/2021-12-02_table_sizes.txt @@ -0,0 +1,22 @@ + +Size: 940.66G + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 394 GB | 10138 MB | 404 GB + "public"."ingest_request" | 44 GB | 41 GB | 85 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6978 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1752 MB | 21 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes +(16 rows) diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt new file mode 100644 index 0000000..bd20c5c --- /dev/null +++ b/sql/stats/2022-04-26_stats.txt @@ -0,0 +1,432 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 416 GB | 10 GB | 426 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 58 GB | 41 GB | 99 GB + "public"."ingest_request" | 50 GB | 48 GB | 98 GB + "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 37 GB | 34 GB | 71 GB + "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB + "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 3015 MB | 31 MB | 3046 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 192402128 | 271919997557597 + (1 row) + + # 271,919,997,557,597 -> ~272 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 191760695 + text/html | 330351 + application/octet-stream | 186696 + application/xml | 42170 + application/xhtml+xml | 31470 + text/plain | 16449 + application/jats+xml | 6902 + application/gzip | 6681 + | 6033 + application/postscript | 4916 + image/jpeg | 2901 + application/vnd.ms-powerpoint | 1672 + application/msword | 934 + application/x-bzip2 | 891 + image/png | 476 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294 + application/x-compress | 274 + video/mp4 | 150 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12831 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 130732381 | 162760251 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 149749828 + warc/revisit | 10437210 + application/octet-stream | 733161 + text/html | 642992 + text/xml | 525483 + unk | 217642 + application/postscript | 81127 + application/save | 81023 + binary/octet-stream | 67938 + application/x-download | 41137 + image/pdf | 39712 + application/download | 37153 + text/plain | 36342 + application/force-download | 21496 + multipart/form-data | 9792 + application | 5366 + application/x-octetstream | 5166 + application/x-msdownload | 3851 + .pdf | 3445 + application/x-pdf | 3018 + pdf | 1618 + file | 1370 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1047 + application/unknown | 1001 + 0 | 773 + text/pdf | 729 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 123669603 + (1 row) + + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 115668412 + 500 | 7995428 + -4 | 5745 + 503 | 18 + (4 rows) + + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 54780825 + 0.5.5-fatcat | 48003940 + | 12694404 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | mag | 43701948 + pdf | doi | 40044585 + pdf | doaj | 6016771 + html | doaj | 3648181 + pdf | arxiv | 2676200 + pdf | pmc | 2402453 + html | doi | 41492 + xml | doaj | 20638 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 829 + html | spn | 52 + xml | doi | 1 + xml | spn | 1 + (16 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 20936949 + pdf | doi | fatcat-ingest | 15590201 + pdf | doaj | doaj | 6016771 + html | doaj | doaj | 3648181 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 691405 + pdf | pmc | fatcat-ingest | 297646 + pdf | pmc | fatcat-changelog | 75982 + html | doi | fatcat-ingest | 37904 + xml | doaj | doaj | 20638 + html | doi | fatcat-changelog | 3534 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1562 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 829 + html | doi | savepapernow-web | 54 + html | spn | savepapernow-web | 52 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | doi | savepapernow-web | 1 + xml | spn | savepapernow-web | 1 + (26 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doaj | 1619621 + html | doaj | 1208412 + pdf | mag | 167653 + pdf | oai | 15282 + xml | doaj | 11196 + pdf | unpaywall | 270 + pdf | doi | 22 + (7 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 15968290 | 0.312 + pdf | unpaywall | 43932525 | 32618045 | 0.742 + pdf | mag | 43701948 | 32662926 | 0.747 + pdf | doi | 40044738 | 10925369 | 0.273 + pdf | doaj | 6016771 | 3042569 | 0.506 + html | doaj | 3648181 | 344208 | 0.094 + pdf | arxiv | 2676206 | 2269708 | 0.848 + pdf | pmc | 2402453 | 1855679 | 0.772 + html | doi | 41492 | 1739 | 0.042 + xml | doaj | 20638 | 6899 | 0.334 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 829 | 616 | 0.743 + html | spn | 52 | 7 | 0.135 + xml | doi | 1 | 0 | 0.000 + xml | spn | 1 | 0 | 0.000 + (16 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+---------------------------------+---------- + pdf | success | 85709322 + pdf | no-pdf-link | 29713304 + pdf | no-capture | 26632191 + pdf | redirect-loop | 10979145 + pdf | terminal-bad-status | 4977000 + pdf | link-loop | 3434877 + pdf | skip-url-blocklist | 3114258 + pdf | blocked-cookie | 2156835 + html | wrong-scope | 1126911 + pdf | wrong-mimetype | 980546 + pdf | gateway-timeout | 651562 + pdf | spn2-cdx-lookup-failure | 484016 + pdf | spn2-backoff | 399382 + pdf | cdx-error | 373964 + pdf | wayback-content-error | 354370 + html | success | 345860 + pdf | null-body | 336182 + pdf | spn2-error:500 | 309755 + pdf | forbidden | 291175 + pdf | not-found | 275560 + pdf | too-many-redirects | 262312 + html | unknown-scope | 230352 + html | redirect-loop | 226596 + html | html-resource-no-capture | 205646 + html | no-capture | 164014 + component | spn2-cdx-lookup-failure | 148825 + component | wrong-mimetype | 130344 + html | null-body | 100296 + pdf | wayback-error | 94286 + pdf | spn2-wayback-error | 81365 + component | no-capture | 75278 + pdf | spn2-error | 69830 + pdf | skip-wall | 57744 + pdf | spn2-error:too-many-redirects | 53808 + pdf | remote-server-error | 41286 + pdf | petabox-error | 38800 + pdf | invalid-host-resolution | 37337 + pdf | read-timeout | 36872 + component | spn2-backoff | 33217 + pdf | empty-blob | 27946 + component | spn2-error | 24078 + pdf | spn2-error:unknown | 23697 + component | gateway-timeout | 23139 + html | wrong-mimetype | 22731 + html | wayback-content-error | 20507 + pdf | spn2-error:host-crawling-paused | 19900 + pdf | bad-redirect | 19183 + html | terminal-bad-status | 13354 + component | blocked-cookie | 12287 + component | spn2-error:500 | 11271 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 38144779 + pdf | | 32762240 + pdf | 301 | 9433087 + html | 200 | 1716127 + pdf | 403 | 1416632 + pdf | 302 | 1134668 + pdf | 404 | 888853 + pdf | 401 | 746311 + pdf | 503 | 655894 + pdf | 400 | 531479 + component | | 337603 + pdf | 500 | 247944 + html | 301 | 224237 + html | | 167194 + pdf | 303 | 135048 + component | 200 | 130663 + pdf | 429 | 93489 + pdf | 410 | 67392 + pdf | 420 | 26722 + pdf | 502 | 18770 + pdf | 409 | 15152 + pdf | 509 | 15113 + pdf | 999 | 11747 + html | 404 | 9879 + pdf | 307 | 8895 + pdf | 412 | 7053 + pdf | 308 | 6627 + pdf | 202 | 5289 + xml | 200 | 2540 + html | 500 | 2480 + pdf | 520 | 2220 + pdf | 521 | 1844 + pdf | 206 | 1739 + html | 302 | 1407 + pdf | 504 | 1146 + html | 303 | 1123 + pdf | 421 | 986 + pdf | 406 | 938 + pdf | 204 | 498 + pdf | 505 | 468 + pdf | 300 | 436 + pdf | 508 | 422 + pdf | 426 | 405 + html | 429 | 402 + html | 403 | 398 + pdf | 432 | 366 + component | 301 | 294 + pdf | 405 | 210 + pdf | 226 | 166 + component | 302 | 128 + (50 rows) + diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt new file mode 100644 index 0000000..864abd4 --- /dev/null +++ b/sql/stats/2022-04-27_crawl_changelog.txt @@ -0,0 +1,191 @@ + domain | status | count +--------------------------------------+-------------------------+-------- + academic.oup.com | | 1243 + academic.oup.com | spn2-cdx-lookup-failure | 990 + aip.scitation.org | | 313 + aip.scitation.org | spn2-cdx-lookup-failure | 224 + ajps.uomustansiriyah.edu.iq | | 235 + apps.crossref.org | | 1329 + apps.crossref.org | spn2-cdx-lookup-failure | 942 + apps.crossref.org | no-pdf-link | 387 + archaeologydataservice.ac.uk | | 422 + archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289 + arxiv.org | | 3512 + arxiv.org | spn2-cdx-lookup-failure | 2319 + arxiv.org | success | 1177 + assets.researchsquare.com | | 571 + assets.researchsquare.com | spn2-cdx-lookup-failure | 322 + assets.researchsquare.com | success | 249 + brill.com | | 397 + brill.com | spn2-cdx-lookup-failure | 265 + cla.berkeley.edu | | 239 + classiques-garnier.com | | 249 + cyberleninka.ru | | 340 + cyberleninka.ru | spn2-cdx-lookup-failure | 244 + dergipark.org.tr | | 468 + dergipark.org.tr | spn2-cdx-lookup-failure | 333 + dl.acm.org | | 592 + dl.acm.org | spn2-cdx-lookup-failure | 470 + doi.ala.org.au | | 288 + doi.ala.org.au | spn2-cdx-lookup-failure | 220 + doi.org | | 1107 + doi.org | terminal-bad-status | 679 + doi.org | spn2-cdx-lookup-failure | 415 + downloads.hindawi.com | | 279 + downloads.hindawi.com | success | 267 + edbs.uomustansiriyah.edu.iq | | 294 + edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209 + elibrary.kdpu.edu.ua | | 320 + elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233 + elibrary.ru | | 722 + elibrary.ru | spn2-cdx-lookup-failure | 505 + europepmc.org | | 986 + europepmc.org | spn2-cdx-lookup-failure | 681 + europepmc.org | success | 291 + figshare.com | | 377 + figshare.com | spn2-cdx-lookup-failure | 328 + fjfsdata01prod.blob.core.windows.net | | 255 + fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216 + hammer.purdue.edu | | 224 + ieeexplore.ieee.org | | 3904 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654 + ieeexplore.ieee.org | gateway-timeout | 792 + ieeexplore.ieee.org | spn2-backoff | 419 + journals.eco-vector.com | | 428 + journals.eco-vector.com | spn2-cdx-lookup-failure | 306 + journals.lww.com | | 727 + journals.lww.com | spn2-cdx-lookup-failure | 622 + journals.openedition.org | | 806 + journals.openedition.org | spn2-cdx-lookup-failure | 554 + journals.plos.org | | 348 + journals.plos.org | spn2-cdx-lookup-failure | 244 + kiss.kstudy.com | | 226 + kluwerlawonline.com | | 723 + kluwerlawonline.com | spn2-cdx-lookup-failure | 489 + kluwerlawonline.com | link-loop | 203 + linkinghub.elsevier.com | | 401 + linkinghub.elsevier.com | spn2-backoff | 342 + mdpi-res.com | | 1463 + mdpi-res.com | success | 1337 + muse.jhu.edu | | 346 + muse.jhu.edu | spn2-cdx-lookup-failure | 253 + onepetro.org | | 363 + onepetro.org | spn2-cdx-lookup-failure | 284 + online.ucpress.edu | | 1620 + online.ucpress.edu | spn2-cdx-lookup-failure | 1511 + onlinelibrary.wiley.com | | 2913 + onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109 + onlinelibrary.wiley.com | terminal-bad-status | 787 + opendata.uni-halle.de | | 519 + opendata.uni-halle.de | spn2-cdx-lookup-failure | 343 + osf.io | | 1554 + osf.io | spn2-cdx-lookup-failure | 1350 + papers.ssrn.com | | 2207 + papers.ssrn.com | spn2-cdx-lookup-failure | 1727 + papers.ssrn.com | link-loop | 457 + psycharchives.org | | 384 + psycharchives.org | spn2-cdx-lookup-failure | 283 + publons.com | | 493 + publons.com | spn2-cdx-lookup-failure | 348 + pubs.acs.org | | 1240 + pubs.acs.org | spn2-cdx-lookup-failure | 881 + pubs.acs.org | terminal-bad-status | 298 + pubs.rsc.org | | 603 + pubs.rsc.org | spn2-cdx-lookup-failure | 460 + repositories.lib.utexas.edu | | 1861 + repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288 + repositories.lib.utexas.edu | terminal-bad-status | 523 + s3-eu-west-1.amazonaws.com | | 216 + sage.figshare.com | | 374 + sage.figshare.com | spn2-cdx-lookup-failure | 309 + scholar.dkyobobook.co.kr | | 220 + scholarworks.gsu.edu | | 749 + scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577 + tandf.figshare.com | | 214 + www.atlantis-press.com | | 338 + www.atlantis-press.com | spn2-cdx-lookup-failure | 214 + www.cairn.info | | 782 + www.cairn.info | spn2-cdx-lookup-failure | 541 + www.cambridge.org | | 2325 + www.cambridge.org | spn2-cdx-lookup-failure | 1787 + www.cambridge.org | no-pdf-link | 300 + www.cell.com | | 213 + www.concrete.org | | 476 + www.concrete.org | spn2-cdx-lookup-failure | 340 + www.dbpia.co.kr | | 375 + www.dbpia.co.kr | spn2-cdx-lookup-failure | 275 + www.degruyter.com | | 3849 + www.degruyter.com | spn2-cdx-lookup-failure | 2969 + www.degruyter.com | no-pdf-link | 712 + www.dib.ie | | 1100 + www.dib.ie | spn2-cdx-lookup-failure | 1038 + www.e-periodica.ch | | 821 + www.e-periodica.ch | spn2-cdx-lookup-failure | 620 + www.e-periodica.ch | no-pdf-link | 201 + www.elibrary.ru | | 401 + www.elibrary.ru | spn2-cdx-lookup-failure | 281 + www.emerald.com | | 390 + www.emerald.com | spn2-cdx-lookup-failure | 275 + www.eurekaselect.com | | 275 + www.frontiersin.org | | 1266 + www.frontiersin.org | spn2-cdx-lookup-failure | 1025 + www.hanspub.org | | 229 + www.hindawi.com | | 604 + www.hindawi.com | spn2-cdx-lookup-failure | 594 + www.inderscience.com | | 201 + www.jstage.jst.go.jp | | 1094 + www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807 + www.jstage.jst.go.jp | success | 206 + www.mdpi.com | | 4340 + www.mdpi.com | spn2-cdx-lookup-failure | 4258 + www.nomos-elibrary.de | | 2749 + www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909 + www.nomos-elibrary.de | redirect-loop | 819 + www.osti.gov | | 275 + www.oxfordhandbooks.com | | 248 + www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224 + www.pdcnet.org | | 217 + www.researchsquare.com | | 483 + www.researchsquare.com | spn2-cdx-lookup-failure | 317 + www.scielo.br | | 319 + www.scielo.br | spn2-cdx-lookup-failure | 222 + www.sciencedirect.com | | 3384 + www.sciencedirect.com | spn2-cdx-lookup-failure | 3267 + www.spiedigitallibrary.org | | 441 + www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327 + www.tandfonline.com | | 2401 + www.tandfonline.com | spn2-cdx-lookup-failure | 1552 + www.tandfonline.com | no-pdf-link | 303 + www.tandfonline.com | blocked-cookie | 250 + www.taylorfrancis.com | | 1232 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 908 + www.thieme-connect.de | | 520 + www.thieme-connect.de | spn2-cdx-lookup-failure | 366 + www.worldscientific.com | | 383 + www.worldscientific.com | spn2-cdx-lookup-failure | 276 + zenodo.org | | 10625 + zenodo.org | spn2-cdx-lookup-failure | 7777 + zenodo.org | success | 1574 + zenodo.org | no-pdf-link | 1160 + zivahub.uct.ac.za | | 3428 + zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845 + zivahub.uct.ac.za | no-pdf-link | 583 + | | 130491 + | spn2-cdx-lookup-failure | 95169 + | success | 13354 + | no-pdf-link | 9621 + | terminal-bad-status | 3385 + | spn2-backoff | 2396 + | redirect-loop | 2216 + | link-loop | 1850 + | gateway-timeout | 1061 + | spn2-error:blocked-url | 428 + | blocked-cookie | 415 + | spn2-error | 246 +(182 rows) + +---- + +The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a +week or two, when crawling and retries are running smoothly, and see what +things look like then. diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt new file mode 100644 index 0000000..8d98217 --- /dev/null +++ b/sql/stats/2022-05-11_crawl_changelog.txt @@ -0,0 +1,410 @@ + domain | status | count +-----------------------------------------------------------------+-------------------------+-------- + academic.oup.com | | 2210 + academic.oup.com | no-pdf-link | 1350 + academic.oup.com | bad-redirect | 510 + academiccommons.columbia.edu | | 379 + academiccommons.columbia.edu | success | 339 + aip.scitation.org | | 762 + aip.scitation.org | terminal-bad-status | 430 + apps.crossref.org | | 9894 + apps.crossref.org | no-pdf-link | 9886 + apps.euskadi.eus | | 242 + apps.euskadi.eus | no-pdf-link | 240 + arxiv.org | | 44889 + arxiv.org | success | 28781 + arxiv.org | spn2-backoff | 7975 + arxiv.org | terminal-bad-status | 4508 + arxiv.org | spn2-cdx-lookup-failure | 2010 + arxiv.org | redirect-loop | 619 + arxiv.org | no-pdf-link | 242 + arxiv.org | spn2-error | 236 + asa.scitation.org | | 356 + asa.scitation.org | terminal-bad-status | 299 + asmedigitalcollection.asme.org | | 240 + assets.cureus.com | | 336 + assets.cureus.com | success | 335 + assets.researchsquare.com | | 1042 + assets.researchsquare.com | success | 993 + av.tib.eu | | 205 + av.tib.eu | no-pdf-link | 203 + bibliographie.uni-tuebingen.de | | 213 + bibliographie.uni-tuebingen.de | no-pdf-link | 211 + biorxiv.org | redirect-loop | 217 + biorxiv.org | | 217 + books.openedition.org | | 691 + books.openedition.org | no-pdf-link | 687 + boris.unibe.ch | | 525 + boris.unibe.ch | success | 466 + bridges.monash.edu | | 663 + bridges.monash.edu | no-pdf-link | 647 + brill.com | | 860 + brill.com | success | 434 + chemrxiv.org | | 201 + classiques-garnier.com | | 242 + content.iospress.com | | 325 + content.iospress.com | link-loop | 247 + core.tdar.org | | 216 + core.tdar.org | no-pdf-link | 211 + cyberleninka.ru | | 646 + cyberleninka.ru | success | 620 + d197for5662m48.cloudfront.net | | 263 + d197for5662m48.cloudfront.net | success | 262 + dergipark.org.tr | | 891 + dergipark.org.tr | success | 526 + dergipark.org.tr | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 427 + digi.ub.uni-heidelberg.de | no-pdf-link | 427 + direct.mit.edu | | 268 + direct.mit.edu | no-pdf-link | 208 + dl.acm.org | | 1719 + dl.acm.org | success | 829 + dl.acm.org | no-pdf-link | 546 + dl.acm.org | terminal-bad-status | 205 + dlc.library.columbia.edu | | 385 + dlc.library.columbia.edu | terminal-bad-status | 319 + doi.ala.org.au | | 724 + doi.ala.org.au | no-pdf-link | 721 + doi.apa.org | | 214 + doi.org | | 3390 + doi.org | terminal-bad-status | 2938 + doi.org | redirect-loop | 233 + doi.org | spn2-wayback-error | 208 + doi.usp.org | | 325 + doi.usp.org | no-pdf-link | 324 + downloads.hindawi.com | | 1439 + downloads.hindawi.com | success | 1436 + du.diva-portal.org | | 589 + du.diva-portal.org | success | 586 + econtents.bc.unicamp.br | | 310 + econtents.bc.unicamp.br | success | 310 + ediss.uni-goettingen.de | | 728 + ediss.uni-goettingen.de | success | 425 + elibrary.kdpu.edu.ua | | 907 + elibrary.kdpu.edu.ua | bad-redirect | 712 + elibrary.ru | | 925 + elibrary.ru | terminal-bad-status | 492 + elibrary.ru | bad-redirect | 230 + elibrary.vdi-verlag.de | | 393 + elifesciences.org | | 296 + elifesciences.org | success | 276 + europepmc.org | | 3024 + europepmc.org | success | 2541 + europepmc.org | terminal-bad-status | 463 + figshare.com | | 493 + figshare.com | no-pdf-link | 440 + files.osf.io | | 883 + files.osf.io | success | 686 + fjfsdata01prod.blob.core.windows.net | | 3869 + fjfsdata01prod.blob.core.windows.net | success | 3818 + ieeexplore.ieee.org | | 10854 + ieeexplore.ieee.org | gateway-timeout | 5495 + ieeexplore.ieee.org | spn2-backoff | 1662 + ieeexplore.ieee.org | no-pdf-link | 1417 + ieeexplore.ieee.org | success | 1410 + ieeexplore.ieee.org | redirect-loop | 768 + iiif.crossasia.org | | 7608 + iiif.crossasia.org | no-pdf-link | 7568 + ikee.lib.auth.gr | | 450 + ikee.lib.auth.gr | success | 332 + ins.journals.ekb.eg | | 212 + iopscience.iop.org | | 268 + jamanetwork.com | | 333 + journals.aps.org | | 414 + journals.asm.org | | 242 + journals.flvc.org | | 245 + journals.flvc.org | success | 242 + journals.healio.com | | 755 + journals.healio.com | terminal-bad-status | 668 + journals.lincoln.ac.nz | | 244 + journals.lincoln.ac.nz | success | 239 + journals.lww.com | | 1772 + journals.lww.com | link-loop | 1425 + journals.lww.com | spn2-backoff | 209 + journals.openedition.org | | 1192 + journals.openedition.org | redirect-loop | 467 + journals.openedition.org | success | 451 + journals.plos.org | | 771 + journals.plos.org | success | 750 + journals.ub.uni-heidelberg.de | | 787 + journals.ub.uni-heidelberg.de | success | 741 + kazanmedjournal.ru | | 240 + kazanmedjournal.ru | success | 231 + kiss.kstudy.com | | 219 + kiss.kstudy.com | no-pdf-link | 218 + kluwerlawonline.com | | 444 + kluwerlawonline.com | link-loop | 402 + libraetd.lib.virginia.edu | | 362 + libraetd.lib.virginia.edu | no-pdf-link | 361 + link.springer.com | | 305 + linkinghub.elsevier.com | | 568 + linkinghub.elsevier.com | spn2-backoff | 545 + ltu-figshare-repo.s3.aarnet.edu.au | | 269 + ltu-figshare-repo.s3.aarnet.edu.au | success | 268 + mausamjournal.imd.gov.in | | 202 + mdpi-res.com | | 8892 + mdpi-res.com | success | 8863 + mededpublish.org | | 1900 + mededpublish.org | no-pdf-link | 1900 + meetingorganizer.copernicus.org | | 276 + meetingorganizer.copernicus.org | no-pdf-link | 271 + muse.jhu.edu | | 1047 + muse.jhu.edu | terminal-bad-status | 755 + muse.jhu.edu | link-loop | 203 + online.ucpress.edu | | 358 + online.ucpress.edu | link-loop | 212 + onlinelibrary.wiley.com | | 5813 + onlinelibrary.wiley.com | terminal-bad-status | 4587 + onlinelibrary.wiley.com | spn2-wayback-error | 614 + onlinelibrary.wiley.com | blocked-cookie | 381 + open.library.ubc.ca | | 206 + opendata.uni-halle.de | | 1768 + opendata.uni-halle.de | success | 1215 + opendata.uni-halle.de | wrong-mimetype | 260 + opendata2.uni-halle.de | | 206 + opg.optica.org | | 205 + osf.io | | 2949 + osf.io | no-pdf-link | 2404 + osf.io | spn2-backoff | 299 + papers.ssrn.com | | 3962 + papers.ssrn.com | link-loop | 3800 + peerj.com | | 273 + preprints.jmir.org | | 275 + preprints.jmir.org | cdx-error | 255 + publikationen.bibliothek.kit.edu | | 213 + publons.com | | 593 + publons.com | no-pdf-link | 590 + pubs.acs.org | | 2288 + pubs.acs.org | terminal-bad-status | 1841 + pubs.acs.org | spn2-wayback-error | 210 + pubs.rsc.org | | 1698 + pubs.rsc.org | bad-redirect | 811 + pubs.rsc.org | link-loop | 352 + pubs.rsc.org | success | 307 + radiopaedia.org | | 220 + read.dukeupress.edu | | 303 + repositories.lib.utexas.edu | | 1570 + repositories.lib.utexas.edu | bad-redirect | 513 + repositories.lib.utexas.edu | spn2-backoff | 383 + repositories.lib.utexas.edu | gateway-timeout | 379 + repositories.lib.utexas.edu | terminal-bad-status | 282 + repository.uj.ac.za | | 489 + repository.uj.ac.za | no-pdf-link | 365 + repository.unsworks.unsw.edu.au | | 397 + repository.urosario.edu.co | | 2429 + repository.urosario.edu.co | success | 1648 + repository.urosario.edu.co | bad-redirect | 613 + rex.libraries.wsu.edu | no-pdf-link | 241 + rex.libraries.wsu.edu | | 241 + rsdjournal.org | | 208 + rsdjournal.org | success | 208 + s3-ap-southeast-2.amazonaws.com | | 282 + s3-ap-southeast-2.amazonaws.com | success | 277 + s3-eu-west-1.amazonaws.com | | 4615 + s3-eu-west-1.amazonaws.com | success | 4593 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237 + sage.figshare.com | | 415 + sage.figshare.com | no-pdf-link | 385 + scholar.dkyobobook.co.kr | | 512 + scholar.dkyobobook.co.kr | no-pdf-link | 509 + scholarlypublishingcollective.org | | 287 + scholarworks.gsu.edu | | 1132 + scholarworks.gsu.edu | success | 1000 + scholarworks.iupui.edu | | 205 + scholarworks.umass.edu | | 417 + scholarworks.umass.edu | success | 400 + sciencescholar.us | | 404 + secure.jbs.elsevierhealth.com | | 727 + secure.jbs.elsevierhealth.com | terminal-bad-status | 722 + tandf.figshare.com | | 354 + tandf.figshare.com | no-pdf-link | 342 + unsworks.unsw.edu.au | | 408 + unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342 + valep.vc.univie.ac.at | no-pdf-link | 737 + valep.vc.univie.ac.at | | 737 + watermark.silverchair.com | | 1604 + watermark.silverchair.com | success | 1598 + wayf.switch.ch | | 215 + wayf.switch.ch | no-pdf-link | 213 + www.ahajournals.org | | 438 + www.ahajournals.org | no-pdf-link | 306 + www.ahbps.org | | 316 + www.ahbps.org | success | 312 + www.atenaeditora.com.br | | 390 + www.atenaeditora.com.br | terminal-bad-status | 333 + www.atlantis-press.com | | 914 + www.atlantis-press.com | success | 901 + www.atsjournals.org | | 1245 + www.atsjournals.org | success | 1189 + www.biorxiv.org | | 712 + www.biorxiv.org | success | 670 + www.bloomsburycollections.com | | 982 + www.bloomsburycollections.com | terminal-bad-status | 566 + www.cahiers-clsl.ch | | 305 + www.cahiers-clsl.ch | success | 298 + www.cairn.info | | 1799 + www.cairn.info | no-pdf-link | 662 + www.cairn.info | link-loop | 487 + www.cairn.info | success | 355 + www.cairn.info | terminal-bad-status | 267 + www.cambridge.org | | 3258 + www.cambridge.org | no-pdf-link | 1682 + www.cambridge.org | success | 682 + www.cambridge.org | bad-redirect | 404 + www.cambridge.org | link-loop | 302 + www.dbpia.co.kr | | 763 + www.dbpia.co.kr | no-pdf-link | 443 + www.dbpia.co.kr | redirect-loop | 287 + www.degruyter.com | | 12655 + www.degruyter.com | no-pdf-link | 9112 + www.degruyter.com | success | 2898 + www.degruyter.com | spn2-backoff | 507 + www.dib.ie | | 1381 + www.dib.ie | no-pdf-link | 1378 + www.dovepress.com | | 231 + www.dovepress.com | success | 216 + www.e-manuscripta.ch | | 767 + www.e-manuscripta.ch | success | 399 + www.e-periodica.ch | | 1406 + www.e-periodica.ch | no-pdf-link | 1402 + www.e-rara.ch | no-pdf-link | 251 + www.e-rara.ch | | 251 + www.editoracientifica.org | no-pdf-link | 205 + www.editoracientifica.org | | 205 + www.elgaronline.com | | 427 + www.elibrary.ru | | 616 + www.elibrary.ru | terminal-bad-status | 364 + www.elibrary.ru | no-pdf-link | 216 + www.emerald.com | | 862 + www.emerald.com | no-pdf-link | 724 + www.endocrine-abstracts.org | | 1907 + www.endocrine-abstracts.org | no-pdf-link | 1905 + www.eurekaselect.com | | 285 + www.eurekaselect.com | link-loop | 246 + www.even3.com.br | | 233 + www.frontiersin.org | | 585 + www.frontiersin.org | spn2-backoff | 436 + www.humankineticslibrary.com | no-pdf-link | 207 + www.humankineticslibrary.com | | 207 + www.igi-global.com | | 1600 + www.igi-global.com | no-pdf-link | 1199 + www.igi-global.com | bad-redirect | 258 + www.inderscience.com | | 385 + www.inderscience.com | no-pdf-link | 365 + www.inderscienceonline.com | | 202 + www.ingentaconnect.com | | 450 + www.ingentaconnect.com | no-pdf-link | 260 + www.jstage.jst.go.jp | | 1248 + www.jstage.jst.go.jp | success | 870 + www.karger.com | | 313 + www.liebertpub.com | | 271 + www.liebertpub.com | no-pdf-link | 241 + www.nicecjournal.co.uk | | 274 + www.nicecjournal.co.uk | success | 274 + www.nomos-elibrary.de | | 1771 + www.nomos-elibrary.de | no-pdf-link | 788 + www.nomos-elibrary.de | redirect-loop | 506 + www.nomos-elibrary.de | bad-redirect | 207 + www.osti.gov | | 381 + www.osti.gov | link-loop | 326 + www.persee.fr | | 277 + www.preprints.org | | 225 + www.preprints.org | success | 225 + www.protocols.io | | 770 + www.protocols.io | success | 485 + www.repository.cam.ac.uk | | 510 + www.repository.cam.ac.uk | bad-redirect | 213 + www.research-collection.ethz.ch | | 416 + www.research-collection.ethz.ch | bad-redirect | 249 + www.researchsquare.com | | 1121 + www.researchsquare.com | bad-redirect | 985 + www.scielo.br | | 828 + www.scielo.br | success | 641 + www.sciencedirect.com | | 8567 + www.sciencedirect.com | terminal-bad-status | 5773 + www.sciencedirect.com | spn2-wayback-error | 1590 + www.sciencedirect.com | no-pdf-link | 576 + www.sciencedirect.com | spn2-backoff | 479 + www.sciendo.com | | 257 + www.sciendo.com | success | 222 + www.scitepress.org | | 381 + www.scitepress.org | no-pdf-link | 377 + www.spiedigitallibrary.org | | 1061 + www.spiedigitallibrary.org | bad-redirect | 571 + www.spiedigitallibrary.org | gateway-timeout | 233 + www.tandfonline.com | | 4934 + www.tandfonline.com | no-pdf-link | 2088 + www.tandfonline.com | terminal-bad-status | 1282 + www.tandfonline.com | blocked-cookie | 757 + www.tandfonline.com | redirect-loop | 488 + www.tandfonline.com | spn2-wayback-error | 202 + www.taylorfrancis.com | | 3979 + www.taylorfrancis.com | link-loop | 1928 + www.taylorfrancis.com | no-pdf-link | 1840 + www.techniques-ingenieur.fr | | 354 + www.techniques-ingenieur.fr | no-pdf-link | 353 + www.thieme-connect.de | | 1987 + www.thieme-connect.de | no-pdf-link | 949 + www.thieme-connect.de | link-loop | 869 + www.tib.eu | no-pdf-link | 315 + www.tib.eu | | 315 + www.un-ilibrary.org | no-pdf-link | 352 + www.un-ilibrary.org | | 352 + www.worldscientific.com | | 668 + www.worldscientific.com | no-pdf-link | 629 + www.zora.uzh.ch | | 318 + zenodo.org | | 46585 + zenodo.org | no-pdf-link | 29519 + zenodo.org | success | 14768 + zenodo.org | terminal-bad-status | 810 + zenodo.org | wrong-mimetype | 691 + zenodo.org | spn2-cdx-lookup-failure | 395 + zenodo.org | spn2-backoff | 294 + zivahub.uct.ac.za | | 1909 + zivahub.uct.ac.za | no-pdf-link | 1880 + zop.zb.uzh.ch | | 228 + zop.zb.uzh.ch | success | 217 + | | 365582 + | success | 141497 38.7% + | no-pdf-link | 120852 33.0% + | terminal-bad-status | 31900 8.7% + | spn2-backoff | 16979 4.6% + | link-loop | 13624 3.7% + | bad-redirect | 8736 + | redirect-loop | 7405 + | gateway-timeout | 6997 + | spn2-cdx-lookup-failure | 5146 + | spn2-wayback-error | 3708 + | wrong-mimetype | 2158 + | blocked-cookie | 1942 + | spn2-error:blocked-url | 1733 + | wayback-error | 1063 + | spn2-error | 647 + | spn2-error:500 | 265 + | cdx-error | 257 +(383 rows) + +---- + +365k in 7 days is about 52k a day, which is about expected. Around 5-7% need +retries. + +important changes: +- biorxiv.org: needs fix and then retries +- academic.oup.com: should probably skip +- apps.crossref.org: need to handle this in code +- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/` +- doi.org: investigate redirect-loop and terminal-bad-status +- osf.io: not getting PDFs +- papers.ssrn.com: why are these attempted? +- publons.com: not getting PDFs; special case these? +- www.sciencedirect.com: not working at all? + +smaller: +- bridges.monash.edu: fix, then retry? +- dl.acm.org: some broader retries? +- figshare.com: still some attempts, but almost all no-pdf-link +- onlinelibrary.wiley.com: getting blocked broadly? +- www.endocrine-abstracts.org: HTML content? +- www.igi-global.com: no-pdf-link diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt new file mode 100644 index 0000000..be2b30c --- /dev/null +++ b/sql/stats/2022-09-06_stats.txt @@ -0,0 +1,438 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 62 GB | 44 GB | 106 GB + "public"."ingest_request" | 51 GB | 50 GB | 101 GB + "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB + "public"."file_meta" | 39 GB | 39 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7469 MB | 66 MB | 7535 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 198175106 | 282695671015403 + (1 row) + + 198 million files, 282 TBytes. + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 197021437 + text/html | 830331 + application/octet-stream | 186669 + application/xml | 42170 + application/xhtml+xml | 38207 + text/plain | 16471 + application/jats+xml | 10385 + application/gzip | 6681 + | 6032 + application/postscript | 4916 + image/jpeg | 4522 + application/vnd.ms-powerpoint | 1672 + application/msword | 946 + application/x-bzip2 | 891 + image/png | 659 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382 + application/x-compress | 274 + video/mp4 | 218 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12800 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 137283420 | 172140506 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 157465613 + warc/revisit | 11337336 + text/html | 1137208 + application/octet-stream | 950380 + text/xml | 528965 + unk | 253294 + application/postscript | 81130 + application/save | 81069 + binary/octet-stream | 68942 + application/x-download | 42717 + application/download | 40628 + image/pdf | 39904 + text/plain | 36445 + application/force-download | 24148 + multipart/form-data | 10972 + application | 5409 + application/x-octetstream | 5192 + application/x-msdownload | 3854 + .pdf | 3518 + application/x-pdf | 3061 + application/octet | 1792 + pdf | 1757 + application/binary | 1399 + file | 1373 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1087 + application/unknown | 1005 + 0 | 773 + text/pdf | 729 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 129001717 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 120797098 + 500 | 8198783 + -4 | 5802 + 503 | 36 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 60469462 + 0.5.5-fatcat | 47472904 + | 12665498 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | doi | 43852308 + pdf | mag | 43701948 + pdf | doaj | 6534341 + html | doaj | 3987669 + pdf | arxiv | 2784589 + pdf | pmc | 2439181 + pdf | dblp | 631716 + html | doi | 126699 + xml | doaj | 23066 + pdf | cnki_covid19 | 2034 + pdf | spn | 1026 + pdf | wanfang_covid19 | 975 + html | spn | 65 + xml | spn | 2 + xml | doi | 1 + (17 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 24742500 + pdf | doi | fatcat-ingest | 15592121 + pdf | doaj | doaj | 6484737 + html | doaj | doaj | 3987468 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 799793 + pdf | dblp | dblp | 631716 + pdf | pmc | fatcat-ingest | 297980 + html | doi | fatcat-ingest | 121508 + pdf | pmc | fatcat-changelog | 112376 + pdf | doaj | fatcat-changelog | 47181 + xml | doaj | doaj | 23066 + html | doi | fatcat-changelog | 5129 + pdf | doaj | fatcat-ingest | 2423 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1814 + pdf | spn | savepapernow-web | 1026 + pdf | wanfang_covid19 | scrape-covid19 | 975 + html | doaj | fatcat-ingest | 201 + html | spn | savepapernow-web | 65 + html | doi | savepapernow-web | 62 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 4 + xml | spn | savepapernow-web | 2 + xml | doi | savepapernow-web | 1 + (30 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 167653 + pdf | doaj | 81517 + pdf | oai | 15282 + html | doaj | 1791 + pdf | unpaywall | 270 + pdf | doi | 22 + (6 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 16024068 | 0.313 + pdf | unpaywall | 43932525 | 36045446 | 0.820 + pdf | doi | 43852308 | 14956080 | 0.341 + pdf | mag | 43701948 | 32768484 | 0.750 + pdf | doaj | 6534341 | 4704066 | 0.720 + html | doaj | 3987669 | 778165 | 0.195 + pdf | arxiv | 2784589 | 2419941 | 0.869 + pdf | pmc | 2439181 | 1897671 | 0.778 + pdf | dblp | 631716 | 305142 | 0.483 + html | doi | 126699 | 75754 | 0.598 + xml | doaj | 23066 | 10381 | 0.450 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | spn | 1026 | 778 | 0.758 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + html | spn | 65 | 13 | 0.200 + xml | spn | 2 | 1 | 0.500 + xml | doi | 1 | 0 | 0.000 + (17 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 94887295 + pdf | no-pdf-link | 33960080 + pdf | no-capture | 20893916 + pdf | terminal-bad-status | 6973765 + pdf | redirect-loop | 5775175 + pdf | link-loop | 4095424 + pdf | skip-url-blocklist | 4037518 + pdf | blocked-cookie | 3508762 + html | wrong-scope | 1783694 + pdf | wrong-mimetype | 1379673 + html | success | 853762 + pdf | gateway-timeout | 635170 + html | no-capture | 381283 + pdf | wayback-content-error | 356694 + pdf | cdx-error | 347700 + pdf | null-body | 336166 + html | unknown-scope | 321874 + html | html-resource-no-capture | 294294 + pdf | forbidden | 291127 + pdf | not-found | 274343 + pdf | too-many-redirects | 264494 + component | wrong-mimetype | 196680 + component | spn2-cdx-lookup-failure | 173615 + component | spn2-backoff | 115840 + html | terminal-bad-status | 106264 + html | null-body | 100296 + pdf | wayback-error | 94748 + html | blocked-cookie | 88537 + component | no-capture | 75278 + pdf | empty-blob | 61157 + pdf | bad-redirect | 58680 + pdf | skip-wall | 57751 + pdf | spn2-error:too-many-redirects | 52873 + html | spn2-backoff | 50577 + pdf | remote-server-error | 41282 + pdf | invalid-host-resolution | 38864 + pdf | read-timeout | 37071 + pdf | spn2-cdx-lookup-failure | 34229 + html | wrong-mimetype | 33643 + pdf | spn2-backoff | 32437 + pdf | petabox-error | 31006 + html | wayback-content-error | 28034 + component | spn2-error | 27044 + pdf | spn2-error:unknown | 25810 + component | gateway-timeout | 25215 + pdf | body-too-large | 21721 + html | petabox-error | 18313 + html | empty-blob | 14393 + html | redirect-loop | 13404 + component | blocked-cookie | 12287 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 45052391 + pdf | | 26117481 + pdf | 301 | 4814786 + html | 200 | 2684821 + pdf | 403 | 1871088 + pdf | 404 | 1254259 + pdf | 302 | 898728 + pdf | 503 | 867548 + pdf | 401 | 851205 + pdf | 429 | 741869 + pdf | 400 | 624519 + component | | 456915 + html | | 442051 + pdf | 500 | 283700 + component | 200 | 197510 + pdf | 410 | 120647 + pdf | 303 | 107947 + html | 404 | 80114 + pdf | 420 | 26722 + pdf | 502 | 19500 + pdf | 409 | 15499 + html | 429 | 15208 + pdf | 509 | 15167 + pdf | 999 | 12186 + pdf | 202 | 11535 + html | 301 | 10213 + xml | | 10018 + pdf | 307 | 8657 + pdf | 402 | 8338 + pdf | 412 | 8064 + pdf | 308 | 6479 + html | 500 | 4746 + xml | 200 | 2668 + pdf | 520 | 2496 + html | 302 | 2289 + pdf | 521 | 2257 + html | 202 | 2177 + pdf | 206 | 1961 + html | 403 | 1775 + pdf | 504 | 1187 + pdf | 421 | 1148 + html | 303 | 1112 + pdf | 406 | 1109 + pdf | 204 | 772 + pdf | 432 | 745 + pdf | 405 | 633 + html | 400 | 632 + pdf | 426 | 515 + pdf | 508 | 503 + pdf | 505 | 469 + (50 rows) diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt new file mode 100644 index 0000000..0a6254a --- /dev/null +++ b/sql/stats/2022-11-23_table_sizes.txt @@ -0,0 +1,21 @@ +PostgreSQL 13.2 - wbgrp-svc506.us.archive.org +Size: 1.13T + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 63 GB | 45 GB | 108 GB + "public"."ingest_request" | 53 GB | 52 GB | 105 GB + "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB + "public"."file_meta" | 39 GB | 40 GB | 79 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7879 MB | 68 MB | 7947 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes diff --git a/sql/stats/README.md b/sql/stats/README.md index 52642f6..3161514 100644 --- a/sql/stats/README.md +++ b/sql/stats/README.md @@ -29,7 +29,7 @@ Counts and total file size: Top mimetypes: - SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; Missing full metadata: @@ -43,25 +43,21 @@ Total and unique-by-sha1 counts: mimetype counts: - SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; - -Processed or not: - - # TODO: + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; ## GROBID Counts: - SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + SELECT COUNT(*) AS total_files FROM grobid; Status? - SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; What version used? - SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; ## Petabox @@ -75,7 +71,7 @@ Requests by source: SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; - SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; Uncrawled requests by source: @@ -86,7 +82,7 @@ Uncrawled requests by source: ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type WHERE ingest_file_result.base_url IS NULL - GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; Results by source: @@ -101,20 +97,13 @@ Results by source: ON ingest_request.base_url = ingest_file_result.base_url AND ingest_request.ingest_type = ingest_file_result.ingest_type AND ingest_file_result.ingest_type IS NOT NULL - GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; Ingest result by status: - SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; - -## Fatcat Files + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; -Count of PDF files that GROBID processed and matched to a release (via -glutton), but no PDF in `fatcat_file`: +Failed ingest by terminal status code: - SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count - FROM grobid - LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex - WHERE fatcat_file.sha1hex IS NULL - AND grobid.fatcat_release IS NOT NULL; + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; |