diff options
Diffstat (limited to 'sql')
55 files changed, 6826 insertions, 59 deletions
diff --git a/sql/Makefile b/sql/Makefile new file mode 100644 index 0000000..860addb --- /dev/null +++ b/sql/Makefile @@ -0,0 +1,35 @@ + +SHELL=/bin/bash -euo pipefail +TODAY ?= $(shell date --iso --utc) +DATADIR ?= /srv/sandcrawler/tasks/$(TODAY) +DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) +DATABASE_URL ?= sandcrawler + +.PHONY: help +help: ## Print info about all commands + @echo "Commands:" + @echo + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf " \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: create_datadir +create_datadir: + mkdir -p $(DATADIR)/ + sudo chmod a+rw $(DATADIR)/ + +$(DATADIR)/.DB_DUMP: + sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip + mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump + touch $@ + +.PHONY: database-snapshot +database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot + @echo + +$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP + ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))" + ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump + touch $@ + +.PHONY: upload-database-snapshot +upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org + @echo diff --git a/sql/README.md b/sql/README.md index b171614..e488006 100644 --- a/sql/README.md +++ b/sql/README.md @@ -5,6 +5,21 @@ No primary storage of anything in this table. Everything should be rapidly re-creatable from dumps, kafka topics (compressed), CDX, petabox metadata, etc. This is a secondary view on all of that. +## Create Database and User + +Create system user with your username like: + + sudo su postgres + createuser -s bnewbold + +Create database using `diesel` tool (see fatcat rust docs for install notes): + + # DANGER: will delete/recreate entire database + diesel database reset + +In the future would probably be better to create a real role/password and +supply these via `DATABASE_URL` env variable. + ## Schema schema/database name is 'sandcrawler' @@ -124,3 +139,30 @@ Questions we might want to answer http get :3030/cdx?url=eq.https://coleccionables.mercadolibre.com.ar/arduino-pdf_Installments_NoInterest_BestSellers_YES http get :3030/file_meta?sha1hex=eq.120582c855a7cc3c70a8527c560d7f27e6027278 + +## Full SQL Database Dumps + +Run a dump in compressed, postgres custom format, not including `crossref` table (which is large and redundant): + + export DATESLUG="`date +%Y-%m-%d.%H%M%S`" + time sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > sandcrawler_full_dbdump_${DATESLUG}.pgdump + +As of 2021-12-03, this process runs for about 6 hours and the compressed +snapshot is 102 GBytes (compared with 940GB database disk consumption, +including crossref). + +Then, upload to petabox as a backup: + + ia upload sandcrawler_full_dbdump_YYYY-MM-DD -m mediatype:data -m collection:webgroup-internal-backups -m title:"Sandcrawler SQL Dump (YYYY-MM-DD)" sandcrawler_full_dbdump_${DATESLUG}.pgdump + + +## SQL Database Restore + +To restore a dump (which will delete local database content, if any): + + sudo su postgres + createuser --no-login web_anon + createuser -s sandcrawler + time pg_restore --jobs=4 --verbose --clean --if-exists --create --exit-on-error -d postgres sandcrawler_full_dbdump_2021-04-08.003952.pgdump + +Took about 2.5 hours. diff --git a/sql/backfill/backfill.md b/sql/backfill/backfill.md index f1a5f86..4a56065 100644 --- a/sql/backfill/backfill.md +++ b/sql/backfill/backfill.md @@ -76,6 +76,19 @@ In psql: COPY fatcat_file FROM '/sandcrawler-db/backfill/fatcat_file.2019-07-07.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); # => COPY 24727350 +In 2021-11-26: + + zcat file_export.json.gz \ + | pv -l \ + | jq -r 'select(.sha1 != null) | [.sha1, .ident, .release_ids[0], (.urls|length >= 1), .content_scope] | @tsv' \ + | sort -S 8G \ + | uniq -w 40 \ + | pigz \ + > fatcat_file.2021-11-26.tsv.gz + + COPY fatcat_file FROM '/srv/sandcrawler/tasks/fatcat_file.2021-11-26.tsv' WITH (FORMAT TEXT, DELIMITER E'\t', NULL ''); + # COPY 112086814 + ## `file_meta` zcat /fast/download/file_export.2019-07-07.json.gz | pv -l | jq -r 'select(.md5 != null) | [.sha1, .sha256, .md5, .size, .mimetype] | @tsv' | sort -S 8G | uniq -w 40 > /sandcrawler-db/backfill/file_meta.2019-07-07.tsv diff --git a/sql/backfill/backfill_cdx.py b/sql/backfill/backfill_cdx.py index 1c452ca..f929502 100755 --- a/sql/backfill/backfill_cdx.py +++ b/sql/backfill/backfill_cdx.py @@ -109,6 +109,7 @@ def stdin_to_pg(): info = parse_cdx_line(l) if not info: continue + # XXX: filter to, eg, PDF or octet/stream (derp) batch.append(info) counts['total'] += 1 if len(batch) >= 1000: diff --git a/sql/dump_file_meta.sql b/sql/dump_file_meta.sql new file mode 100644 index 0000000..a7d6c2b --- /dev/null +++ b/sql/dump_file_meta.sql @@ -0,0 +1,12 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT sha1hex, row_to_json(file_meta) + FROM file_meta + ORDER BY sha1hex ASC +) +TO '/srv/sandcrawler/tasks/file_meta_dump.tsv' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_regrobid_pdf.sql b/sql/dump_regrobid_pdf.sql new file mode 100644 index 0000000..b846834 --- /dev/null +++ b/sql/dump_regrobid_pdf.sql @@ -0,0 +1,15 @@ + +-- Run like: +-- psql sandcrawler < dump_regrobid_pdf.sql | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf.2019-11-12.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT cdx.sha1hex, row_to_json(cdx) FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) +) +TO STDOUT +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_regrobid_pdf_petabox.sql b/sql/dump_regrobid_pdf_petabox.sql new file mode 100644 index 0000000..e7c48f3 --- /dev/null +++ b/sql/dump_regrobid_pdf_petabox.sql @@ -0,0 +1,15 @@ + +-- Run like: +-- psql sandcrawler < dump_regrobid_pdf_petabox.sql +-- cat dump_regrobid_pdf_petabox.2020-02-03.json | sort -S 4G | uniq -w 40 | cut -f2 > dump_regrobid_pdf_petabox.2020-02-03.uniq.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT petabox.sha1hex, row_to_json(petabox) FROM petabox + WHERE EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.grobid_version IS NULL) +) +TO '/srv/sandcrawler/tasks/dump_regrobid_pdf_petabox.2020-02-03.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql new file mode 100644 index 0000000..698db7a --- /dev/null +++ b/sql/dump_reingest_bulk.sql @@ -0,0 +1,31 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '24 hour'::INTERVAL + AND ingest_request.created > NOW() - '181 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status like 'cdx-error' + OR ingest_file_result.status like 'petabox-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql new file mode 100644 index 0000000..7473420 --- /dev/null +++ b/sql/dump_reingest_old.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_file_result.hit = false + AND ingest_request.created < NOW() - '6 day'::INTERVAL + -- AND ingest_request.created > NOW() - '181 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container' + OR ingest_request.ingest_request_source = 'unpaywall' + OR ingest_request.ingest_request_source = 'arxiv' + OR ingest_request.ingest_request_source = 'pmc' + OR ingest_request.ingest_request_source = 'doaj' + OR ingest_request.ingest_request_source = 'dblp') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status like 'no-capture' + -- OR ingest_file_result.status like 'cdx-error' + -- OR ingest_file_result.status like 'petabox-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql new file mode 100644 index 0000000..dbeb199 --- /dev/null +++ b/sql/dump_reingest_quarterly.sql @@ -0,0 +1,47 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '91 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container' + OR ingest_request.ingest_request_source = 'unpaywall' + OR ingest_request.ingest_request_source = 'arxiv' + OR ingest_request.ingest_request_source = 'pmc' + OR ingest_request.ingest_request_source = 'doaj' + OR ingest_request.ingest_request_source = 'dblp') + AND ( + ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_quarterly_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql new file mode 100644 index 0000000..a83125c --- /dev/null +++ b/sql/dump_reingest_spn.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '6 hour'::INTERVAL + AND ingest_request.created > NOW() - '180 day'::INTERVAL + AND ingest_request.ingest_request_source = 'savepapernow-web' + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status = 'cdx-error' + -- OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + -- OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql new file mode 100644 index 0000000..b72a096 --- /dev/null +++ b/sql/dump_reingest_terminalstatus.sql @@ -0,0 +1,34 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_file_result.hit = false + AND ingest_request.created < NOW() - '72 hour'::INTERVAL + AND ingest_request.created > NOW() - '10 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + AND ingest_file_result.status = 'terminal-bad-status' + AND ( + ingest_file_result.terminal_status_code = 500 + OR ingest_file_result.terminal_status_code = 502 + OR ingest_file_result.terminal_status_code = 503 + OR ingest_file_result.terminal_status_code = 429 + OR ingest_file_result.terminal_status_code = 404 + ) + AND ( + ingest_request.base_url LIKE 'https://doi.org/10.3390/%' + OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%' + OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%' + ) +) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql new file mode 100644 index 0000000..a019938 --- /dev/null +++ b/sql/dump_reingest_weekly.sql @@ -0,0 +1,42 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON + ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'html' + OR ingest_request.ingest_type = 'xml' + OR ingest_request.ingest_type = 'component') + AND ingest_file_result.hit = false + AND ingest_request.created < NOW() - '8 hour'::INTERVAL + AND ingest_request.created > NOW() - '8 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest' + OR ingest_request.ingest_request_source = 'fatcat-ingest-container') + AND ( + ingest_file_result.status like 'spn2-%' + -- OR ingest_file_result.status = 'cdx-error' + -- OR ingest_file_result.status = 'wayback-error' + -- OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + -- OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:filesize-limit' + AND ingest_file_result.status != 'spn2-error:not-found' + AND ingest_file_result.status != 'spn2-error:blocked-url' + AND ingest_file_result.status != 'spn2-error:too-many-redirects' + AND ingest_file_result.status != 'spn2-error:network-authentication-required' + AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_weekly_current.rows.json'; + +-- bulk re-tries would be: +-- AND (ingest_request.ingest_request_source != 'fatcat-changelog' +-- AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_unextracted_pdf.sql b/sql/dump_unextracted_pdf.sql new file mode 100644 index 0000000..a7fb920 --- /dev/null +++ b/sql/dump_unextracted_pdf.sql @@ -0,0 +1,22 @@ + +-- Run like: +-- psql sandcrawler < dump_unextracted_pdf.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM grobid + LEFT JOIN cdx ON grobid.sha1hex = cdx.sha1hex + --LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + LEFT JOIN ingest_file_result ON grobid.sha1hex = ingest_file_result.terminal_sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE cdx.sha1hex IS NOT NULL + --AND fatcat_file.sha1hex IS NOT NULL + AND ingest_file_result.terminal_sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL +) +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf.ingest.2020-10-21.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_unextracted_pdf_petabox.sql b/sql/dump_unextracted_pdf_petabox.sql new file mode 100644 index 0000000..bb9f162 --- /dev/null +++ b/sql/dump_unextracted_pdf_petabox.sql @@ -0,0 +1,18 @@ + +-- Run like: +-- psql sandcrawler < dump_unextracted_pdf_petabox.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM grobid + LEFT JOIN petabox ON grobid.sha1hex = petabox.sha1hex + LEFT JOIN pdf_meta ON grobid.sha1hex = pdf_meta.sha1hex + WHERE petabox.sha1hex IS NOT NULL + AND pdf_meta.sha1hex IS NULL +) +TO '/srv/sandcrawler/tasks/dump_unextracted_pdf_petabox.2020-07-22.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_ungrobid_pdf.sql b/sql/dump_ungrobid_pdf.sql new file mode 100644 index 0000000..81caf18 --- /dev/null +++ b/sql/dump_ungrobid_pdf.sql @@ -0,0 +1,18 @@ + +-- Run like: +-- psql sandcrawler < dump_ungrobid_pdf.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (cdx.sha1hex) row_to_json(cdx) + FROM cdx + WHERE cdx.mimetype = 'application/pdf' + AND NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) + -- uncomment/comment this to control whether only fatcat files are included + --AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE cdx.sha1hex = fatcat_file.sha1hex) +) +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf.fatcat.2020-08-04.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_ungrobid_pdf_petabox.sql b/sql/dump_ungrobid_pdf_petabox.sql new file mode 100644 index 0000000..b7a1db2 --- /dev/null +++ b/sql/dump_ungrobid_pdf_petabox.sql @@ -0,0 +1,17 @@ + +-- Run like: +-- psql sandcrawler < dump_ungrobid_pdf_petabox.sql + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT DISTINCT ON (petabox.sha1hex) row_to_json(petabox) + FROM petabox + WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE petabox.sha1hex = grobid.sha1hex AND grobid.status IS NOT NULL) + -- uncomment/comment this to control whether only fatcat files are included + AND EXISTS (SELECT fatcat_file.sha1hex FROM fatcat_file WHERE petabox.sha1hex = fatcat_file.sha1hex) +) +TO '/srv/sandcrawler/tasks/dump_ungrobided_pdf_petabox.2020-08-04.json' +WITH NULL ''; + +ROLLBACK; diff --git a/sql/dump_unmatched_glutton_pdf.sql b/sql/dump_unmatched_glutton_pdf.sql new file mode 100644 index 0000000..333ff7b --- /dev/null +++ b/sql/dump_unmatched_glutton_pdf.sql @@ -0,0 +1,19 @@ + +-- Run like: +-- psql sandcrawler < THING.sql > THING.2019-09-23.json + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( + SELECT row_to_json(grobid) + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL + LIMIT 1000 +) +TO '/srv/sandcrawler/tasks/dump_unmatched_glutton_pdf.2020-06-30.json'; +--TO STDOUT +--WITH NULL ''; + +ROLLBACK; diff --git a/sql/example.env b/sql/example.env new file mode 100644 index 0000000..3a13689 --- /dev/null +++ b/sql/example.env @@ -0,0 +1 @@ +DATABASE_URL="postgres://fatcat:tactaf@localhost/sandcrawler" diff --git a/sql/ingest_again.md b/sql/ingest_again.md new file mode 100644 index 0000000..b749557 --- /dev/null +++ b/sql/ingest_again.md @@ -0,0 +1,158 @@ + +## re-ingest some broken + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'spn2-%' + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' + ) TO '/srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'cdx-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source != 'fatcat-changelog' + AND ingest_request.ingest_request_source != 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'wayback-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + ) TO '/srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'gateway-timeout' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json'; + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.hit = false + AND ingest_file_result.status like 'petabox-error' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL + AND (ingest_request.ingest_request_source = 'fatcat-changelog' + OR ingest_request.ingest_request_source = 'fatcat-ingest') + ) TO '/srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json'; + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json + +Push to kafka (shuffled): + + cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + + cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0 + + cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Push to kafka (not shuffled): + + cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## just recent fatcat-ingest + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_file_result.ingest_type = 'pdf' + AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL + -- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL + AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL + AND ingest_file_result.hit = false + AND (ingest_file_result.status like 'spn2-%' + OR ingest_file_result.status like 'cdx-error' + OR ingest_file_result.status like 'gateway-timeout' + OR ingest_file_result.status like 'wayback-error' + ) + AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' + AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + ) TO '/srv/sandcrawler/tasks/reingest_fatcat_current.rows.json'; + + # note: shuf + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json + + cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## specific domains + +protocols.io: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url LIKE '%10.17504/protocols.io%' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + +biorxiv/medrxiv: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.ingest_type = 'pdf' + AND ingest_request.base_url LIKE '%10.1101/20%' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt new file mode 100644 index 0000000..b684400 --- /dev/null +++ b/sql/ingest_stats/2020-11-16_weekly_ingest_doi_prefix.txt @@ -0,0 +1,326 @@ + doi_prefix | status | count +------------+-------------------------------+-------- + 10.1001 | | 230 + 10.1002 | | 3914 + 10.1002 | terminal-bad-status | 1540 + 10.1002 | forbidden | 1072 + 10.1002 | redirect-loop | 995 + 10.1002 | no-pdf-link | 210 + 10.1016 | | 7976 + 10.1016 | no-pdf-link | 4648 + 10.1016 | terminal-bad-status | 1778 + 10.1016 | forbidden | 622 + 10.1016 | spn2-error:too-many-redirects | 344 + 10.1016 | redirect-loop | 225 + 10.1017 | | 2040 + 10.1017 | no-pdf-link | 720 + 10.1017 | success | 441 + 10.1017 | link-loop | 371 + 10.1017 | bad-redirect | 227 + 10.1021 | | 1722 + 10.1021 | blocked-cookie | 1552 + 10.1029 | | 248 + 10.1039 | | 1160 + 10.1039 | redirect-loop | 486 + 10.1039 | spn2-error:too-many-redirects | 395 + 10.1039 | spn2-wayback-error | 213 + 10.1051 | | 695 + 10.1051 | success | 557 + 10.1055 | | 541 + 10.1055 | not-found | 295 + 10.1055 | redirect-loop | 213 + 10.1057 | | 2835 + 10.1057 | redirect-loop | 2617 + 10.1061 | | 550 + 10.1061 | spn2-error:too-many-redirects | 425 + 10.1063 | | 600 + 10.1063 | spn2-error:too-many-redirects | 328 + 10.1080 | | 3801 + 10.1080 | blocked-cookie | 2431 + 10.1080 | terminal-bad-status | 711 + 10.1080 | forbidden | 341 + 10.1081 | | 299 + 10.1081 | link-loop | 222 + 10.1089 | | 236 + 10.1089 | blocked-cookie | 228 + 10.1093 | | 12805 + 10.1093 | link-loop | 8627 + 10.1093 | redirect-loop | 1659 + 10.1093 | no-pdf-link | 1475 + 10.1093 | bad-redirect | 428 + 10.1093 | success | 391 + 10.1097 | | 1497 + 10.1097 | no-pdf-link | 503 + 10.1097 | link-loop | 346 + 10.1097 | spn2-error:too-many-redirects | 259 + 10.1097 | terminal-bad-status | 202 + 10.1101 | | 1859 + 10.1101 | redirect-loop | 993 + 10.1101 | forbidden | 703 + 10.1103 | | 597 + 10.1103 | not-found | 534 + 10.1108 | | 1055 + 10.1108 | no-pdf-link | 945 + 10.1109 | | 7067 + 10.1109 | spn2-error:too-many-redirects | 6299 + 10.1109 | success | 667 + 10.1111 | | 2099 + 10.1111 | redirect-loop | 1331 + 10.1111 | terminal-bad-status | 313 + 10.1111 | forbidden | 226 + 10.1115 | | 1278 + 10.1115 | bad-redirect | 707 + 10.1117 | | 561 + 10.1117 | spn2-error:too-many-redirects | 501 + 10.1126 | | 214 + 10.1136 | | 1989 + 10.1136 | success | 1463 + 10.1136 | link-loop | 294 + 10.1142 | | 300 + 10.1142 | blocked-cookie | 237 + 10.1145 | | 440 + 10.1145 | blocked-cookie | 354 + 10.1155 | | 480 + 10.1155 | success | 474 + 10.11588 | | 506 + 10.11588 | no-pdf-link | 264 + 10.11588 | success | 236 + 10.1159 | | 226 + 10.11606 | | 304 + 10.1161 | | 1142 + 10.1161 | blocked-cookie | 1011 + 10.1163 | | 2261 + 10.1163 | link-loop | 1767 + 10.1163 | success | 348 + 10.11648 | | 405 + 10.11648 | success | 404 + 10.1182 | | 2125 + 10.1182 | no-pdf-link | 2024 + 10.1183 | | 987 + 10.1183 | redirect-loop | 838 + 10.1186 | | 1481 + 10.1186 | success | 1412 + 10.1201 | | 7649 + 10.1201 | link-loop | 5383 + 10.1201 | forbidden | 1504 + 10.1201 | no-pdf-link | 312 + 10.1299 | | 264 + 10.1299 | no-pdf-link | 209 + 10.13134 | | 201 + 10.1353 | | 549 + 10.1353 | terminal-bad-status | 443 + 10.1371 | | 552 + 10.1371 | success | 542 + 10.14201 | | 656 + 10.14201 | success | 366 + 10.14361 | | 647 + 10.14361 | link-loop | 585 + 10.14746 | | 260 + 10.14746 | success | 232 + 10.1504 | | 527 + 10.1504 | no-pdf-link | 501 + 10.15122 | | 246 + 10.15122 | success | 243 + 10.1515 | | 16240 + 10.1515 | link-loop | 12589 + 10.1515 | success | 1941 + 10.1515 | no-pdf-link | 1008 + 10.1515 | not-found | 283 + 10.15405 | | 229 + 10.15405 | success | 218 + 10.1553 | | 418 + 10.1553 | no-pdf-link | 396 + 10.1590 | | 655 + 10.1590 | success | 623 + 10.17104 | | 1202 + 10.17104 | no-pdf-link | 953 + 10.17104 | bad-redirect | 249 + 10.17605 | | 368 + 10.17605 | not-found | 337 + 10.17615 | | 9401 + 10.17615 | redirect-loop | 5720 + 10.17615 | spn2-wayback-error | 3099 + 10.17615 | spn2-cdx-lookup-failure | 201 + 10.17863 | | 438 + 10.18148 | | 465 + 10.18148 | success | 462 + 10.18720 | | 210 + 10.18821 | | 476 + 10.18821 | redirect-loop | 366 + 10.20345 | | 222 + 10.20345 | terminal-bad-status | 215 + 10.20546 | | 244 + 10.20546 | no-pdf-link | 241 + 10.21037 | | 232 + 10.2118 | | 903 + 10.2118 | redirect-loop | 853 + 10.21203 | | 1824 + 10.21203 | success | 1545 + 10.2139 | | 1493 + 10.2139 | link-loop | 1145 + 10.2147 | | 318 + 10.2147 | success | 267 + 10.2172 | | 282 + 10.2174 | | 363 + 10.2174 | no-pdf-link | 320 + 10.2196 | | 265 + 10.2208 | | 299 + 10.22215 | | 218 + 10.22215 | success | 217 + 10.22323 | | 289 + 10.22323 | success | 262 + 10.22533 | | 395 + 10.22533 | success | 393 + 10.22541 | | 291 + 10.22541 | success | 275 + 10.23919 | | 426 + 10.23919 | spn2-error:too-many-redirects | 403 + 10.24034 | | 319 + 10.24034 | spn2-error | 203 + 10.24355 | | 15360 + 10.24355 | no-pdf-link | 15228 + 10.24411 | | 1506 + 10.24411 | forbidden | 823 + 10.24411 | redirect-loop | 647 + 10.25335 | | 550 + 10.25335 | no-pdf-link | 550 + 10.25365 | | 429 + 10.25365 | success | 424 + 10.25384 | | 338 + 10.25384 | success | 249 + 10.25646 | | 239 + 10.26197 | no-pdf-link | 303 + 10.26197 | | 303 + 10.26226 | | 272 + 10.26278 | | 1291 + 10.26278 | redirect-loop | 756 + 10.26278 | spn2-error:too-many-redirects | 509 + 10.29327 | | 232 + 10.2991 | | 307 + 10.2991 | spn2-wayback-error | 227 + 10.30965 | | 722 + 10.30965 | link-loop | 709 + 10.3109 | | 801 + 10.3109 | link-loop | 572 + 10.3109 | forbidden | 228 + 10.31219 | | 951 + 10.31219 | redirect-loop | 518 + 10.31219 | spn2-wayback-error | 356 + 10.31274 | | 296 + 10.31743 | | 403 + 10.31743 | success | 294 + 10.31857 | | 209 + 10.3233 | | 471 + 10.33448 | | 213 + 10.33448 | success | 212 + 10.3389 | | 1459 + 10.3389 | success | 1417 + 10.3390 | | 4511 + 10.3390 | success | 3577 + 10.3390 | terminal-bad-status | 485 + 10.3390 | forbidden | 379 + 10.3406 | | 243 + 10.3406 | terminal-bad-status | 213 + 10.34944 | | 527 + 10.34944 | success | 459 + 10.35016 | | 688 + 10.35016 | no-pdf-link | 687 + 10.36347 | success | 213 + 10.36347 | | 213 + 10.37747 | | 213 + 10.37747 | no-pdf-link | 213 + 10.37904 | | 227 + 10.37904 | no-pdf-link | 226 + 10.3917 | | 347 + 10.3917 | redirect-loop | 208 + 10.3923 | | 356 + 10.3923 | redirect-loop | 254 + 10.3929 | | 317 + 10.3929 | terminal-bad-status | 310 + 10.3931 | | 279 + 10.3931 | no-pdf-link | 279 + 10.4000 | | 7828 + 10.4000 | success | 3485 + 10.4000 | spn2-wayback-error | 2142 + 10.4000 | redirect-loop | 2106 + 10.4018 | | 249 + 10.4018 | not-found | 240 + 10.4103 | | 726 + 10.4103 | remote-server-error | 343 + 10.4103 | redirect-loop | 324 + 10.4159 | | 286 + 10.4159 | link-loop | 238 + 10.4324 | | 19398 + 10.4324 | link-loop | 12471 + 10.4324 | forbidden | 3632 + 10.4324 | not-found | 2283 + 10.4324 | terminal-bad-status | 645 + 10.4324 | success | 208 + 10.47295 | | 456 + 10.47295 | success | 449 + 10.47513 | | 218 + 10.47513 | no-pdf-link | 203 + 10.48084 | success | 538 + 10.48084 | | 538 + 10.5040 | | 375 + 10.5040 | no-pdf-link | 365 + 10.5167 | | 290 + 10.5167 | redirect-loop | 278 + 10.5169 | | 360 + 10.5169 | no-pdf-link | 355 + 10.5194 | | 917 + 10.5194 | success | 887 + 10.5216 | | 213 + 10.5220 | no-pdf-link | 397 + 10.5220 | | 397 + 10.5281 | | 22551 + 10.5281 | terminal-bad-status | 12158 + 10.5281 | success | 4901 + 10.5281 | no-pdf-link | 4754 + 10.5281 | spn2-error:unknown | 360 + 10.5282 | | 228 + 10.5451 | | 2068 + 10.5451 | success | 1071 + 10.5451 | terminal-bad-status | 817 + 10.5753 | | 268 + 10.5753 | success | 264 + 10.5771 | | 941 + 10.5771 | no-pdf-link | 397 + 10.5771 | bad-redirect | 269 + 10.5771 | link-loop | 238 + 10.6068 | | 441 + 10.6068 | no-pdf-link | 384 + 10.6084 | | 917 + 10.6084 | no-pdf-link | 520 + 10.6084 | success | 368 + 10.7287 | | 234 + 10.7287 | no-pdf-link | 212 + 10.7312 | | 382 + 10.7312 | link-loop | 291 + 10.7554 | | 205 + 10.7891 | | 380 + 10.7891 | no-pdf-link | 376 + 10.7916 | | 331 + 10.7916 | no-pdf-link | 201 + 10.7939 | | 535 + 10.7939 | no-pdf-link | 527 + | | 272831 + | success | 62298 + | no-pdf-link | 60737 + | link-loop | 48558 + | redirect-loop | 26842 + | terminal-bad-status | 22685 + | spn2-error:too-many-redirects | 11174 + | forbidden | 10900 + | spn2-wayback-error | 7796 + | blocked-cookie | 6961 + | not-found | 5468 + | bad-redirect | 2666 + | spn2-error | 2398 + | spn2-cdx-lookup-failure | 1374 + | petabox-error | 678 + | remote-server-error | 461 + | wrong-mimetype | 443 + | spn2-error:proxy-error | 420 + | spn2-error:unknown | 360 +(323 rows) diff --git a/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt new file mode 100644 index 0000000..28dd0d0 --- /dev/null +++ b/sql/ingest_stats/2020-11-16_weekly_ingest_terminal_domain.txt @@ -0,0 +1,307 @@ + domain | status | count +-------------------------------------------------------------------+-------------------------------+-------- + 202.148.31.178 | | 298 + academic.oup.com | | 1624 + academic.oup.com | no-pdf-link | 673 + academic.oup.com | bad-redirect | 444 + academic.oup.com | link-loop | 358 + aip.scitation.org | | 257 + apps.crossref.org | | 1414 + apps.crossref.org | no-pdf-link | 1410 + article.sciencepublishinggroup.com | | 404 + article.sciencepublishinggroup.com | success | 404 + arxiv.org | | 24340 + arxiv.org | success | 22381 + arxiv.org | terminal-bad-status | 1260 + arxiv.org | no-pdf-link | 412 + arxiv.org | no-capture | 262 + ashpublications.org | | 2049 + ashpublications.org | no-pdf-link | 2024 + asmedigitalcollection.asme.org | | 1245 + asmedigitalcollection.asme.org | bad-redirect | 707 + assets.researchsquare.com | | 1549 + assets.researchsquare.com | success | 1546 + bioone.org | | 201 + biorxiv.org | redirect-loop | 702 + biorxiv.org | | 702 + blogs.ethz.ch | | 687 + blogs.ethz.ch | no-pdf-link | 686 + books.openedition.org | | 446 + books.openedition.org | redirect-loop | 382 + brill.com | | 2203 + brill.com | link-loop | 1779 + brill.com | success | 359 + catalog.paradisec.org.au | | 770 + catalog.paradisec.org.au | redirect-loop | 756 + cdr.lib.unc.edu | | 9432 + cdr.lib.unc.edu | redirect-loop | 5720 + cdr.lib.unc.edu | spn2-wayback-error | 3187 + cdr.lib.unc.edu | spn2-cdx-lookup-failure | 201 + classiques-garnier.com | | 246 + classiques-garnier.com | success | 243 + content.iospress.com | | 242 + content.taylorfrancis.com | | 309 + content.taylorfrancis.com | terminal-bad-status | 309 + curve.carleton.ca | success | 201 + curve.carleton.ca | | 201 + cyberdoi.ru | redirect-loop | 647 + cyberdoi.ru | | 647 + czasopisma.kul.pl | | 402 + czasopisma.kul.pl | success | 294 + d.lib.msu.edu | | 550 + d.lib.msu.edu | no-pdf-link | 550 + d197for5662m48.cloudfront.net | success | 276 + d197for5662m48.cloudfront.net | | 276 + dergipark.org.tr | | 674 + dergipark.org.tr | no-pdf-link | 255 + dergipark.org.tr | success | 248 + digi.ub.uni-heidelberg.de | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 261 + dl.acm.org | | 441 + dl.acm.org | blocked-cookie | 361 + dlc.library.columbia.edu | | 201 + dlc.library.columbia.edu | no-pdf-link | 201 + doi.ala.org.au | | 308 + doi.ala.org.au | no-pdf-link | 308 + doi.org | | 474 + doi.org | terminal-bad-status | 344 + downloads.hindawi.com | | 479 + downloads.hindawi.com | success | 478 + edoc.rki.de | | 238 + edoc.unibas.ch | | 2018 + edoc.unibas.ch | success | 1067 + edoc.unibas.ch | terminal-bad-status | 817 + elib.spbstu.ru | | 205 + elifesciences.org | | 204 + era.library.ualberta.ca | | 531 + era.library.ualberta.ca | no-pdf-link | 527 + erj.ersjournals.com | | 951 + erj.ersjournals.com | redirect-loop | 829 + europepmc.org | | 289 + europepmc.org | success | 283 + figshare.com | | 233 + figshare.com | no-pdf-link | 208 + fjfsdata01prod.blob.core.windows.net | | 1430 + fjfsdata01prod.blob.core.windows.net | success | 1418 + hw.oeaw.ac.at | | 283 + hw.oeaw.ac.at | no-pdf-link | 283 + idb.ub.uni-tuebingen.de | | 216 + idb.ub.uni-tuebingen.de | terminal-bad-status | 215 + ieeexplore.ieee.org | | 7561 + ieeexplore.ieee.org | spn2-error:too-many-redirects | 6732 + ieeexplore.ieee.org | success | 683 + ijgc.bmj.com | | 411 + ijgc.bmj.com | success | 399 + jamanetwork.com | | 229 + jitc.bmj.com | | 849 + jitc.bmj.com | success | 773 + journals.aps.org | | 539 + journals.aps.org | not-found | 534 + journals.lww.com | | 1124 + journals.lww.com | no-pdf-link | 547 + journals.lww.com | link-loop | 399 + journals.openedition.org | | 7366 + journals.openedition.org | success | 3484 + journals.openedition.org | spn2-wayback-error | 2120 + journals.openedition.org | redirect-loop | 1720 + journals.plos.org | | 552 + journals.plos.org | success | 542 + kiss.kstudy.com | | 306 + kiss.kstudy.com | no-pdf-link | 292 + lib.dr.iastate.edu | | 297 + link.springer.com | | 2830 + link.springer.com | redirect-loop | 2625 + linkinghub.elsevier.com | | 970 + linkinghub.elsevier.com | forbidden | 415 + linkinghub.elsevier.com | spn2-error:too-many-redirects | 357 + medrxiv.org | | 287 + medrxiv.org | redirect-loop | 287 + muse.jhu.edu | | 470 + muse.jhu.edu | terminal-bad-status | 443 + ojs.ub.uni-konstanz.de | | 463 + ojs.ub.uni-konstanz.de | success | 462 + onlinelibrary.wiley.com | | 2064 + onlinelibrary.wiley.com | terminal-bad-status | 1973 + osf.io | | 1394 + osf.io | redirect-loop | 589 + osf.io | spn2-wayback-error | 425 + osf.io | not-found | 342 + othes.univie.ac.at | | 424 + othes.univie.ac.at | success | 424 + oxford.universitypressscholarship.com | | 8999 + oxford.universitypressscholarship.com | link-loop | 8282 + oxford.universitypressscholarship.com | no-pdf-link | 695 + oxfordhandbooks.com | redirect-loop | 460 + oxfordhandbooks.com | | 460 + papers.ssrn.com | | 1313 + papers.ssrn.com | link-loop | 1145 + peerj.com | | 313 + peerj.com | no-pdf-link | 212 + periodicos.urca.br | | 446 + periodicos.urca.br | success | 439 + pos.sissa.it | | 277 + pos.sissa.it | success | 262 + preprints.jmir.org | | 242 + pressto.amu.edu.pl | | 260 + pressto.amu.edu.pl | success | 232 + publikationsserver.tu-braunschweig.de | | 15358 + publikationsserver.tu-braunschweig.de | no-pdf-link | 15228 + publons.com | | 2810 + publons.com | redirect-loop | 2359 + publons.com | no-pdf-link | 444 + pubs.acs.org | | 1647 + pubs.acs.org | blocked-cookie | 1553 + pubs.rsc.org | | 765 + pubs.rsc.org | redirect-loop | 486 + pubs.rsc.org | spn2-wayback-error | 214 + res.mdpi.com | | 3620 + res.mdpi.com | success | 3591 + revistas.usal.es | | 580 + revistas.usal.es | success | 298 + revues.imist.ma | | 229 + rsdjournal.org | | 213 + rsdjournal.org | success | 212 + s3-eu-west-1.amazonaws.com | | 764 + s3-eu-west-1.amazonaws.com | success | 763 + s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | | 324 + s3-euw1-ap-pe-ws4-capi2-distribution-p.s3-eu-west-1.amazonaws.com | success | 324 + saspublishers.com | | 213 + saspublishers.com | success | 213 + scholarshare.temple.edu | | 524 + scholarshare.temple.edu | success | 464 + sol.sbc.org.br | | 268 + sol.sbc.org.br | success | 264 + statisticaldatasets.data-planet.com | | 442 + statisticaldatasets.data-planet.com | no-pdf-link | 390 + watermark.silverchair.com | | 521 + watermark.silverchair.com | success | 514 + www.ahajournals.org | | 1061 + www.ahajournals.org | blocked-cookie | 1011 + www.atlantis-press.com | | 308 + www.atlantis-press.com | spn2-wayback-error | 228 + www.beck-elibrary.de | | 1202 + www.beck-elibrary.de | no-pdf-link | 953 + www.beck-elibrary.de | bad-redirect | 249 + www.cairn.info | | 255 + www.cairn.info | redirect-loop | 208 + www.cambridge.org | | 2061 + www.cambridge.org | no-pdf-link | 727 + www.cambridge.org | success | 485 + www.cambridge.org | link-loop | 388 + www.cambridge.org | bad-redirect | 252 + www.confer.cz | | 227 + www.confer.cz | no-pdf-link | 226 + www.dbpia.co.kr | | 773 + www.dbpia.co.kr | no-pdf-link | 679 + www.degruyter.com | | 17046 + www.degruyter.com | link-loop | 14202 + www.degruyter.com | success | 2201 + www.degruyter.com | not-found | 235 + www.dovepress.com | | 316 + www.dovepress.com | success | 267 + www.e-manuscripta.ch | | 384 + www.e-manuscripta.ch | no-pdf-link | 383 + www.e-periodica.ch | | 358 + www.e-periodica.ch | no-pdf-link | 355 + www.e-rara.ch | no-pdf-link | 279 + www.e-rara.ch | | 279 + www.e3s-conferences.org | | 426 + www.e3s-conferences.org | success | 419 + www.elibrary.ru | | 303 + www.elibrary.ru | no-pdf-link | 301 + www.emerald.com | | 943 + www.emerald.com | no-pdf-link | 933 + www.etasr.com | | 466 + www.etasr.com | success | 466 + www.eurekaselect.com | | 345 + www.eurekaselect.com | no-pdf-link | 321 + www.europeanproceedings.com | | 218 + www.europeanproceedings.com | success | 218 + www.finersistemas.com | success | 397 + www.finersistemas.com | | 397 + www.humankineticslibrary.com | no-pdf-link | 321 + www.humankineticslibrary.com | | 321 + www.ijcmas.com | | 251 + www.ijcmas.com | no-pdf-link | 248 + www.inderscience.com | | 524 + www.inderscience.com | no-pdf-link | 501 + www.ingentaconnect.com | | 366 + www.ingentaconnect.com | no-pdf-link | 349 + www.jstage.jst.go.jp | | 1591 + www.jstage.jst.go.jp | success | 862 + www.jstage.jst.go.jp | no-pdf-link | 567 + www.jstor.org | | 351 + www.karger.com | | 224 + www.liebertpub.com | | 236 + www.liebertpub.com | blocked-cookie | 228 + www.mdpi.com | | 694 + www.mdpi.com | terminal-bad-status | 480 + www.medlit.ru | | 458 + www.medlit.ru | redirect-loop | 366 + www.morressier.com | | 285 + www.morressier.com | no-pdf-link | 253 + www.njca.info | | 223 + www.njca.info | remote-server-error | 222 + www.nomos-elibrary.de | | 913 + www.nomos-elibrary.de | no-pdf-link | 379 + www.nomos-elibrary.de | bad-redirect | 265 + www.nomos-elibrary.de | link-loop | 236 + www.onepetro.org | | 895 + www.onepetro.org | redirect-loop | 853 + www.osti.gov | | 212 + www.persee.fr | | 232 + www.persee.fr | terminal-bad-status | 213 + www.repository.cam.ac.uk | | 439 + www.research-collection.ethz.ch | | 312 + www.research-collection.ethz.ch | terminal-bad-status | 310 + www.revistas.ufg.br | | 212 + www.schoeningh.de | | 371 + www.schoeningh.de | link-loop | 366 + www.scialert.net | | 276 + www.scialert.net | redirect-loop | 254 + www.scielo.br | | 644 + www.scielo.br | success | 624 + www.sciencedirect.com | | 6523 + www.sciencedirect.com | no-pdf-link | 4668 + www.sciencedirect.com | terminal-bad-status | 1737 + www.scitepress.org | no-pdf-link | 397 + www.scitepress.org | | 397 + www.tandfonline.com | | 3448 + www.tandfonline.com | blocked-cookie | 2446 + www.tandfonline.com | terminal-bad-status | 714 + www.taylorfrancis.com | | 21292 + www.taylorfrancis.com | link-loop | 18648 + www.taylorfrancis.com | forbidden | 2022 + www.taylorfrancis.com | terminal-bad-status | 518 + www.thieme-connect.de | | 513 + www.thieme-connect.de | not-found | 292 + www.thieme-connect.de | redirect-loop | 213 + www.whateveryoneneedstoknow.com | | 1174 + www.whateveryoneneedstoknow.com | redirect-loop | 1163 + www.worldscientific.com | | 293 + www.worldscientific.com | blocked-cookie | 240 + www.zora.uzh.ch | | 290 + www.zora.uzh.ch | redirect-loop | 278 + zenodo.org | | 22202 + zenodo.org | terminal-bad-status | 12158 + zenodo.org | success | 4923 + zenodo.org | no-pdf-link | 4788 + | | 280719 + | success | 85143 + | no-pdf-link | 61335 + | link-loop | 48566 + | redirect-loop | 26845 + | terminal-bad-status | 23955 + | spn2-wayback-error | 7920 + | spn2-error:too-many-redirects | 7175 + | blocked-cookie | 6980 + | forbidden | 2912 + | bad-redirect | 2666 + | spn2-error | 1943 + | not-found | 1762 + | spn2-cdx-lookup-failure | 1376 + | wrong-mimetype | 467 + | remote-server-error | 388 + | spn2-error:proxy-error | 295 + | no-capture | 262 +(304 rows) diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000..a9f5260 --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000..d68895b --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql new file mode 100644 index 0000000..a085480 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/down.sql @@ -0,0 +1,8 @@ + +DROP TABLE IF NOT EXISTS cdx; +DROP TABLE IF NOT EXISTS file_meta; +DROP TABLE IF NOT EXISTS fatcat_file; +DROP TABLE IF NOT EXISTS petabox; +DROP TABLE IF NOT EXISTS grobid; +DROP TABLE IF NOT EXISTS ingest_request; +DROP TABLE IF NOT EXISTS shadow; diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql new file mode 100644 index 0000000..33dba66 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -0,0 +1,245 @@ + +-- rows *may* be revisit records; indicated by mimetype == "warc/revisit" +-- records are implied to be 200 status (or 226 for ftp); either direct hits or +-- revisits +-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many +-- datetimes. import scripts should take efforts to reduce this sort of +-- duplication though. one row per *domain*/sha1hex pair is a good guideline. +-- all ingest result url/dt pairs should be included though. +-- any mimetype is allowed, but presumption should be that actual body is full +-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each +-- only a part of work). URLs that are parts of a fileset are allowed. +CREATE TABLE IF NOT EXISTS cdx ( + url TEXT NOT NULL CHECK (octet_length(url) >= 1), + datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), + -- sha1hex/cdx_sha1hex difference is intended to help with difference between + -- CDX hash (which is transport encoded body) vs. actual body. Probably need to + -- include both for all records? + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), + mimetype TEXT CHECK (octet_length(mimetype) >= 1), + -- TODO: enforce that only paths with '/' (item+file) should be included? + warc_path TEXT CHECK (octet_length(warc_path) >= 1), + warc_csize BIGINT, + warc_offset BIGINT, + row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + PRIMARY KEY(url, datetime) +); +CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); +-- TODO: remove this index? not currently used +CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); + +-- TODO: require all fields. if mimetype unknown, should be octet-stream +CREATE TABLE IF NOT EXISTS file_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + sha256hex TEXT CHECK (octet_length(sha256hex) = 64), + md5hex TEXT CHECK (octet_length(md5hex) = 32), + size_bytes BIGINT, + mimetype TEXT CHECK (octet_length(mimetype) >= 1) +); +CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); + +CREATE TABLE IF NOT EXISTS fatcat_file ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + file_ident TEXT CHECK (octet_length(file_ident) = 26), + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26), + any_url BOOLEAN, + content_scope TEXT CHECK (octet_length(content_scope) >= 1) +); + +CREATE TABLE IF NOT EXISTS petabox ( + item TEXT NOT NULL CHECK (octet_length(item) >= 1), + path TEXT NOT NULL CHECK (octet_length(path) >= 1), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + PRIMARY KEY(item, path) +); +CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); + +CREATE TABLE IF NOT EXISTS grobid ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), + -- extracted basic biblio metadata: + -- title + -- authors[] + -- full/display + -- given_name + -- surname + -- affiliation + -- year + -- journal_issn + -- journal_name + -- refs_count + metadata JSONB +); +-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); + +CREATE TABLE IF NOT EXISTS pdftrio ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1), + models_date DATE, + ensemble_score REAL, + bert_score REAL, + linear_score REAL, + image_score REAL +); + +CREATE TABLE IF NOT EXISTS pdf_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + has_page0_thumbnail BOOLEAN NOT NULL, + page_count INT CHECK (page_count >= 0), + word_count INT CHECK (word_count >= 0), + page0_height REAL CHECK (page0_height >= 0), + page0_width REAL CHECK (page0_width >= 0), + permanent_id TEXT CHECK (octet_length(permanent_id) >= 1), + pdf_created TIMESTAMP WITH TIME ZONE, + pdf_version TEXT CHECK (octet_length(pdf_version) >= 1), + metadata JSONB + -- maybe some analysis of available fields? + -- metadata JSON fields: + -- title + -- subject + -- author + -- creator + -- producer + -- CrossMarkDomains + -- doi + -- form + -- encrypted +); + +CREATE TABLE IF NOT EXISTS html_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + scope TEXT CHECK (octet_length(status) >= 1), + has_teixml BOOLEAN NOT NULL, + has_thumbnail BOOLEAN NOT NULL, + word_count INT CHECK (word_count >= 0), + biblio JSONB, + resources JSONB + -- biblio JSON fields are similar to fatcat release schema + -- resources JSON object is a list of objects with keys like webcapture CDX schema +); + +CREATE TABLE IF NOT EXISTS ingest_request ( + link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), + link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + + ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1), + created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + release_stage TEXT CHECK (octet_length(release_stage) >= 1), + request JSONB, + -- request isn't required, but can stash extra fields there for import, eg: + -- ext_ids (source/source_id sometimes enough) + -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN) + -- edit_extra + -- ingest type can be: pdf, xml, html + + PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) +); +CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); +CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created); + +CREATE TABLE IF NOT EXISTS ingest_file_result ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + terminal_url TEXT CHECK (octet_length(terminal_url) >= 1), + terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14), + terminal_status_code INT, + terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40), + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); +CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); + +CREATE TABLE IF NOT EXISTS ingest_fileset_platform ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + + platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1), + platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1), + platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1), + ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1), + total_size BIGINT, + file_count BIGINT, + archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1), + + archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1), + web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1), + web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14), + + manifest JSONB, + -- list, similar to fatcat fileset manifest, plus extra: + -- status (str) + -- path (str) + -- size (int) + -- md5 (str) + -- sha1 (str) + -- sha256 (str) + -- mimetype (str) + -- extra (dict) + -- platform_url (str) + -- terminal_url (str) + -- terminal_dt (str) + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id); + +CREATE TABLE IF NOT EXISTS shadow ( + shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), + shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + doi TEXT CHECK (octet_length(doi) >= 1), + pmid TEXT CHECK (octet_length(pmid) >= 1), + isbn13 TEXT CHECK (octet_length(isbn13) >= 1), + PRIMARY KEY(shadow_corpus, shadow_id) +); +CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); + +CREATE TABLE IF NOT EXISTS crossref ( + doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)), + indexed TIMESTAMP WITH TIME ZONE NOT NULL, + record JSON NOT NULL, + PRIMARY KEY(doi) +); + +CREATE TABLE IF NOT EXISTS grobid_refs ( + source TEXT NOT NULL CHECK (octet_length(source) >= 1), + source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1), + source_ts TIMESTAMP WITH TIME ZONE, + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + refs_json JSON NOT NULL, + PRIMARY KEY(source, source_id) +); + +CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS + SELECT + crossref.doi as doi, + crossref.indexed as indexed, + crossref.record as record, + grobid_refs.source_ts as source_ts, + grobid_refs.refs_json as refs_json + FROM crossref + LEFT JOIN grobid_refs ON + grobid_refs.source_id = crossref.doi + AND grobid_refs.source = 'crossref'; diff --git a/sql/monitoring_queries.md b/sql/monitoring_queries.md new file mode 100644 index 0000000..0859e79 --- /dev/null +++ b/sql/monitoring_queries.md @@ -0,0 +1,202 @@ + +## fatcat-changelog pipeline + +Overall ingest status, past 30 days: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 20; + +Broken domains, past 30 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '30 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Summary of significant domains and status, past 7 days: + + SELECT domain, status, count + FROM ( + SELECT domain, status, COUNT((domain, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + ) t1 + WHERE t1.domain != '' + GROUP BY CUBE (domain, status) + ) t2 + WHERE count > 200 + ORDER BY domain ASC , count DESC; + +Summary of DOI prefix and status, past 7 days: + + SELECT doi_prefix, status, count + FROM ( + SELECT doi_prefix, status, COUNT((doi_prefix, status)) as count + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_request.link_source_id FROM '(10\.[^/]*)/.*') AS doi_prefix + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_file_result.updated >= NOW() - '7 day'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + AND ingest_request.link_source = 'doi' + ) t1 + WHERE t1.doi_prefix != '' + GROUP BY CUBE (doi_prefix, status) + ) t2 + WHERE count > 200 + ORDER BY doi_prefix ASC , count DESC; + + +Throughput per day, and success, for past 30 days: + + SELECT ingest_request.ingest_type, + date(ingest_request.created), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '1 month'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-changelog' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_request.created) + ORDER BY date(ingest_request.created) DESC; + +## fatcat-ingest + +Broken domains, past 7 days: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '7 day'::INTERVAL + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + +Throughput per day, and success, for past 7 days: + + SELECT ingest_request.ingest_type, + date(ingest_file_result.updated), + COUNT(*) as total, + COUNT(CASE ingest_file_result.status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_request.created >= NOW() - '7 day'::INTERVAL + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + GROUP BY ingest_request.ingest_type, ingest_file_result.ingest_type, date(ingest_file_result.updated) + ORDER BY date(ingest_file_result.updated) DESC; + +Overall status, updated requests past 3 days: + + SELECT ingest_request.ingest_type, + ingest_file_result.status, + COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + -- ingest_file_result.updated >= NOW() - '3 day'::INTERVAL + ingest_file_result.updated >= NOW() - '48 hour'::INTERVAL + AND ingest_request.ingest_type = 'pdf' + AND ingest_request.ingest_request_source = 'fatcat-ingest' + GROUP BY ingest_request.ingest_type, ingest_file_result.status + ORDER BY COUNT(*) DESC; + +## savepapernow and fatcat-ingest recent status + +Specific recent ingests (for debugging): + + -- for record layout: \x + SELECT + ingest_file_result.status as status, + ingest_request.ingest_type as ingest_type, + ingest_request.ingest_request_source as source, + ingest_request.link_source_id as source_id, + ingest_request.base_url as base_url, + ingest_file_result.terminal_dt as dt, + ingest_file_result.terminal_status_code as status_code, + ingest_file_result.terminal_sha1hex as sha1hex, + grobid.status as grobid_status + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + LEFT JOIN grobid + ON ingest_file_result.terminal_sha1hex = grobid.sha1hex + WHERE + ingest_file_result.updated >= NOW() - '24 hour'::INTERVAL + -- AND ingest_request.ingest_type = 'pdf' + -- AND ingest_request.ingest_type = 'html' + AND ( + ingest_request.ingest_request_source = 'savepapernow-web' + -- OR ingest_request.ingest_request_source = 'fatcat-ingest' + ) + ORDER BY ingest_file_result.updated DESC + LIMIT 100; + diff --git a/sql/pdftrio_queries.md b/sql/pdftrio_queries.md new file mode 100644 index 0000000..06f718c --- /dev/null +++ b/sql/pdftrio_queries.md @@ -0,0 +1,65 @@ + +## Counts / Status + + SELECT status_code, COUNT(*) FROM pdftrio GROUP BY status_code; + + # NOTE: I earlier deleted a large fraction of non-200 status codes, so + # these aren't representative + status_code | count + -------------+--------- + -4 | 16 + -2 | 26 + 200 | 1117501 + 400 | 2695 + (4 rows) + + + SELECT status, COUNT(*) FROM pdftrio GROUP BY status; + + status | count + ---------------+--------- + error | 2696 + error-connect | 26 + error-timeout | 16 + success | 1118252 + (4 rows) + + SELECT + COUNT(CASE WHEN ensemble_score IS NOT NULL THEN 1 ELSE NULL END) as ensemble_count, + COUNT(CASE WHEN linear_score IS NOT NULL THEN 1 ELSE NULL END) as linear_count, + COUNT(CASE WHEN bert_score IS NOT NULL THEN 1 ELSE NULL END) as bert_count, + COUNT(CASE WHEN image_score IS NOT NULL THEN 1 ELSE NULL END) as image_count + FROM pdftrio; + + + ensemble_count | linear_count | bert_count | image_count + ----------------+--------------+------------+------------- + 1120100 | 976271 | 66209 | 143829 + (1 row) + +## Histograms + + SELECT width_bucket(ensemble_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND ensemble_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(bert_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND bert_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(linear_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND linear_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + + SELECT width_bucket(image_score * 100, 0.0, 100.0, 19) * 5 as buckets, count(*) FROM pdftrio + WHERE status = 'success' + AND image_score IS NOT NULL + GROUP BY buckets + ORDER BY buckets; + diff --git a/sql/random_queries.md b/sql/random_queries.md new file mode 100644 index 0000000..572b4f9 --- /dev/null +++ b/sql/random_queries.md @@ -0,0 +1,193 @@ + +Basic stats (2019-09-23): + + SELECT COUNT(*) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 28,023,760 + => Time: 253897.213 ms (04:13.897) + + SELECT COUNT(DISTINCT sha1hex) FROM cdx WHERE NOT EXISTS (SELECT grobid.sha1hex FROM grobid WHERE cdx.sha1hex = grobid.sha1hex); + => 22,816,087 + => Time: 287097.944 ms (04:47.098) + + SELECT COUNT(*) FROM grobid. + => 56,196,992 + + SELECT COUNT(DISTINCT sha1hex) FROM cdx; + => 64,348,277 + => Time: 572383.931 ms (09:32.384) + + SELECT COUNT(*) FROM cdx; + => 74,796,777 + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC; + => Time: 189067.335 ms (03:09.067) + + mimetype | count + ------------------------+---------- + application/pdf | 51049905 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +Time: 189067.335 ms (03:09.067) + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY count(*) DESC; + + status_code | count + -------------+---------- + 200 | 56196992 + + compare with older sandcrawler/output-prod/2019-05-28-1920.35-statuscodecount: + + 200 49567139 + 400 3464503 + 409 691917 + 500 247028 + 503 123 + + SELECT row_to_json(cdx) FROM cdx LIMIT 5; + + SELECT row_to_json(r) FROM ( + SELECT url, datetime FROM cdx + ) r + LIMIT 5; + +More stats (2019-12-27): + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 20; + + SELECT SUM(size_bytes) FROM file_meta; + +"Last 24 hour progress": + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + +## Parse URLs + +One approach is to do regexes, something like: + + SELECT substring(column_name FROM '[^/]+://([^/]+)/') AS domain_name FROM table_name; + +Eg: + + SELECT DISTINCT(domain), COUNT(domain) + FROM (select substring(base_url FROM '[^/]+://([^/]*)') as domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + GROUP BY domain + ORDER BY COUNT DESC + LIMIT 10; + +Or: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + +Can also do some quick lookups for a specific domain and protocol like: + + SELECT * + FROM ingest_file_result + WHERE terminal_url LIKE 'https://insights.ovid.com/%' + LIMIT 10; + +For a given DOI prefix: + + SELECT * + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/a%' + AND status = 'no-pdf-link' + LIMIT 10; + + SELECT status, count(*) + FROM ingest_file_result + WHERE base_url LIKE 'https://doi.org/10.17223/%' + GROUP BY status + ORDER BY count(*) DESC; + +## Bulk Ingest + +Show bulk ingest status on links *added* in the past week: + + SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '30 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + GROUP BY ingest_file_result.ingest_type, ingest_file_result.status + ORDER BY COUNT DESC + LIMIT 25; + +Top *successful* domains: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 20; + +Summarize non-success domains for the same: + + SELECT domain, status, COUNT((domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE ingest_request.created >= NOW() - '7 day'::INTERVAL + AND ingest_request.link_source = 'unpaywall' + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 20; diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh new file mode 100755 index 0000000..d39a171 --- /dev/null +++ b/sql/reingest_bulk.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_bulk_current.json + +cat /srv/sandcrawler/tasks/reingest_bulk_current.json \ + | shuf \ + | head -n1000000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh new file mode 100755 index 0000000..96e5416 --- /dev/null +++ b/sql/reingest_old.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_old.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_old_current.json + +cat /srv/sandcrawler/tasks/reingest_old_current.json \ + | shuf \ + | head -n1000000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh new file mode 100755 index 0000000..8a2996c --- /dev/null +++ b/sql/reingest_quarterly.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_quarterly.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_quarterly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_quarterly_current.json + +cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \ + | shuf \ + | head -n120000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh new file mode 100755 index 0000000..c693a64 --- /dev/null +++ b/sql/reingest_spn.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_spn.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_spn.rows.json \ + > /srv/sandcrawler/tasks/reingest_spn.json + +cat /srv/sandcrawler/tasks/reingest_spn.json \ + | shuf \ + | head -n60000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 + diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh new file mode 100755 index 0000000..5cb6d51 --- /dev/null +++ b/sql/reingest_terminalstatus_forcerecrawl.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json + +cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \ + | shuf \ + | head -n100000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh new file mode 100755 index 0000000..d2e2444 --- /dev/null +++ b/sql/reingest_weekly.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e # fail on error +set -u # fail if variable not set in substitution +set -o pipefail # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_weekly.sql + +cd ../python +sudo -u sandcrawler pipenv run \ + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_weekly_current.rows.json \ + > /srv/sandcrawler/tasks/reingest_weekly_current.json + +cat /srv/sandcrawler/tasks/reingest_weekly_current.json \ + | shuf \ + | head -n80000 \ + | jq . -c \ + | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql index fd921ed..a3756d4 100644..120000 --- a/sql/sandcrawler_schema.sql +++ b/sql/sandcrawler_schema.sql @@ -1,59 +1 @@ - -CREATE TABLE IF NOT EXISTS cdx ( - url TEXT NOT NULL CHECK (octet_length(url) >= 1), - datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), - sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), - cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), - mimetype TEXT CHECK (octet_length(mimetype) >= 1), - warc_path TEXT CHECK (octet_length(warc_path) >= 1), - warc_csize BIGINT, - warc_offset BIGINT, - row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, - PRIMARY KEY(url, datetime) -); -CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); -CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); - -CREATE TABLE IF NOT EXISTS file_meta ( - sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), - sha256hex TEXT CHECK (octet_length(sha256hex) = 64), - md5hex TEXT CHECK (octet_length(md5hex) = 32), - size_bytes BIGINT, - mimetype TEXT CHECK (octet_length(mimetype) >= 1) -); - -CREATE TABLE IF NOT EXISTS fatcat_file ( - sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), - file_ident TEXT CHECK (octet_length(file_ident) = 26), - first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) -); - -CREATE TABLE IF NOT EXISTS petabox ( - item TEXT NOT NULL CHECK (octet_length(item) >= 1), - path TEXT NOT NULL CHECK (octet_length(path) >= 1), - sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), - PRIMARY KEY(item, path) -); -CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); - -CREATE TABLE IF NOT EXISTS grobid ( - sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), - updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, - grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), - status_code INT NOT NULL, - status TEXT CHECK (octet_length(status) >= 1), - fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), - metadata JSONB -); --- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); - -CREATE TABLE IF NOT EXISTS shadow ( - shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), - shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), - sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), - doi TEXT CHECK (octet_length(doi) >= 1), - pmid TEXT CHECK (octet_length(pmid) >= 1), - isbn13 TEXT CHECK (octet_length(isbn13) >= 1), - PRIMARY KEY(shadow_corpus, shadow_id) -); -CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); +migrations/2019-12-19-060141_init/up.sql
\ No newline at end of file diff --git a/sql/stats/2020-01-13_stats.txt b/sql/stats/2020-01-13_stats.txt new file mode 100644 index 0000000..444e448 --- /dev/null +++ b/sql/stats/2020-01-13_stats.txt @@ -0,0 +1,190 @@ + +## SQL Table Sizes + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid" | 38 GB | 7076 MB | 45 GB + "public"."file_meta" | 23 GB | 11 GB | 34 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 566 MB | 749 MB | 1314 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."ingest_request" | 363 MB | 625 MB | 988 MB + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + + total_count | total_size + -------------+----------------- + 118823340 | 140917467253923 + (1 row) + + # 118,823,340 => 118 million + # 140,917,467,253,923 => ~141 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 117185567 + | 1509149 + application/octet-stream | 87783 + text/html | 9901 + application/postscript | 3781 + application/vnd.ms-powerpoint | 1421 + text/plain | 1151 + application/xml | 427 + application/gzip | 414 + application/msword | 314 + (10 rows) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96141851 | 110030179 + (1 row) + + # 96,141,851 + # 110,030,179 + +Top mimetypes (not unique by sha1): + + mimetype | count + ------------------------+---------- + application/pdf | 84582642 + text/html | 24841846 + text/xml | 524682 + application/postscript | 81009 + (4 rows) + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 13675190 | 59919772 + + # 13,675,190 + # 59,919,772 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 57382904 + 500 | 2536862 + 503 | 6 + (3 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 41699385 + | 15683279 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + # 2,868,825 + # 2,887,834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 2816171 + pdf | arxiv | 154448 + pdf | spn | 55 + pdf | pubmed | 2 + (4 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+------- + (0 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+--------+---------- + pdf | doi | 2816171 | 289199 | 0.103 + pdf | arxiv | 154448 | 41105 | 0.266 + pdf | spn | 55 | 46 | 0.836 + pdf | pubmed | 2 | 0 | 0.000 + (4 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+---------------------+--------- + pdf | no-pdf-link | 2213720 + pdf | success | 330492 + pdf | spn-remote-error | 182157 + pdf | spn-error | 141222 + pdf | cdx-error | 83131 + pdf | link-loop | 11350 + pdf | other-mimetype | 6089 + pdf | null-body | 1980 + pdf | terminal-bad-status | 583 + pdf | wayback-error | 381 + (10 rows) + diff --git a/sql/stats/2020-01-31_supplement.txt b/sql/stats/2020-01-31_supplement.txt new file mode 100644 index 0000000..6bd43ea --- /dev/null +++ b/sql/stats/2020-01-31_supplement.txt @@ -0,0 +1,42 @@ + +How many file_meta still missing core metadata? + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + => 1,130,915 + +Great! Not many. + +And are in petabox? + + SELECT COUNT(*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL; + => 1,149,194 + +Almost all; maybe just some CDX fetch failures or something in there. So, +should run these on, eg, grobid2-vm. + + COPY ( + SELECT row_to_json(petabox.*) + FROM file_meta + LEFT JOIN petabox ON file_meta.sha1hex = petabox.sha1hex + WHERE file_meta.sha256hex IS NULL + AND file_meta.sha1hex IS NOT NULL + ) TO '/grande/snapshots/dump_grobid_petabox_todo.json'; + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file` (note: `fatcat_file` is out of date by a +couple million files): + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | count + -------------+--------- + 5072452 | 4130405 + diff --git a/sql/stats/2020-02-24_stats.txt b/sql/stats/2020-02-24_stats.txt new file mode 100644 index 0000000..e7a00e8 --- /dev/null +++ b/sql/stats/2020-02-24_stats.txt @@ -0,0 +1,482 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + Size: 271.83G + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 36 GB | 78 GB + "public"."grobid_shadow" | 61 GB | 6553 MB | 68 GB + "public"."grobid" | 47 GB | 7213 MB | 54 GB + "public"."file_meta" | 26 GB | 12 GB | 38 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."ingest_file_result" | 1831 MB | 2454 MB | 4285 MB + "public"."ingest_request" | 2006 MB | 2122 MB | 4128 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + "public"."pdftrio" | 78 MB | 64 MB | 142 MB + (10 rows) + + +## File Metadata + +(skipping, no update) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + unique_releases | total + -----------------+---------- + 15,632,810 | 76,555,791 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 70656028 + 500 | 5896836 + -4 | 2295 + 503 | 111 + (4 rows) + + What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 56001631 + | 14654496 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2,868,825 | 2,887,834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591633 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + pdf | pubmed | 2 + (6 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | | 2943896 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | | 629719 + pdf | doi | fatcat-changelog | 129932 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | arxiv | fatcat-ingest | 998 + pdf | spn | | 64 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | spn | savepapernow-web | 18 + pdf | pubmed | | 2 + pdf | doi | savepapernow-web | 1 + (14 rows) + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-changelog' WHERE ingest_type = 'pdf' AND link_source = 'doi' AND ingest_request_source IS NULL; + => UPDATE 2943896 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'savepapernow-web' WHERE ingest_type = 'pdf' AND link_source = 'spn' AND ingest_request_source IS NULL; + => UPDATE 64 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'arxiv' AND ingest_request_source IS NULL; + => UPDATE 629719 + + SELECT count(*) FROM ingest_request WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + UPDATE ingest_request SET ingest_request_source = 'fatcat-ingest' WHERE ingest_type = 'pdf' AND link_source = 'pmc' AND ingest_request_source IS NULL; + => UPDATE 1454 + + SELECT count(*) FROM ingest_request WHERE link_source = 'pubmed'; + DELETE FROM ingest_request WHERE link_source = 'pubmed'; + => DELETE 2 + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doi | 6591637 + pdf | pmc | 2030279 + pdf | arxiv | 630743 + pdf | unpaywall | 1400 + pdf | spn | 82 + (5 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-------------+-------------------------+--------- + pdf | doi | fatcat-ingest-container | 3515873 + pdf | doi | fatcat-changelog | 3073828 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630717 + pdf | doi | fatcat-ingest | 1935 + pdf | pmc | fatcat-ingest | 1454 + pdf | unpaywall | unpaywall | 1400 + pdf | spn | savepapernow-web | 82 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 1 + (10 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + none? + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-------------+----------+---------+---------- + pdf | doi | 6591637 | 1622702 | 0.246 + pdf | pmc | 2030279 | 1241836 | 0.612 + pdf | arxiv | 630743 | 500620 | 0.794 + pdf | unpaywall | 1400 | 851 | 0.608 + pdf | spn | 82 | 62 | 0.756 + (5 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------------+--------- + pdf | success | 3366189 + pdf | no-pdf-link | 2902620 + pdf | no-capture | 1672025 + pdf | redirect-loop | 388844 + pdf | cdx-error | 272780 + pdf | terminal-bad-status | 171878 + pdf | spn-remote-error | 163843 + pdf | spn-error | 108070 + pdf | null-body | 66778 + pdf | link-loop | 43403 + pdf | skip-url-blocklist | 34705 + pdf | wrong-mimetype | 31343 + pdf | wayback-error | 13012 + pdf | spn2-cdx-lookup-failure | 6100 + pdf | gateway-timeout | 5633 + pdf | other-mimetype | 5114 + pdf | spn2-error:proxy-error | 538 + pdf | spn2-error:job-failed | 470 + pdf | petabox-error | 415 + pdf | spn2-error:browser-running-error | 136 + pdf | spn2-error | 127 + pdf | spn2-error:soft-time-limit-exceeded | 71 + pdf | bad-redirect | 39 + pdf | spn2-error:unknown | 30 + pdf | spn2-error:browsing-timeout | 25 + pdf | pending | 3 + pdf | invalid-host-resolution | 1 + (27 rows) + + +## Fatcat Files + +(skipping, no update) + +## Recent Success/Failure of Ingest by Domain + +NOTE: just finished a bunch of "backfill" ingest from OA-DOI crawl; only a +small fraction of this is from changelog. + + # "problem domains" and statuses + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + AND t1.updated >= NOW() - '1 day'::INTERVAL + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 10; + + domain | status | count + -------------------------+----------------+------- + linkinghub.elsevier.com | no-capture | 2579 + www.mdpi.com | wrong-mimetype | 1313 + onlinelibrary.wiley.com | no-pdf-link | 785 + americanarchivist.org | no-pdf-link | 756 + journals.sagepub.com | redirect-loop | 503 + link.springer.com | redirect-loop | 432 + iopscience.iop.org | no-capture | 392 + www.tandfonline.com | no-pdf-link | 389 + pubs.rsc.org | no-capture | 361 + www.persee.fr | no-capture | 344 + (10 rows) + + + # "what type of errors" + SELECT ingest_type, status, COUNT(*) + FROM ingest_file_result + WHERE updated >= NOW() - '1 day'::INTERVAL + GROUP BY ingest_type, status + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+------- + pdf | success | 40578 + pdf | cdx-error | 14982 + pdf | no-capture | 7747 + pdf | no-pdf-link | 7111 + pdf | redirect-loop | 3265 + pdf | wrong-mimetype | 1629 + pdf | spn2-cdx-lookup-failure | 657 + pdf | link-loop | 538 + pdf | null-body | 517 + pdf | terminal-bad-status | 400 + pdf | wayback-error | 79 + pdf | spn2-error:job-failed | 53 + pdf | gateway-timeout | 38 + pdf | spn2-error:soft-time-limit-exceeded | 7 + pdf | spn2-error | 6 + pdf | petabox-error | 5 + pdf | spn2-error:browsing-timeout | 4 + pdf | spn2-error:unknown | 2 + pdf | bad-redirect | 1 + pdf | pending | 1 + (20 rows) + + # "throughput per day for last N days" + SELECT ingest_type, + date(updated), + COUNT(*) as total, + COUNT(CASE status WHEN 'success' THEN 1 ELSE null END) as success + FROM ingest_file_result + WHERE updated >= NOW() - '1 month'::INTERVAL + GROUP BY ingest_type, date(updated) + ORDER BY date(updated) DESC; + + ingest_type | date | total | success + -------------+------------+---------+--------- + pdf | 2020-02-25 | 32660 | 14322 + pdf | 2020-02-24 | 44967 | 26263 + pdf | 2020-02-23 | 58795 | 18874 + pdf | 2020-02-22 | 844249 | 272606 + pdf | 2020-02-21 | 1287378 | 433487 + pdf | 2020-02-20 | 1455943 | 492408 + pdf | 2020-02-19 | 21453 | 7529 + pdf | 2020-02-18 | 5863 | 2926 + pdf | 2020-02-17 | 3737 | 970 + pdf | 2020-02-16 | 13779 | 4862 + pdf | 2020-02-15 | 1021020 | 623020 + pdf | 2020-02-14 | 1036036 | 632830 + pdf | 2020-02-13 | 13503 | 5824 + pdf | 2020-02-12 | 20078 | 11422 + pdf | 2020-02-11 | 13499 | 6781 + pdf | 2020-02-10 | 2275 | 961 + pdf | 2020-02-09 | 3231 | 1494 + pdf | 2020-02-08 | 8967 | 4400 + pdf | 2020-02-07 | 7022 | 2430 + pdf | 2020-02-06 | 1291 | 516 + pdf | 2020-02-05 | 8586 | 6596 + pdf | 2020-02-04 | 3681 | 3593 + pdf | 2020-02-03 | 284 | 284 + pdf | 2020-02-02 | 480 | 480 + pdf | 2020-02-01 | 489 | 336 + pdf | 2020-01-31 | 1187 | 1130 + pdf | 2020-01-30 | 1613 | 1288 + pdf | 2020-01-29 | 947 | 279 + pdf | 2020-01-28 | 667 | 323 + (29 rows) + +Top "no-capture" domains (will need to re-ingest using live tool): + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-capture' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + --------------------------+------------+-------- + linkinghub.elsevier.com | no-capture | 320065 + iopscience.iop.org | no-capture | 46858 + pubs.rsc.org | no-capture | 43331 + www.persee.fr | no-capture | 38971 + www.doiserbia.nb.rs | no-capture | 27112 + academic.oup.com | no-capture | 18877 + www.osapublishing.org | no-capture | 17113 + osf.io | no-capture | 16978 + scripts.iucr.org | no-capture | 14844 + www.degruyter.com | no-capture | 8093 + mab-online.nl | no-capture | 6603 + insights.ovid.com | no-capture | 6457 + ir.lib.uth.gr | no-capture | 3625 + www.sciencedirect.com | no-capture | 3244 + www.tandfonline.com | no-capture | 3201 + www.ccsenet.org | no-capture | 2849 + www.intechopen.com | no-capture | 2813 + primary-hospital-care.ch | no-capture | 2774 + www.nature.com | no-capture | 2484 + www.indianjournals.com | no-capture | 2432 + journals.aps.org | no-capture | 2197 + journals.sagepub.com | no-capture | 2064 + www.episodes.org | no-capture | 1805 + periodicos.uninove.br | no-capture | 1692 + escholarship.org | no-capture | 1666 + (25 rows) + +Top "no-pdf-link" domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND t1.status = 'no-pdf-link' + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + -----------------------------+-------------+-------- + plutof.ut.ee | no-pdf-link | 685315 + www.gbif.org | no-pdf-link | 670647 + doi.pangaea.de | no-pdf-link | 301984 + www.plate-archive.org | no-pdf-link | 209218 + onlinelibrary.wiley.com | no-pdf-link | 84890 + figshare.com | no-pdf-link | 72892 + zenodo.org | no-pdf-link | 45768 + www.tandfonline.com | no-pdf-link | 43848 + data.mendeley.com | no-pdf-link | 42367 + springernature.figshare.com | no-pdf-link | 35941 + dhz.uni-passau.de | no-pdf-link | 29187 + www.frontiersin.org | no-pdf-link | 17925 + digital.ucd.ie | no-pdf-link | 16769 + mr.crossref.org | no-pdf-link | 14999 + journals.lww.com | no-pdf-link | 12122 + musewide.aip.de | no-pdf-link | 10854 + datadryad.org | no-pdf-link | 10686 + www.jstor.org | no-pdf-link | 9159 + koreascience.or.kr | no-pdf-link | 9067 + easy.dans.knaw.nl | no-pdf-link | 8264 + scielo.conicyt.cl | no-pdf-link | 8069 + www.degruyter.com | no-pdf-link | 7989 + www.kci.go.kr | no-pdf-link | 6990 + www.m-hikari.com | no-pdf-link | 6941 + cshprotocols.cshlp.org | no-pdf-link | 6553 + (25 rows) + +Top block-ish domains: + + SELECT domain, status, COUNT((domain, status)) + FROM (SELECT status, updated, substring(terminal_url FROM '[^/]+://([^/]*)') AS domain FROM ingest_file_result) t1 + WHERE t1.domain != '' + AND (t1.status = 'redirect-loop' OR t1.status = 'link-loop' OR t1.status = 'terminal-bad-status') + GROUP BY domain, status + ORDER BY COUNT DESC + LIMIT 25; + + domain | status | count + ---------------------------------+---------------------+------- + journals.openedition.org | redirect-loop | 30395 + ieeexplore.ieee.org | redirect-loop | 28926 + www.degruyter.com | redirect-loop | 18891 + www.cairn.info | link-loop | 8919 + www.frontiersin.org | terminal-bad-status | 6786 + projecteuclid.org | link-loop | 6098 + www.mdpi.com | terminal-bad-status | 5189 + medicalforum.ch | terminal-bad-status | 4596 + jrnl.nau.edu.ua | link-loop | 4238 + www.revistas.unam.mx | link-loop | 3926 + journals.aps.org | redirect-loop | 3696 + www.ijcseonline.org | redirect-loop | 3567 + www.researchsquare.com | terminal-bad-status | 3453 + www.persee.fr | terminal-bad-status | 3221 + www.baltistica.lt | link-loop | 2098 + osf.io | redirect-loop | 2004 + seer.ufrgs.br | terminal-bad-status | 2002 + jtd.amegroups.com | link-loop | 1738 + www.hindawi.com | terminal-bad-status | 1613 + linkinghub.elsevier.com | redirect-loop | 1612 + www.scienceopen.com | terminal-bad-status | 1580 + atm.amegroups.com | link-loop | 1571 + scielo.conicyt.cl | terminal-bad-status | 1491 + repozytorium.ur.edu.pl | redirect-loop | 1279 + agupubs.onlinelibrary.wiley.com | link-loop | 1182 + (25 rows) + diff --git a/sql/stats/2020-05-03_stats.txt b/sql/stats/2020-05-03_stats.txt new file mode 100644 index 0000000..55f0c1e --- /dev/null +++ b/sql/stats/2020-05-03_stats.txt @@ -0,0 +1,418 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 41 GB | 82 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 59 GB | 7604 MB | 66 GB + "public"."file_meta" | 31 GB | 28 GB | 59 GB + "public"."ingest_request" | 19 GB | 20 GB | 39 GB + "public"."ingest_file_result" | 15 GB | 23 GB | 39 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (10 rows) + + Size: 383.93G + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 158059828 | 197346217653010 + (1 row) + + => 158 million, 197 terabytes + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 157805029 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6040 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1027125 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + + unique_sha1 | total + -------------+----------- + 92936564 | 111022039 + (1 row) + + => 110 million rows, 92.9 million files + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------------------------------------------------------------+----------- + application/pdf | 104178718 + warc/revisit | 5274410 + text/xml | 519042 + text/html | 295523 + application/octet-stream | 259681 + unk | 138930 + application/postscript | 81065 + application/save | 80765 + binary/octet-stream | 59804 + application/x-download | 27083 + text/plain | 26938 + application/download | 25125 + image/pdf | 16095 + application/force-download | 9004 + application/x-msdownload | 3711 + application | 2934 + application/x-octetstream | 2926 + multipart/form-data | 2741 + application/x-pdf | 2444 + .pdf | 2368 + application/binary | 1268 + application/pdf' | 1192 + pdf | 1113 + file/unknown | 1086 + application/unknown | 761 + file | 753 + application/blob | 670 + application/octetstream | 657 + text/pdf | 549 + 0 | 417 + ('application/pdf', | 349 + application/http;msgtype=response | 251 + application/doc | 180 + [...] (wasn't LIMIT 25) + +Processed or not: + + # TODO: + +## GROBID + +Counts: + + SELECT COUNT(DISTINCT fatcat_release) AS unique_releases, COUNT(*) AS total FROM grobid; + + + unique_releases | total + -----------------+---------- + 17455441 | 92707544 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057023 + | 14638425 + (2 rows) + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status = 'success' GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 71057074 + | 3 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | unpaywall | 26244088 + pdf | mag | 25596658 + pdf | doi | 15652966 + pdf | pmc | 2043646 + pdf | arxiv | 721902 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 103 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | unpaywall | unpaywall | 26244088 + pdf | mag | mag-corpus | 25596658 + pdf | doi | fatcat-ingest | 8267308 + pdf | doi | fatcat-changelog | 3869772 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630719 + pdf | arxiv | fatcat-changelog | 91157 + pdf | pmc | fatcat-ingest | 10195 + pdf | pmc | fatcat-changelog | 4626 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 103 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 15 + (15 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+------- + pdf | mag | 47 + pdf | unpaywall | 1 + (2 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 25; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | unpaywall | 26244088 | 19968092 | 0.761 + pdf | mag | 25596658 | 18712912 | 0.731 + pdf | doi | 15653166 | 2878833 | 0.184 + pdf | pmc | 2043646 | 1279529 | 0.626 + pdf | arxiv | 721902 | 592394 | 0.821 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 103 | 82 | 0.796 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 37449502 + pdf | no-pdf-link | 10908442 + pdf | no-capture | 5643670 + pdf | redirect-loop | 4823502 + pdf | terminal-bad-status | 1715056 + pdf | link-loop | 1425072 + pdf | cdx-error | 535365 + pdf | gateway-timeout | 267654 + pdf | skip-url-blocklist | 220433 + pdf | wrong-mimetype | 189804 + pdf | spn2-cdx-lookup-failure | 103926 + pdf | spn-error | 101777 + pdf | wayback-error | 93517 + pdf | null-body | 87279 + pdf | invalid-host-resolution | 35305 + pdf | spn-remote-error | 28888 + pdf | petabox-error | 12406 + pdf | spn2-error | 2905 + pdf | spn2-error:job-failed | 2307 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 745 + pdf | spn2-error:proxy-error | 438 + pdf | spn2-error:invalid-url-syntax | 406 + pdf | spn2-error:soft-time-limit-exceeded | 405 + pdf | spn2-error:browser-running-error | 274 + (25 rows) + +Failures by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status != 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | domain | status | count + -------------+---------------------------------------+---------------------+-------- + pdf | ssl.fao.org | no-pdf-link | 862277 + pdf | www.researchgate.net | redirect-loop | 749094 + pdf | www.e-periodica.ch | no-pdf-link | 747370 + pdf | ieeexplore.ieee.org | redirect-loop | 707482 + pdf | plutof.ut.ee | no-pdf-link | 685341 + pdf | www.gbif.org | no-pdf-link | 670905 + pdf | dlc.library.columbia.edu | no-pdf-link | 508281 + pdf | figshare.com | no-pdf-link | 400501 + pdf | onlinelibrary.wiley.com | no-pdf-link | 399187 + pdf | watermark.silverchair.com | terminal-bad-status | 357188 + pdf | www.die-bonn.de | redirect-loop | 352903 + pdf | academic.oup.com | no-pdf-link | 346828 + pdf | iopscience.iop.org | terminal-bad-status | 345147 + pdf | linkinghub.elsevier.com | no-capture | 328434 + pdf | statisticaldatasets.data-planet.com | no-pdf-link | 312206 + pdf | cyberleninka.ru | link-loop | 309525 + pdf | www.tandfonline.com | no-pdf-link | 309146 + pdf | dialnet.unirioja.es | terminal-bad-status | 307572 + pdf | doi.pangaea.de | no-pdf-link | 304924 + pdf | journals.sagepub.com | no-pdf-link | 285774 + pdf | papers.ssrn.com | link-loop | 282415 + pdf | dialnet.unirioja.es | redirect-loop | 274476 + pdf | ieeexplore.ieee.org | link-loop | 273607 + pdf | catalog.paradisec.org.au | redirect-loop | 234653 + pdf | www.plate-archive.org | no-pdf-link | 209217 + pdf | zenodo.org | no-pdf-link | 200078 + pdf | zenodo.org | no-capture | 199025 + pdf | spectradspace.lib.imperial.ac.uk:8443 | no-pdf-link | 187084 + pdf | digi.ub.uni-heidelberg.de | no-pdf-link | 187039 + pdf | validate.perfdrive.com | no-pdf-link | 180191 + (30 rows) + +Success by domain: + + SELECT ingest_type, domain, status, COUNT((ingest_type, domain, status)) + FROM ( + SELECT + ingest_file_result.ingest_type as ingest_type, + ingest_file_result.status, + substring(ingest_file_result.terminal_url FROM '[^/]+://([^/]*)') AS domain + FROM ingest_file_result + LEFT JOIN ingest_request + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + ) t1 + WHERE t1.domain != '' + AND t1.status = 'success' + GROUP BY ingest_type, domain, status + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | domain | status | count + -------------+----------------------------+---------+--------- + pdf | www.jstage.jst.go.jp | success | 2244620 + pdf | europepmc.org | success | 1284770 + pdf | link.springer.com | success | 1017998 + pdf | www.scielo.br | success | 799577 + pdf | arxiv.org | success | 592622 + pdf | downloads.hindawi.com | success | 527278 + pdf | res.mdpi.com | success | 501093 + pdf | hal.archives-ouvertes.fr | success | 447877 + pdf | digital.library.unt.edu | success | 404460 + pdf | www.cambridge.org | success | 394666 + pdf | dergipark.org.tr | success | 373706 + pdf | journals.plos.org | success | 296994 + pdf | watermark.silverchair.com | success | 275562 + pdf | www.nature.com | success | 263836 + pdf | cds.cern.ch | success | 223057 + pdf | www.pnas.org | success | 220488 + pdf | s3-eu-west-1.amazonaws.com | success | 214558 + pdf | www.jbc.org | success | 205277 + pdf | www.redalyc.org | success | 193591 + pdf | iopscience.iop.org | success | 175796 + pdf | apps.dtic.mil | success | 170589 + pdf | zenodo.org | success | 167812 + pdf | peerj.com | success | 155620 + pdf | www.biorxiv.org | success | 149337 + pdf | 210.101.116.28 | success | 145706 + pdf | www.teses.usp.br | success | 145438 + pdf | absimage.aps.org | success | 144400 + pdf | hrcak.srce.hr | success | 134669 + pdf | www.erudit.org | success | 131771 + pdf | babel.hathitrust.org | success | 130645 + (30 rows) + + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + => NOT RUN, fatcat_file table is way out of date + diff --git a/sql/stats/2020-07-23_stats.txt b/sql/stats/2020-07-23_stats.txt new file mode 100644 index 0000000..d1993fc --- /dev/null +++ b/sql/stats/2020-07-23_stats.txt @@ -0,0 +1,347 @@ + +Summary: + +- very many more PDFs have been grobid-ed vs. pdf_meta-ed +- about 1 million file_meta still have partial metadata (eg, no sha256) +- database size still under 0.5 TByte +- there are about a million CDX error ingest requests, and hundreds of + thousands of SPN errors which could be re-run + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 42 GB | 42 GB | 84 GB + "public"."ingest_request" | 34 GB | 39 GB | 73 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."grobid" | 61 GB | 7742 MB | 69 GB + "public"."file_meta" | 32 GB | 29 GB | 61 GB + "public"."ingest_file_result" | 24 GB | 36 GB | 60 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 8018 MB | 1966 MB | 9984 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + Size: 466.91G + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 161944425 | 204,402,677,360,189 + (1 row) + + # 161.9 mil; 204 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 161691608 + application/octet-stream | 154348 + application/xml | 42170 + text/html | 18703 + text/plain | 15989 + application/gzip | 6484 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + --------- + 1015337 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 96537611 | 116281981 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ---------------------------------------------------+----------- + application/pdf | 108706978 + warc/revisit | 5912013 + text/xml | 519042 + application/octet-stream | 307782 + text/html | 295634 + unk | 156937 + application/postscript | 81079 + application/save | 80871 + binary/octet-stream | 61263 + text/plain | 31495 + application/x-download | 30511 + application/download | 26716 + image/pdf | 26357 + application/force-download | 10541 + multipart/form-data | 5551 + application/x-msdownload | 3724 + application/x-octetstream | 3216 + application | 3171 + .pdf | 2728 + application/x-pdf | 2563 + application/binary | 1306 + application/pdf' | 1192 + pdf | 1180 + [...] + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + + total_files | unique_releases + -------------+----------------- + 95557413 | 18020570 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + + status_code | count + -------------+---------- + 200 | 88450610 + 500 | 7101098 + -4 | 4133 + 503 | 110 + + SELECT status, COUNT(*) FROM grobid GROUP BY ORDER BY COUNT DESC LIMIT 10; + + status | count + ----------------+---------- + success | 73814297 + | 14638412 + error | 7101308 + error-timeout | 4133 + bad-grobid-xml | 6 + (5 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 73813427 + | 14638425 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 16589669 + pdf | pmc | 2231113 + pdf | arxiv | 794693 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 148 + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8320832 + pdf | doi | fatcat-changelog | 4752956 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 630750 + pdf | pmc | fatcat-ingest | 194781 + pdf | arxiv | fatcat-changelog | 163924 + pdf | pmc | fatcat-changelog | 7507 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 148 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 19 + pdf | arxiv | savepapernow-web | 2 + +Uncrawled requests by source: + + # TODO: verify this? seems wrong + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | mag | 4097008 + pdf | oai | 15287 + pdf | unpaywall | 1 + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 5346057 | 0.104 + pdf | mag | 35015357 | 22199271 | 0.634 + pdf | unpaywall | 27653003 | 22067338 | 0.798 + pdf | doi | 16589700 | 3207661 | 0.193 + pdf | pmc | 2231113 | 1696976 | 0.761 + pdf | arxiv | 794727 | 645607 | 0.812 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 148 | 114 | 0.770 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 46465271 + pdf | no-capture | 46115869 + pdf | no-pdf-link | 13877460 + pdf | redirect-loop | 5943956 + pdf | terminal-bad-status | 1962754 + pdf | link-loop | 1630078 + pdf | cdx-error | 1014409 + pdf | gateway-timeout | 459340 + pdf | wrong-mimetype | 321774 + pdf | skip-url-blocklist | 220629 + pdf | wayback-error | 220453 + pdf | spn2-cdx-lookup-failure | 143963 + pdf | null-body | 113384 + pdf | spn-error | 101773 + pdf | invalid-host-resolution | 37367 + pdf | spn-remote-error | 28886 + pdf | petabox-error | 22997 + pdf | spn2-error | 16342 + pdf | spn2-error:job-failed | 5017 + pdf | other-mimetype | 2305 + pdf | redirects-exceeded | 746 + pdf | spn2-error:soft-time-limit-exceeded | 632 + pdf | spn2-error:proxy-error | 437 + pdf | spn2-error:invalid-url-syntax | 417 + pdf | timeout | 417 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + + total_count | release_count + -------------+--------------- + 5862666 | 4728824 + (1 row) + +## PDF Meta + +Total rows: + + SELECT COUNT(*) as total_count FROM pdf_meta; + + + total_count + ------------- + 21961874 + +By status: + + SELECT status, COUNT(*) from pdf_meta GROUP BY status ORDER BY COUNT(*) DESC; + + status | count + ----------------+---------- + success | 21788507 + parse-error | 78196 + text-too-large | 60595 + not-pdf | 31679 + error-wayback | 2639 + bad-unicode | 251 + bad-pdf | 6 + empty-blob | 1 + (8 rows) + diff --git a/sql/stats/2020-09-14_stats.txt b/sql/stats/2020-09-14_stats.txt new file mode 100644 index 0000000..3bc27b0 --- /dev/null +++ b/sql/stats/2020-09-14_stats.txt @@ -0,0 +1,340 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 44 GB | 45 GB | 89 GB + "public"."grobid" | 66 GB | 8127 MB | 74 GB + "public"."ingest_request" | 34 GB | 40 GB | 73 GB + "public"."ingest_file_result" | 28 GB | 44 GB | 72 GB + "public"."grobid_shadow" | 64 GB | 6902 MB | 71 GB + "public"."file_meta" | 33 GB | 30 GB | 63 GB + "public"."shadow" | 9111 MB | 10204 MB | 19 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."pdf_meta" | 12 GB | 2924 MB | 15 GB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (11 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 167021210 | 221982345333674 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 10; + + mimetype | count + -------------------------------+----------- + application/pdf | 166765214 + application/octet-stream | 155517 + application/xml | 42170 + text/html | 18708 + text/plain | 15990 + application/gzip | 6491 + | 6036 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + (10 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62960 + (1 row) + + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 102123051 | 126550160 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 25; + + mimetype | count + ----------------------------+----------- + application/pdf | 116885565 + warc/revisit | 7951816 + text/xml | 519042 + application/octet-stream | 327639 + text/html | 295725 + unk | 172491 + application/postscript | 81095 + application/save | 80900 + binary/octet-stream | 61783 + text/plain | 33684 + image/pdf | 32856 + application/x-download | 32418 + application/download | 27672 + application/force-download | 10892 + multipart/form-data | 5750 + application/x-msdownload | 3832 + application/x-octetstream | 3516 + application | 3499 + .pdf | 3038 + application/x-pdf | 2701 + application/binary | 1322 + pdf | 1232 + file/unknown | 1199 + application/pdf' | 1192 + file | 979 + (25 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 101494314 | 18919012 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 10; + + status_code | count + -------------+---------- + 200 | 93730358 + 500 | 7759103 + -4 | 4683 + 503 | 150 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 10; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 80838234 + | 12892145 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; +unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 27653003 + pdf | doi | 17362763 + pdf | pmc | 2248854 + pdf | arxiv | 835400 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 197 + (9 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 27653003 + pdf | doi | fatcat-ingest | 8399261 + pdf | doi | fatcat-changelog | 5449349 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 634665 + pdf | pmc | fatcat-ingest | 210453 + pdf | arxiv | fatcat-changelog | 200707 + pdf | pmc | fatcat-changelog | 9582 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 197 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | doi | savepapernow-web | 21 + pdf | arxiv | savepapernow-web | 2 + (17 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 170304 + pdf | oai | 15287 + pdf | unpaywall | 1 + (3 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14144314 | 0.276 + pdf | mag | 35015357 | 24811947 | 0.709 + pdf | unpaywall | 27653003 | 22302629 | 0.807 + pdf | doi | 17363369 | 3533568 | 0.204 + pdf | pmc | 2248860 | 1713197 | 0.762 + pdf | arxiv | 835400 | 685219 | 0.820 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 197 | 138 | 0.701 + (9 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 25; + + ingest_type | status | count + -------------+-------------------------------------+---------- + pdf | success | 58265365 + pdf | no-pdf-link | 27216435 + pdf | no-capture | 21982611 + pdf | redirect-loop | 8457469 + pdf | terminal-bad-status | 2695023 + pdf | link-loop | 2209672 + pdf | wrong-mimetype | 767508 + pdf | gateway-timeout | 548870 + pdf | cdx-error | 391611 + pdf | skip-url-blocklist | 220661 + pdf | null-body | 182215 + pdf | wayback-error | 146869 + pdf | spn2-cdx-lookup-failure | 107229 + pdf | spn-error | 85128 + pdf | invalid-host-resolution | 37352 + pdf | petabox-error | 32490 + pdf | spn2-error | 29212 + pdf | spn-remote-error | 27927 + pdf | other-mimetype | 2305 + pdf | bad-redirect | 1524 + pdf | spn2-error:job-failed | 1521 + pdf | timeout | 842 + pdf | spn2-error:soft-time-limit-exceeded | 793 + pdf | redirects-exceeded | 748 + pdf | spn2-error:invalid-url-syntax | 417 + (25 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) + FROM ingest_file_result + WHERE hit = false + GROUP BY ingest_type, terminal_status_code + ORDER BY COUNT DESC + LIMIT 25; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 34064937 + pdf | | 20514531 + pdf | 301 | 7271700 + pdf | 302 | 720632 + pdf | 503 | 712697 + pdf | 400 | 444209 + pdf | 404 | 331495 + pdf | 403 | 323030 + pdf | 401 | 259327 + pdf | 500 | 236122 + pdf | 303 | 101609 + pdf | 429 | 47738 + pdf | 502 | 36183 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14790 + pdf | 999 | 8996 + pdf | 307 | 3769 + pdf | 308 | 3422 + pdf | 202 | 3228 + pdf | 520 | 2058 + pdf | 410 | 1734 + pdf | 521 | 1033 + pdf | 504 | 868 + pdf | 505 | 424 + (25 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 6600758 | 5213294 + (1 row) + diff --git a/sql/stats/2021-04-07_stats.txt b/sql/stats/2021-04-07_stats.txt new file mode 100644 index 0000000..fca76b9 --- /dev/null +++ b/sql/stats/2021-04-07_stats.txt @@ -0,0 +1,430 @@ + +## SQL Table Sizes + + Size: 551.34G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 50 GB | 100 GB + "public"."ingest_file_result" | 33 GB | 52 GB | 85 GB + "public"."ingest_request" | 39 GB | 45 GB | 83 GB + "public"."grobid" | 70 GB | 8613 MB | 78 GB + "public"."grobid_shadow" | 67 GB | 7208 MB | 74 GB + "public"."file_meta" | 35 GB | 31 GB | 66 GB + "public"."pdf_meta" | 19 GB | 4925 MB | 24 GB + "public"."shadow" | 9517 MB | 10 GB | 20 GB + "public"."fatcat_file" | 12 GB | 6656 MB | 18 GB + "public"."html_meta" | 1172 MB | 10 MB | 1182 MB + "public"."pdftrio" | 618 MB | 432 MB | 1051 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB + (12 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 174200807 | 234313766162033 + (1 row) + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 173816433 + application/octet-stream | 155534 + text/html | 115821 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 721 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 297 + application/x-compress | 272 + application/zip | 131 + application/CDFV2-unknown | 99 + image/png | 88 + application/mac-binhex40 | 79 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + application/vnd.ms-excel | 28 + message/rfc822 | 26 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62271 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 113880640 | 141793694 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 131346703 + warc/revisit | 8394443 + text/xml | 525481 + application/octet-stream | 502400 + text/html | 417579 + unk | 186703 + application/postscript | 81095 + application/save | 80915 + binary/octet-stream | 66698 + application/x-download | 35771 + text/plain | 35606 + image/pdf | 33904 + application/download | 29701 + application/force-download | 16726 + multipart/form-data | 6878 + application/x-msdownload | 3843 + application | 3724 + application/x-octetstream | 3550 + .pdf | 3138 + application/x-pdf | 2780 + application/binary | 1332 + pdf | 1247 + file/unknown | 1200 + application/pdf' | 1192 + file | 1108 + application/unknown | 978 + application/octetstream | 856 + application/blob | 673 + text/pdf | 672 + 0 | 546 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files, COUNT(DISTINCT fatcat_release) AS unique_releases FROM grobid; + + total_files | unique_releases + -------------+----------------- + 105594307 | 19594878 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+---------- + 200 | 97714631 + 500 | 7875192 + -4 | 4772 + 503 | 520 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------+---------- + 0.5.5-fatcat | 84822508 + | 12892147 + (2 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 35015357 + pdf | unpaywall | 31772942 + pdf | doi | 23528817 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2277417 + pdf | arxiv | 2143549 + xml | doaj | 9442 + html | doi | 3022 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 469 + html | spn | 9 + (14 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 35015357 + pdf | unpaywall | unpaywall | 31772942 + pdf | doi | fatcat-changelog | 11010764 + pdf | doi | fatcat-ingest | 9002119 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767703 + pdf | arxiv | fatcat-changelog | 375818 + pdf | pmc | fatcat-ingest | 211264 + pdf | pmc | fatcat-changelog | 37328 + xml | doaj | doaj | 9442 + html | doi | fatcat-ingest | 3018 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 469 + pdf | doi | savepapernow-web | 74 + pdf | arxiv | fatcat-ingest-container | 26 + html | spn | savepapernow-web | 9 + html | doi | savepapernow-web | 4 + pdf | arxiv | savepapernow-web | 2 + (23 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 168462 + pdf | oai | 15286 + pdf | doaj | 2068 + html | doaj | 620 + pdf | unpaywall | 13 + (5 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14163500 | 0.277 + pdf | mag | 35015357 | 24818176 | 0.709 + pdf | unpaywall | 31772942 | 25018501 | 0.787 + pdf | doi | 23529041 | 5773728 | 0.245 + pdf | doaj | 4264610 | 2851328 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2277417 | 1736491 | 0.762 + pdf | arxiv | 2143549 | 2011378 | 0.938 + xml | doaj | 9442 | 6897 | 0.730 + html | doi | 3022 | 957 | 0.317 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 469 | 328 | 0.699 + html | spn | 9 | 2 | 0.222 + (14 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+--------------------------------+---------- + pdf | success | 66487928 + pdf | no-pdf-link | 29279677 + pdf | no-capture | 22765431 + pdf | redirect-loop | 9155767 + pdf | terminal-bad-status | 3549665 + pdf | link-loop | 2592983 + html | wrong-scope | 1088793 + pdf | wrong-mimetype | 792563 + pdf | gateway-timeout | 478181 + html | no-capture | 423917 + pdf | wayback-content-error | 355828 + pdf | cdx-error | 343862 + pdf | null-body | 328774 + pdf | forbidden | 286647 + pdf | spn2-cdx-lookup-failure | 276769 + pdf | spn2-wayback-error | 276080 + pdf | skip-url-blocklist | 265473 + html | redirect-loop | 212916 + pdf | not-found | 204367 + html | unknown-scope | 204112 + html | html-resource-no-capture | 166034 + pdf | blocked-cookie | 160336 + pdf | too-many-redirects | 152984 + html | success | 123896 + pdf | wayback-error | 114388 + html | null-body | 100296 + pdf | spn2-error:too-many-redirects | 58336 + html | wayback-content-error | 53926 + pdf | invalid-host-resolution | 37226 + pdf | petabox-error | 37177 + pdf | remote-server-error | 36439 + pdf | spn2-error | 27556 + pdf | spn2-error:proxy-error | 25486 + pdf | read-timeout | 20745 + html | wrong-mimetype | 18928 + html | terminal-bad-status | 14059 + html | petabox-error | 13533 + pdf | bad-redirect | 7535 + xml | success | 6897 + html | cdx-error | 6823 + pdf | spn2-error:bad-request | 4664 + pdf | spn2-error:unauthorized | 4391 + pdf | spn-remote-error | 4206 + pdf | spn2-error:service-unavailable | 2614 + pdf | spn2-error:job-failed | 2562 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1905 + html | spn2-cdx-lookup-failure | 1018 + pdf | redirects-exceeded | 1015 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36515867 + pdf | | 22909334 + pdf | 301 | 7969702 + html | 200 | 1653303 + pdf | 503 | 928507 + pdf | 403 | 823755 + pdf | 302 | 792842 + pdf | 400 | 462108 + html | | 426474 + pdf | 404 | 422163 + pdf | 401 | 270611 + pdf | 500 | 248675 + html | 301 | 211713 + pdf | 303 | 109686 + pdf | 410 | 50648 + pdf | 502 | 37663 + pdf | 429 | 31982 + pdf | 420 | 26603 + pdf | 509 | 15113 + pdf | 409 | 14835 + html | 404 | 9573 + pdf | 999 | 9296 + pdf | 307 | 3972 + pdf | 308 | 3914 + html | 500 | 3625 + pdf | 202 | 3515 + xml | 200 | 2537 + pdf | 520 | 2072 + pdf | 206 | 1665 + pdf | 521 | 1075 + html | 302 | 1072 + pdf | 504 | 1000 + pdf | 412 | 476 + pdf | 300 | 434 + pdf | 505 | 429 + pdf | 406 | 393 + html | 403 | 382 + html | 503 | 378 + pdf | 421 | 298 + html | 303 | 268 + pdf | 508 | 195 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 408 | 50 + pdf | 204 | 34 + pdf | 416 | 29 + pdf | 501 | 29 + pdf | 530 | 27 + pdf | 507 | 21 + (50 rows) + +## Fatcat Files + +Count of PDF files that GROBID processed and matched to a release (via +glutton), but no PDF in `fatcat_file`: + + SELECT COUNT(*) as total_count, COUNT(DISTINCT grobid.fatcat_release) as release_count + FROM grobid + LEFT JOIN fatcat_file ON grobid.sha1hex = fatcat_file.sha1hex + WHERE fatcat_file.sha1hex IS NULL + AND grobid.fatcat_release IS NOT NULL; + + total_count | release_count + -------------+--------------- + 8514315 | 6401104 + (1 row) diff --git a/sql/stats/2021-04-08_table_sizes.txt b/sql/stats/2021-04-08_table_sizes.txt new file mode 100644 index 0000000..a8a9cd5 --- /dev/null +++ b/sql/stats/2021-04-08_table_sizes.txt @@ -0,0 +1,40 @@ + +## SQL Table Sizes + + Size: 467.23G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + -------------------------------+------------+--------------+------------ + "public"."cdx" | 49 GB | 26 GB | 76 GB + "public"."grobid" | 69 GB | 6834 MB | 75 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."ingest_request" | 39 GB | 32 GB | 70 GB + "public"."ingest_file_result" | 32 GB | 29 GB | 60 GB + "public"."file_meta" | 32 GB | 21 GB | 53 GB + "public"."pdf_meta" | 18 GB | 3733 MB | 22 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1196 MB | 8072 kB | 1204 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + (12 rows) + diff --git a/sql/stats/2021-04-12_ingest_domain_summary_30d.txt b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt new file mode 100644 index 0000000..6811b54 --- /dev/null +++ b/sql/stats/2021-04-12_ingest_domain_summary_30d.txt @@ -0,0 +1,345 @@ + domain | status | count +---------------------------------------+-------------------------+-------- + academic.oup.com | | 4105 + academic.oup.com | spn2-wayback-error | 1393 + academic.oup.com | link-loop | 1025 + academic.oup.com | no-pdf-link | 1020 + academic.oup.com | spn2-cdx-lookup-failure | 512 + acervus.unicamp.br | | 1967 + acervus.unicamp.br | no-pdf-link | 1853 + acp.copernicus.org | | 620 + acp.copernicus.org | success | 537 + aip.scitation.org | | 1310 + aip.scitation.org | blocked-cookie | 1192 + alustath.uobaghdad.edu.iq | | 697 + alustath.uobaghdad.edu.iq | success | 550 + apex.ipk-gatersleben.de | | 1253 + apex.ipk-gatersleben.de | no-pdf-link | 1132 + apps.crossref.org | | 4693 + apps.crossref.org | no-pdf-link | 4075 + arxiv.org | | 14990 + arxiv.org | success | 12899 + arxiv.org | spn2-wayback-error | 1592 + ashpublications.org | | 563 + asmedigitalcollection.asme.org | | 3990 + asmedigitalcollection.asme.org | spn2-cdx-lookup-failure | 1570 + asmedigitalcollection.asme.org | no-pdf-link | 1449 + asmedigitalcollection.asme.org | link-loop | 734 + assets.researchsquare.com | | 8217 + assets.researchsquare.com | success | 7116 + assets.researchsquare.com | spn2-wayback-error | 946 + av.tib.eu | | 526 + bioone.org | | 588 + books.openedition.org | | 1784 + books.openedition.org | no-pdf-link | 1466 + boris.unibe.ch | | 1420 + boris.unibe.ch | success | 743 + brill.com | | 1773 + brill.com | link-loop | 879 + chemrxiv.org | | 857 + chemrxiv.org | no-pdf-link | 519 + classiques-garnier.com | | 1072 + classiques-garnier.com | success | 807 + content.iospress.com | | 793 + content.iospress.com | link-loop | 568 + cyberdoi.ru | | 775 + cyberdoi.ru | redirect-loop | 775 + cyberleninka.ru | | 1453 + cyberleninka.ru | success | 1092 + d197for5662m48.cloudfront.net | | 632 + d197for5662m48.cloudfront.net | success | 544 + dergipark.org.tr | | 3070 + dergipark.org.tr | success | 1251 + dergipark.org.tr | no-pdf-link | 843 + dergipark.org.tr | spn2-wayback-error | 677 + digi.ub.uni-heidelberg.de | | 502 + dione.lib.unipi.gr | | 783 + direct.mit.edu | | 996 + direct.mit.edu | no-pdf-link | 869 + dl.acm.org | | 1692 + dl.acm.org | blocked-cookie | 1558 + dlc.library.columbia.edu | | 4225 + dlc.library.columbia.edu | no-pdf-link | 2395 + dlc.library.columbia.edu | spn2-wayback-error | 1568 + doi.ala.org.au | | 2570 + doi.ala.org.au | no-pdf-link | 2153 + doi.nrct.go.th | | 566 + doi.org | | 10408 + doi.org | spn2-cdx-lookup-failure | 9593 + doi.org | terminal-bad-status | 741 + downloads.hindawi.com | | 2137 + downloads.hindawi.com | success | 1787 + dram.journals.ekb.eg | | 541 + elib.spbstu.ru | | 1243 + elib.spbstu.ru | redirect-loop | 1214 + elibrary.vdi-verlag.de | | 1542 + elibrary.vdi-verlag.de | spn2-wayback-error | 721 + elifesciences.org | | 689 + elifesciences.org | success | 521 + epos.myesr.org | | 705 + epos.myesr.org | spn2-wayback-error | 604 + europepmc.org | | 6996 + europepmc.org | success | 6031 + europepmc.org | spn2-wayback-error | 756 + figshare.com | | 1168 + figshare.com | no-pdf-link | 726 + files.osf.io | | 1526 + files.osf.io | success | 1078 + fjfsdata01prod.blob.core.windows.net | | 5410 + fjfsdata01prod.blob.core.windows.net | success | 4581 + fjfsdata01prod.blob.core.windows.net | spn2-wayback-error | 587 + fldeploc.dep.state.fl.us | | 774 + fldeploc.dep.state.fl.us | no-pdf-link | 718 + geoscan.nrcan.gc.ca | | 2056 + geoscan.nrcan.gc.ca | no-pdf-link | 2019 + hcommons.org | | 1593 + hcommons.org | success | 1333 + hkvalidate.perfdrive.com | | 1322 + hkvalidate.perfdrive.com | no-pdf-link | 1083 + ieeexplore.ieee.org | | 20997 + ieeexplore.ieee.org | too-many-redirects | 15383 + ieeexplore.ieee.org | spn2-wayback-error | 2555 + ieeexplore.ieee.org | success | 2165 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 747 + jamanetwork.com | | 712 + journals.aps.org | | 1698 + journals.aps.org | not-found | 1469 + journals.library.ualberta.ca | | 733 + journals.library.ualberta.ca | success | 594 + journals.lww.com | | 6606 + journals.lww.com | link-loop | 3102 + journals.lww.com | spn2-wayback-error | 1645 + journals.lww.com | terminal-bad-status | 965 + journals.lww.com | spn2-cdx-lookup-failure | 552 + journals.openedition.org | | 4594 + journals.openedition.org | success | 1441 + journals.openedition.org | redirect-loop | 1316 + journals.openedition.org | spn2-wayback-error | 1197 + journals.ub.uni-heidelberg.de | | 1039 + journals.ub.uni-heidelberg.de | success | 728 + kiss.kstudy.com | | 747 + kiss.kstudy.com | no-pdf-link | 686 + library.iated.org | | 1560 + library.iated.org | redirect-loop | 1148 + linkinghub.elsevier.com | | 5079 + linkinghub.elsevier.com | forbidden | 2226 + linkinghub.elsevier.com | spn2-wayback-error | 1625 + linkinghub.elsevier.com | spn2-cdx-lookup-failure | 758 + mr.crossref.org | | 542 + nsuworks.nova.edu | | 843 + nsuworks.nova.edu | success | 746 + ojs.cvut.cz | | 805 + ojs.cvut.cz | success | 764 + ojs.ugent.be | | 867 + ojs.ugent.be | success | 643 + onepetro.org | | 603 + onlinelibrary.wiley.com | | 1203 + onlinelibrary.wiley.com | blocked-cookie | 758 + open.library.ubc.ca | | 559 + osf.io | | 3139 + osf.io | not-found | 2288 + osf.io | spn2-wayback-error | 582 + oxford.universitypressscholarship.com | | 3556 + oxford.universitypressscholarship.com | link-loop | 2373 + oxford.universitypressscholarship.com | spn2-wayback-error | 562 + painphysicianjournal.com | | 804 + painphysicianjournal.com | success | 668 + papers.ssrn.com | | 6367 + papers.ssrn.com | link-loop | 3865 + papers.ssrn.com | spn2-wayback-error | 1106 + papers.ssrn.com | spn2-cdx-lookup-failure | 1015 + peerj.com | | 785 + peerj.com | no-pdf-link | 552 + pos.sissa.it | | 1455 + pos.sissa.it | success | 1153 + preprints.jmir.org | | 763 + preprints.jmir.org | no-pdf-link | 611 + psyarxiv.com | | 641 + psyarxiv.com | no-pdf-link | 546 + publikationen.uni-tuebingen.de | | 659 + publons.com | | 6998 + publons.com | no-pdf-link | 6982 + pubs.acs.org | | 5860 + pubs.acs.org | blocked-cookie | 5185 + pubs.rsc.org | | 2269 + pubs.rsc.org | link-loop | 1384 + res.mdpi.com | | 15776 + res.mdpi.com | success | 13710 + res.mdpi.com | spn2-wayback-error | 1424 + res.mdpi.com | spn2-cdx-lookup-failure | 641 + rrs.scholasticahq.com | | 1078 + rrs.scholasticahq.com | success | 803 + rsdjournal.org | | 755 + rsdjournal.org | success | 524 + s3-eu-west-1.amazonaws.com | | 3343 + s3-eu-west-1.amazonaws.com | success | 2893 + saemobilus.sae.org | | 795 + saemobilus.sae.org | no-pdf-link | 669 + sage.figshare.com | | 725 + scholar.dkyobobook.co.kr | | 1043 + scholar.dkyobobook.co.kr | no-pdf-link | 915 + scholarworks.umass.edu | | 1196 + scholarworks.umass.edu | success | 713 + secure.jbs.elsevierhealth.com | | 4202 + secure.jbs.elsevierhealth.com | blocked-cookie | 4169 + storage.googleapis.com | | 1720 + storage.googleapis.com | success | 1466 + tandf.figshare.com | | 789 + tandf.figshare.com | no-pdf-link | 640 + tind-customer-agecon.s3.amazonaws.com | | 584 + turcomat.org | | 1196 + turcomat.org | spn2-wayback-error | 997 + unreserved.rba.gov.au | | 823 + unreserved.rba.gov.au | no-pdf-link | 821 + utpjournals.press | | 669 + utpjournals.press | blocked-cookie | 616 + watermark.silverchair.com | | 3560 + watermark.silverchair.com | success | 2788 + watermark.silverchair.com | spn2-wayback-error | 685 + wayf.switch.ch | | 1169 + wayf.switch.ch | no-pdf-link | 809 + www.ahajournals.org | | 802 + www.ahajournals.org | blocked-cookie | 597 + www.ajol.info | | 830 + www.ajol.info | success | 575 + www.ams.org | | 868 + www.ams.org | terminal-bad-status | 666 + www.atlantis-press.com | | 1579 + www.atlantis-press.com | success | 1071 + www.bloomsburycollections.com | | 1745 + www.bloomsburycollections.com | no-pdf-link | 1571 + www.brazilianjournals.com | | 1385 + www.brazilianjournals.com | success | 1107 + www.cairn.info | | 2479 + www.cairn.info | no-pdf-link | 818 + www.cairn.info | link-loop | 790 + www.cambridge.org | | 6801 + www.cambridge.org | no-pdf-link | 2990 + www.cambridge.org | spn2-wayback-error | 1475 + www.cambridge.org | link-loop | 940 + www.cambridge.org | success | 863 + www.cureus.com | | 538 + www.dbpia.co.kr | | 2958 + www.dbpia.co.kr | redirect-loop | 2953 + www.degruyter.com | | 58612 + www.degruyter.com | no-pdf-link | 41065 + www.degruyter.com | spn2-wayback-error | 7426 + www.degruyter.com | success | 6628 + www.degruyter.com | spn2-cdx-lookup-failure | 1624 + www.degruyter.com | terminal-bad-status | 1565 + www.dovepress.com | | 869 + www.dovepress.com | success | 597 + www.e-manuscripta.ch | | 1047 + www.e3s-conferences.org | | 817 + www.e3s-conferences.org | success | 606 + www.elgaronline.com | | 535 + www.elibrary.ru | | 1244 + www.elibrary.ru | no-pdf-link | 1159 + www.emc2020.eu | | 791 + www.emc2020.eu | no-pdf-link | 748 + www.emerald.com | | 2420 + www.emerald.com | no-pdf-link | 1986 + www.eurekaselect.com | | 540 + www.eurosurveillance.org | | 786 + www.eurosurveillance.org | success | 710 + www.finersistemas.com | | 1220 + www.finersistemas.com | success | 1214 + www.frontiersin.org | | 915 + www.frontiersin.org | spn2-wayback-error | 602 + www.hanspub.org | | 618 + www.humankineticslibrary.com | | 1122 + www.humankineticslibrary.com | no-pdf-link | 985 + www.ijcmas.com | | 513 + www.inderscience.com | | 1532 + www.inderscience.com | no-pdf-link | 1217 + www.indianjournals.com | | 904 + www.ingentaconnect.com | | 885 + www.ingentaconnect.com | no-pdf-link | 783 + www.journals.uchicago.edu | | 6055 + www.journals.uchicago.edu | blocked-cookie | 5927 + www.journals.vu.lt | | 791 + www.journals.vu.lt | success | 545 + www.jstage.jst.go.jp | | 1490 + www.jstage.jst.go.jp | remote-server-error | 1023 + www.jstor.org | | 1103 + www.jstor.org | redirect-loop | 553 + www.karger.com | | 733 + www.liebertpub.com | | 804 + www.liebertpub.com | blocked-cookie | 714 + www.liverpooluniversitypress.co.uk | | 620 + www.liverpooluniversitypress.co.uk | too-many-redirects | 529 + www.mdpi.com | | 3880 + www.mdpi.com | spn2-wayback-error | 1651 + www.mdpi.com | forbidden | 1282 + www.mdpi.com | spn2-cdx-lookup-failure | 714 + www.nepjol.info | | 596 + www.nomos-elibrary.de | | 2235 + www.nomos-elibrary.de | no-pdf-link | 1128 + www.nomos-elibrary.de | spn2-wayback-error | 559 + www.oecd-ilibrary.org | | 3046 + www.oecd-ilibrary.org | no-pdf-link | 2869 + www.osapublishing.org | | 821 + www.osapublishing.org | no-pdf-link | 615 + www.osti.gov | | 1147 + www.osti.gov | link-loop | 902 + www.oxfordscholarlyeditions.com | | 759 + www.oxfordscholarlyeditions.com | no-pdf-link | 719 + www.preprints.org | | 783 + www.preprints.org | success | 595 + www.repository.cam.ac.uk | | 1146 + www.research-collection.ethz.ch | | 704 + www.research-collection.ethz.ch | terminal-bad-status | 684 + www.researchsquare.com | | 853 + www.researchsquare.com | spn2-wayback-error | 515 + www.schweizerbart.de | | 730 + www.schweizerbart.de | no-pdf-link | 653 + www.scielo.br | | 1777 + www.scielo.br | success | 1167 + www.sciencedirect.com | | 14757 + www.sciencedirect.com | no-pdf-link | 12733 + www.sciencedirect.com | spn2-wayback-error | 1503 + www.sciendo.com | | 1955 + www.sciendo.com | no-pdf-link | 1176 + www.scilook.eu | | 812 + www.scilook.eu | success | 563 + www.scirp.org | | 749 + www.tandfonline.com | | 11038 + www.tandfonline.com | blocked-cookie | 9994 + www.tandfonline.com | no-pdf-link | 663 + www.taylorfrancis.com | | 71514 + www.taylorfrancis.com | spn2-wayback-error | 36663 + www.taylorfrancis.com | no-pdf-link | 15098 + www.taylorfrancis.com | forbidden | 8699 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 6894 + www.taylorfrancis.com | link-loop | 3661 + www.thieme-connect.de | | 3687 + www.thieme-connect.de | redirect-loop | 1187 + www.thieme-connect.de | not-found | 945 + www.thieme-connect.de | no-pdf-link | 941 + www.worldscientific.com | | 1476 + www.worldscientific.com | blocked-cookie | 1323 + www.zora.uzh.ch | | 1118 + zenodo.org | | 43010 + zenodo.org | no-pdf-link | 22015 + zenodo.org | success | 12747 + zenodo.org | spn2-wayback-error | 4608 + zenodo.org | spn2-cdx-lookup-failure | 3215 + | | 725990 + | no-pdf-link | 209933 + | success | 206134 + | spn2-wayback-error | 127015 + | spn2-cdx-lookup-failure | 53384 + | blocked-cookie | 35867 + | link-loop | 25834 + | too-many-redirects | 16430 + | redirect-loop | 14648 + | forbidden | 13794 + | terminal-bad-status | 8055 + | not-found | 6399 + | remote-server-error | 2402 + | wrong-mimetype | 2011 + | spn2-error:unauthorized | 912 + | bad-redirect | 555 + | read-timeout | 530 +(341 rows) + diff --git a/sql/stats/2021-11-01_table_sizes.txt b/sql/stats/2021-11-01_table_sizes.txt new file mode 100644 index 0000000..57f7e57 --- /dev/null +++ b/sql/stats/2021-11-01_table_sizes.txt @@ -0,0 +1,19 @@ + +Size: 832.66G + + table_name | table_size | indexes_size | total_size +-------------------------------+------------+--------------+------------ + "public"."crossref" | 311 GB | 9812 MB | 320 GB + "public"."ingest_request" | 44 GB | 40 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6952 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 54 GB + "public"."pdf_meta" | 20 GB | 5813 MB | 26 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB +(13 rows) diff --git a/sql/stats/2021-11-26_stats.txt b/sql/stats/2021-11-26_stats.txt new file mode 100644 index 0000000..3a0e561 --- /dev/null +++ b/sql/stats/2021-11-26_stats.txt @@ -0,0 +1,424 @@ + +Date: Sat 27 Nov 2021 03:33:30 AM UTC + +## SQL Table Sizes + + Size: 937.28G + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 393 GB | 10127 MB | 403 GB + "public"."ingest_request" | 44 GB | 41 GB | 84 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6963 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 40 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5869 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1690 MB | 21 GB + "public"."fatcat_file" | 12 GB | 6602 MB | 18 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 179761501 | 244453538203113 + + # 179m files, 244 TB + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 179376819 + application/octet-stream | 155379 + text/html | 116102 + application/xml | 42170 + application/xhtml+xml | 24347 + text/plain | 15990 + application/jats+xml | 6899 + application/gzip | 6491 + | 6034 + application/postscript | 4912 + application/vnd.ms-powerpoint | 1672 + application/msword | 921 + application/x-bzip2 | 891 + image/jpeg | 794 + image/gif | 389 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 303 + application/x-compress | 272 + application/zip | 131 + image/png | 121 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 57 + application/x-dosexec | 51 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + text/rtf | 33 + application/x-dvi | 29 + application/x-rar | 29 + video/mp4 | 29 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 62196 + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 119049962 | 149169240 + + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 137271670 + warc/revisit | 9709493 + application/octet-stream | 590443 + text/xml | 525481 + text/html | 421030 + unk | 207442 + application/postscript | 81123 + application/save | 80988 + binary/octet-stream | 67476 + image/pdf | 39419 + application/x-download | 38278 + text/plain | 36159 + application/download | 34328 + application/force-download | 19729 + multipart/form-data | 9105 + application | 5299 + application/x-msdownload | 3851 + application/x-octetstream | 3649 + .pdf | 3318 + application/x-pdf | 2992 + pdf | 1484 + file | 1364 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1029 + application/unknown | 1000 + 0 | 764 + text/pdf | 704 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 111236904 + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 102962304 + 500 | 8269129 + -4 | 5013 + 503 | 548 + +TODO: how many failed, by mimetype? to check if we are (or have) run non-PDF +files through by mistake + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.5.5-fatcat | 89983404 + | 12892161 + 0.7.0-104-gbeebd9a6b | 86739 + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | mag | 43701948 + pdf | unpaywall | 37802895 + pdf | doi | 28736398 + pdf | doaj | 4264610 + html | doaj | 2429003 + pdf | pmc | 2383398 + pdf | arxiv | 2330054 + html | doi | 39725 + xml | doaj | 9442 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 689 + html | spn | 48 + xml | spn | 1 + (15 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | mag | mag-corpus | 43701948 + pdf | unpaywall | unpaywall | 37802895 + pdf | doi | fatcat-changelog | 16207728 + pdf | doi | fatcat-ingest | 9012282 + pdf | doaj | doaj | 4264610 + pdf | doi | fatcat-ingest-container | 3515873 + html | doaj | doaj | 2429003 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1767705 + pdf | arxiv | fatcat-changelog | 562320 + pdf | pmc | fatcat-ingest | 297527 + pdf | pmc | fatcat-changelog | 57046 + html | doi | fatcat-ingest | 37788 + xml | doaj | doaj | 9442 + pdf | cnki_covid19 | scrape-covid19 | 2034 + html | doi | fatcat-changelog | 1897 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 689 + pdf | doi | savepapernow-web | 613 + html | spn | savepapernow-web | 48 + html | doi | savepapernow-web | 40 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | spn | savepapernow-web | 1 + (25 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 169076 + pdf | oai | 15283 + pdf | doaj | 2063 + html | doaj | 620 + pdf | doi | 22 + pdf | unpaywall | 17 + + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 14554221 | 0.284 + pdf | mag | 43701948 | 32643175 | 0.747 + pdf | unpaywall | 37802895 | 29989257 | 0.793 + pdf | doi | 28736547 | 7690393 | 0.268 + pdf | doaj | 4264610 | 2851601 | 0.669 + html | doaj | 2429003 | 122937 | 0.051 + pdf | pmc | 2383398 | 1821071 | 0.764 + pdf | arxiv | 2330054 | 2159738 | 0.927 + html | doi | 39725 | 1235 | 0.031 + xml | doaj | 9442 | 6897 | 0.730 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 689 | 503 | 0.730 + html | spn | 48 | 5 | 0.104 + xml | spn | 1 | 0 | 0.000 + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 78944243 + pdf | no-pdf-link | 26270027 + pdf | no-capture | 23267156 + pdf | redirect-loop | 9837466 + pdf | terminal-bad-status | 4147454 + pdf | skip-url-blocklist | 3088907 + pdf | link-loop | 2953891 + pdf | blocked-cookie | 1855541 + html | wrong-scope | 1106171 + pdf | wrong-mimetype | 859941 + pdf | gateway-timeout | 729771 + pdf | spn2-cdx-lookup-failure | 584856 + html | no-capture | 423917 + pdf | forbidden | 390804 + pdf | cdx-error | 363091 + pdf | wayback-content-error | 354894 + pdf | null-body | 341698 + pdf | too-many-redirects | 307096 + pdf | not-found | 294592 + html | redirect-loop | 213032 + html | unknown-scope | 207923 + pdf | spn2-error | 192046 + html | html-resource-no-capture | 166119 + html | success | 124177 + pdf | wayback-error | 105385 + html | null-body | 100296 + pdf | spn2-wayback-error | 73176 + pdf | remote-server-error | 60908 + pdf | spn2-error:too-many-redirects | 58076 + pdf | skip-wall | 57744 + html | wayback-content-error | 53928 + pdf | read-timeout | 42465 + pdf | invalid-host-resolution | 37221 + pdf | petabox-error | 28765 + pdf | spn2-error:unknown | 23885 + html | wrong-mimetype | 18930 + pdf | bad-redirect | 14708 + html | terminal-bad-status | 14070 + html | petabox-error | 13770 + html | spn2-cdx-lookup-failure | 13002 + pdf | spn2-error:job-failed | 9721 + html | cdx-error | 7167 + xml | success | 6897 + pdf | spn2-error:bad-request | 4433 + pdf | spn-remote-error | 4206 + pdf | body-too-large | 3019 + xml | null-body | 2353 + pdf | other-mimetype | 2304 + pdf | error | 1900 + pdf | spn2-error:proxy-error | 1850 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 36821458 + pdf | | 26058729 + pdf | 301 | 8466302 + html | 200 | 1676730 + pdf | 503 | 1028504 + pdf | 302 | 949465 + pdf | 403 | 936737 + pdf | 404 | 687661 + pdf | 400 | 507303 + html | | 439356 + pdf | 401 | 288994 + pdf | 500 | 263775 + html | 301 | 211796 + pdf | 303 | 130719 + pdf | 410 | 66495 + pdf | 502 | 41760 + pdf | 429 | 35266 + pdf | 420 | 26722 + pdf | 409 | 15204 + pdf | 509 | 15113 + pdf | 999 | 11409 + html | 404 | 9578 + pdf | 307 | 8404 + pdf | 308 | 5514 + pdf | 202 | 4724 + html | 500 | 3628 + xml | 200 | 2537 + pdf | 520 | 2199 + pdf | 206 | 1694 + html | 302 | 1138 + pdf | 504 | 1124 + pdf | 521 | 1085 + pdf | 412 | 921 + pdf | 421 | 714 + pdf | 300 | 461 + pdf | 505 | 436 + pdf | 406 | 427 + pdf | 508 | 408 + html | 403 | 382 + html | 503 | 378 + html | 303 | 268 + pdf | 204 | 252 + pdf | 226 | 166 + pdf | 402 | 70 + html | 502 | 68 + pdf | 523 | 55 + pdf | 408 | 53 + pdf | 432 | 45 + pdf | 530 | 31 + pdf | 416 | 31 + (50 rows) diff --git a/sql/stats/2021-12-02_table_sizes.txt b/sql/stats/2021-12-02_table_sizes.txt new file mode 100644 index 0000000..b03c370 --- /dev/null +++ b/sql/stats/2021-12-02_table_sizes.txt @@ -0,0 +1,22 @@ + +Size: 940.66G + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 394 GB | 10138 MB | 404 GB + "public"."ingest_request" | 44 GB | 41 GB | 85 GB + "public"."cdx" | 52 GB | 28 GB | 80 GB + "public"."grobid" | 72 GB | 6978 MB | 79 GB + "public"."ingest_file_result" | 38 GB | 41 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 34 GB | 21 GB | 55 GB + "public"."pdf_meta" | 20 GB | 5930 MB | 26 GB + "public"."grobid_refs" | 19 GB | 1752 MB | 21 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 1200 MB | 8072 kB | 1208 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes +(16 rows) diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt new file mode 100644 index 0000000..bd20c5c --- /dev/null +++ b/sql/stats/2022-04-26_stats.txt @@ -0,0 +1,432 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 416 GB | 10 GB | 426 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 58 GB | 41 GB | 99 GB + "public"."ingest_request" | 50 GB | 48 GB | 98 GB + "public"."ingest_file_result" | 42 GB | 48 GB | 90 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."file_meta" | 37 GB | 34 GB | 71 GB + "public"."pdf_meta" | 21 GB | 7386 MB | 29 GB + "public"."grobid_refs" | 23 GB | 2516 MB | 26 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 3015 MB | 31 MB | 3046 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 192402128 | 271919997557597 + (1 row) + + # 271,919,997,557,597 -> ~272 TByte + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 191760695 + text/html | 330351 + application/octet-stream | 186696 + application/xml | 42170 + application/xhtml+xml | 31470 + text/plain | 16449 + application/jats+xml | 6902 + application/gzip | 6681 + | 6033 + application/postscript | 4916 + image/jpeg | 2901 + application/vnd.ms-powerpoint | 1672 + application/msword | 934 + application/x-bzip2 | 891 + image/png | 476 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 374 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 294 + application/x-compress | 274 + video/mp4 | 150 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12831 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 130732381 | 162760251 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 149749828 + warc/revisit | 10437210 + application/octet-stream | 733161 + text/html | 642992 + text/xml | 525483 + unk | 217642 + application/postscript | 81127 + application/save | 81023 + binary/octet-stream | 67938 + application/x-download | 41137 + image/pdf | 39712 + application/download | 37153 + text/plain | 36342 + application/force-download | 21496 + multipart/form-data | 9792 + application | 5366 + application/x-octetstream | 5166 + application/x-msdownload | 3851 + .pdf | 3445 + application/x-pdf | 3018 + pdf | 1618 + file | 1370 + application/binary | 1354 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1047 + application/unknown | 1001 + 0 | 773 + text/pdf | 729 + application/blob | 673 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 123669603 + (1 row) + + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 115668412 + 500 | 7995428 + -4 | 5745 + 503 | 18 + (4 rows) + + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 54780825 + 0.5.5-fatcat | 48003940 + | 12694404 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | mag | 43701948 + pdf | doi | 40044585 + pdf | doaj | 6016771 + html | doaj | 3648181 + pdf | arxiv | 2676200 + pdf | pmc | 2402453 + html | doi | 41492 + xml | doaj | 20638 + pdf | cnki_covid19 | 2034 + pdf | wanfang_covid19 | 975 + pdf | spn | 829 + html | spn | 52 + xml | doi | 1 + xml | spn | 1 + (16 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 20936949 + pdf | doi | fatcat-ingest | 15590201 + pdf | doaj | doaj | 6016771 + html | doaj | doaj | 3648181 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 691405 + pdf | pmc | fatcat-ingest | 297646 + pdf | pmc | fatcat-changelog | 75982 + html | doi | fatcat-ingest | 37904 + xml | doaj | doaj | 20638 + html | doi | fatcat-changelog | 3534 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1562 + pdf | wanfang_covid19 | scrape-covid19 | 975 + pdf | spn | savepapernow-web | 829 + html | doi | savepapernow-web | 54 + html | spn | savepapernow-web | 52 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 3 + xml | doi | savepapernow-web | 1 + xml | spn | savepapernow-web | 1 + (26 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + ingest_type | link_source | count + -------------+-------------+--------- + pdf | doaj | 1619621 + html | doaj | 1208412 + pdf | mag | 167653 + pdf | oai | 15282 + xml | doaj | 11196 + pdf | unpaywall | 270 + pdf | doi | 22 + (7 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 15968290 | 0.312 + pdf | unpaywall | 43932525 | 32618045 | 0.742 + pdf | mag | 43701948 | 32662926 | 0.747 + pdf | doi | 40044738 | 10925369 | 0.273 + pdf | doaj | 6016771 | 3042569 | 0.506 + html | doaj | 3648181 | 344208 | 0.094 + pdf | arxiv | 2676206 | 2269708 | 0.848 + pdf | pmc | 2402453 | 1855679 | 0.772 + html | doi | 41492 | 1739 | 0.042 + xml | doaj | 20638 | 6899 | 0.334 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + pdf | spn | 829 | 616 | 0.743 + html | spn | 52 | 7 | 0.135 + xml | doi | 1 | 0 | 0.000 + xml | spn | 1 | 0 | 0.000 + (16 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+---------------------------------+---------- + pdf | success | 85709322 + pdf | no-pdf-link | 29713304 + pdf | no-capture | 26632191 + pdf | redirect-loop | 10979145 + pdf | terminal-bad-status | 4977000 + pdf | link-loop | 3434877 + pdf | skip-url-blocklist | 3114258 + pdf | blocked-cookie | 2156835 + html | wrong-scope | 1126911 + pdf | wrong-mimetype | 980546 + pdf | gateway-timeout | 651562 + pdf | spn2-cdx-lookup-failure | 484016 + pdf | spn2-backoff | 399382 + pdf | cdx-error | 373964 + pdf | wayback-content-error | 354370 + html | success | 345860 + pdf | null-body | 336182 + pdf | spn2-error:500 | 309755 + pdf | forbidden | 291175 + pdf | not-found | 275560 + pdf | too-many-redirects | 262312 + html | unknown-scope | 230352 + html | redirect-loop | 226596 + html | html-resource-no-capture | 205646 + html | no-capture | 164014 + component | spn2-cdx-lookup-failure | 148825 + component | wrong-mimetype | 130344 + html | null-body | 100296 + pdf | wayback-error | 94286 + pdf | spn2-wayback-error | 81365 + component | no-capture | 75278 + pdf | spn2-error | 69830 + pdf | skip-wall | 57744 + pdf | spn2-error:too-many-redirects | 53808 + pdf | remote-server-error | 41286 + pdf | petabox-error | 38800 + pdf | invalid-host-resolution | 37337 + pdf | read-timeout | 36872 + component | spn2-backoff | 33217 + pdf | empty-blob | 27946 + component | spn2-error | 24078 + pdf | spn2-error:unknown | 23697 + component | gateway-timeout | 23139 + html | wrong-mimetype | 22731 + html | wayback-content-error | 20507 + pdf | spn2-error:host-crawling-paused | 19900 + pdf | bad-redirect | 19183 + html | terminal-bad-status | 13354 + component | blocked-cookie | 12287 + component | spn2-error:500 | 11271 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 38144779 + pdf | | 32762240 + pdf | 301 | 9433087 + html | 200 | 1716127 + pdf | 403 | 1416632 + pdf | 302 | 1134668 + pdf | 404 | 888853 + pdf | 401 | 746311 + pdf | 503 | 655894 + pdf | 400 | 531479 + component | | 337603 + pdf | 500 | 247944 + html | 301 | 224237 + html | | 167194 + pdf | 303 | 135048 + component | 200 | 130663 + pdf | 429 | 93489 + pdf | 410 | 67392 + pdf | 420 | 26722 + pdf | 502 | 18770 + pdf | 409 | 15152 + pdf | 509 | 15113 + pdf | 999 | 11747 + html | 404 | 9879 + pdf | 307 | 8895 + pdf | 412 | 7053 + pdf | 308 | 6627 + pdf | 202 | 5289 + xml | 200 | 2540 + html | 500 | 2480 + pdf | 520 | 2220 + pdf | 521 | 1844 + pdf | 206 | 1739 + html | 302 | 1407 + pdf | 504 | 1146 + html | 303 | 1123 + pdf | 421 | 986 + pdf | 406 | 938 + pdf | 204 | 498 + pdf | 505 | 468 + pdf | 300 | 436 + pdf | 508 | 422 + pdf | 426 | 405 + html | 429 | 402 + html | 403 | 398 + pdf | 432 | 366 + component | 301 | 294 + pdf | 405 | 210 + pdf | 226 | 166 + component | 302 | 128 + (50 rows) + diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt new file mode 100644 index 0000000..864abd4 --- /dev/null +++ b/sql/stats/2022-04-27_crawl_changelog.txt @@ -0,0 +1,191 @@ + domain | status | count +--------------------------------------+-------------------------+-------- + academic.oup.com | | 1243 + academic.oup.com | spn2-cdx-lookup-failure | 990 + aip.scitation.org | | 313 + aip.scitation.org | spn2-cdx-lookup-failure | 224 + ajps.uomustansiriyah.edu.iq | | 235 + apps.crossref.org | | 1329 + apps.crossref.org | spn2-cdx-lookup-failure | 942 + apps.crossref.org | no-pdf-link | 387 + archaeologydataservice.ac.uk | | 422 + archaeologydataservice.ac.uk | spn2-cdx-lookup-failure | 289 + arxiv.org | | 3512 + arxiv.org | spn2-cdx-lookup-failure | 2319 + arxiv.org | success | 1177 + assets.researchsquare.com | | 571 + assets.researchsquare.com | spn2-cdx-lookup-failure | 322 + assets.researchsquare.com | success | 249 + brill.com | | 397 + brill.com | spn2-cdx-lookup-failure | 265 + cla.berkeley.edu | | 239 + classiques-garnier.com | | 249 + cyberleninka.ru | | 340 + cyberleninka.ru | spn2-cdx-lookup-failure | 244 + dergipark.org.tr | | 468 + dergipark.org.tr | spn2-cdx-lookup-failure | 333 + dl.acm.org | | 592 + dl.acm.org | spn2-cdx-lookup-failure | 470 + doi.ala.org.au | | 288 + doi.ala.org.au | spn2-cdx-lookup-failure | 220 + doi.org | | 1107 + doi.org | terminal-bad-status | 679 + doi.org | spn2-cdx-lookup-failure | 415 + downloads.hindawi.com | | 279 + downloads.hindawi.com | success | 267 + edbs.uomustansiriyah.edu.iq | | 294 + edbs.uomustansiriyah.edu.iq | spn2-cdx-lookup-failure | 209 + elibrary.kdpu.edu.ua | | 320 + elibrary.kdpu.edu.ua | spn2-cdx-lookup-failure | 233 + elibrary.ru | | 722 + elibrary.ru | spn2-cdx-lookup-failure | 505 + europepmc.org | | 986 + europepmc.org | spn2-cdx-lookup-failure | 681 + europepmc.org | success | 291 + figshare.com | | 377 + figshare.com | spn2-cdx-lookup-failure | 328 + fjfsdata01prod.blob.core.windows.net | | 255 + fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure | 216 + hammer.purdue.edu | | 224 + ieeexplore.ieee.org | | 3904 + ieeexplore.ieee.org | spn2-cdx-lookup-failure | 2654 + ieeexplore.ieee.org | gateway-timeout | 792 + ieeexplore.ieee.org | spn2-backoff | 419 + journals.eco-vector.com | | 428 + journals.eco-vector.com | spn2-cdx-lookup-failure | 306 + journals.lww.com | | 727 + journals.lww.com | spn2-cdx-lookup-failure | 622 + journals.openedition.org | | 806 + journals.openedition.org | spn2-cdx-lookup-failure | 554 + journals.plos.org | | 348 + journals.plos.org | spn2-cdx-lookup-failure | 244 + kiss.kstudy.com | | 226 + kluwerlawonline.com | | 723 + kluwerlawonline.com | spn2-cdx-lookup-failure | 489 + kluwerlawonline.com | link-loop | 203 + linkinghub.elsevier.com | | 401 + linkinghub.elsevier.com | spn2-backoff | 342 + mdpi-res.com | | 1463 + mdpi-res.com | success | 1337 + muse.jhu.edu | | 346 + muse.jhu.edu | spn2-cdx-lookup-failure | 253 + onepetro.org | | 363 + onepetro.org | spn2-cdx-lookup-failure | 284 + online.ucpress.edu | | 1620 + online.ucpress.edu | spn2-cdx-lookup-failure | 1511 + onlinelibrary.wiley.com | | 2913 + onlinelibrary.wiley.com | spn2-cdx-lookup-failure | 2109 + onlinelibrary.wiley.com | terminal-bad-status | 787 + opendata.uni-halle.de | | 519 + opendata.uni-halle.de | spn2-cdx-lookup-failure | 343 + osf.io | | 1554 + osf.io | spn2-cdx-lookup-failure | 1350 + papers.ssrn.com | | 2207 + papers.ssrn.com | spn2-cdx-lookup-failure | 1727 + papers.ssrn.com | link-loop | 457 + psycharchives.org | | 384 + psycharchives.org | spn2-cdx-lookup-failure | 283 + publons.com | | 493 + publons.com | spn2-cdx-lookup-failure | 348 + pubs.acs.org | | 1240 + pubs.acs.org | spn2-cdx-lookup-failure | 881 + pubs.acs.org | terminal-bad-status | 298 + pubs.rsc.org | | 603 + pubs.rsc.org | spn2-cdx-lookup-failure | 460 + repositories.lib.utexas.edu | | 1861 + repositories.lib.utexas.edu | spn2-cdx-lookup-failure | 1288 + repositories.lib.utexas.edu | terminal-bad-status | 523 + s3-eu-west-1.amazonaws.com | | 216 + sage.figshare.com | | 374 + sage.figshare.com | spn2-cdx-lookup-failure | 309 + scholar.dkyobobook.co.kr | | 220 + scholarworks.gsu.edu | | 749 + scholarworks.gsu.edu | spn2-cdx-lookup-failure | 577 + tandf.figshare.com | | 214 + www.atlantis-press.com | | 338 + www.atlantis-press.com | spn2-cdx-lookup-failure | 214 + www.cairn.info | | 782 + www.cairn.info | spn2-cdx-lookup-failure | 541 + www.cambridge.org | | 2325 + www.cambridge.org | spn2-cdx-lookup-failure | 1787 + www.cambridge.org | no-pdf-link | 300 + www.cell.com | | 213 + www.concrete.org | | 476 + www.concrete.org | spn2-cdx-lookup-failure | 340 + www.dbpia.co.kr | | 375 + www.dbpia.co.kr | spn2-cdx-lookup-failure | 275 + www.degruyter.com | | 3849 + www.degruyter.com | spn2-cdx-lookup-failure | 2969 + www.degruyter.com | no-pdf-link | 712 + www.dib.ie | | 1100 + www.dib.ie | spn2-cdx-lookup-failure | 1038 + www.e-periodica.ch | | 821 + www.e-periodica.ch | spn2-cdx-lookup-failure | 620 + www.e-periodica.ch | no-pdf-link | 201 + www.elibrary.ru | | 401 + www.elibrary.ru | spn2-cdx-lookup-failure | 281 + www.emerald.com | | 390 + www.emerald.com | spn2-cdx-lookup-failure | 275 + www.eurekaselect.com | | 275 + www.frontiersin.org | | 1266 + www.frontiersin.org | spn2-cdx-lookup-failure | 1025 + www.hanspub.org | | 229 + www.hindawi.com | | 604 + www.hindawi.com | spn2-cdx-lookup-failure | 594 + www.inderscience.com | | 201 + www.jstage.jst.go.jp | | 1094 + www.jstage.jst.go.jp | spn2-cdx-lookup-failure | 807 + www.jstage.jst.go.jp | success | 206 + www.mdpi.com | | 4340 + www.mdpi.com | spn2-cdx-lookup-failure | 4258 + www.nomos-elibrary.de | | 2749 + www.nomos-elibrary.de | spn2-cdx-lookup-failure | 1909 + www.nomos-elibrary.de | redirect-loop | 819 + www.osti.gov | | 275 + www.oxfordhandbooks.com | | 248 + www.oxfordhandbooks.com | spn2-cdx-lookup-failure | 224 + www.pdcnet.org | | 217 + www.researchsquare.com | | 483 + www.researchsquare.com | spn2-cdx-lookup-failure | 317 + www.scielo.br | | 319 + www.scielo.br | spn2-cdx-lookup-failure | 222 + www.sciencedirect.com | | 3384 + www.sciencedirect.com | spn2-cdx-lookup-failure | 3267 + www.spiedigitallibrary.org | | 441 + www.spiedigitallibrary.org | spn2-cdx-lookup-failure | 327 + www.tandfonline.com | | 2401 + www.tandfonline.com | spn2-cdx-lookup-failure | 1552 + www.tandfonline.com | no-pdf-link | 303 + www.tandfonline.com | blocked-cookie | 250 + www.taylorfrancis.com | | 1232 + www.taylorfrancis.com | spn2-cdx-lookup-failure | 908 + www.thieme-connect.de | | 520 + www.thieme-connect.de | spn2-cdx-lookup-failure | 366 + www.worldscientific.com | | 383 + www.worldscientific.com | spn2-cdx-lookup-failure | 276 + zenodo.org | | 10625 + zenodo.org | spn2-cdx-lookup-failure | 7777 + zenodo.org | success | 1574 + zenodo.org | no-pdf-link | 1160 + zivahub.uct.ac.za | | 3428 + zivahub.uct.ac.za | spn2-cdx-lookup-failure | 2845 + zivahub.uct.ac.za | no-pdf-link | 583 + | | 130491 + | spn2-cdx-lookup-failure | 95169 + | success | 13354 + | no-pdf-link | 9621 + | terminal-bad-status | 3385 + | spn2-backoff | 2396 + | redirect-loop | 2216 + | link-loop | 1850 + | gateway-timeout | 1061 + | spn2-error:blocked-url | 428 + | blocked-cookie | 415 + | spn2-error | 246 +(182 rows) + +---- + +The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a +week or two, when crawling and retries are running smoothly, and see what +things look like then. diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt new file mode 100644 index 0000000..8d98217 --- /dev/null +++ b/sql/stats/2022-05-11_crawl_changelog.txt @@ -0,0 +1,410 @@ + domain | status | count +-----------------------------------------------------------------+-------------------------+-------- + academic.oup.com | | 2210 + academic.oup.com | no-pdf-link | 1350 + academic.oup.com | bad-redirect | 510 + academiccommons.columbia.edu | | 379 + academiccommons.columbia.edu | success | 339 + aip.scitation.org | | 762 + aip.scitation.org | terminal-bad-status | 430 + apps.crossref.org | | 9894 + apps.crossref.org | no-pdf-link | 9886 + apps.euskadi.eus | | 242 + apps.euskadi.eus | no-pdf-link | 240 + arxiv.org | | 44889 + arxiv.org | success | 28781 + arxiv.org | spn2-backoff | 7975 + arxiv.org | terminal-bad-status | 4508 + arxiv.org | spn2-cdx-lookup-failure | 2010 + arxiv.org | redirect-loop | 619 + arxiv.org | no-pdf-link | 242 + arxiv.org | spn2-error | 236 + asa.scitation.org | | 356 + asa.scitation.org | terminal-bad-status | 299 + asmedigitalcollection.asme.org | | 240 + assets.cureus.com | | 336 + assets.cureus.com | success | 335 + assets.researchsquare.com | | 1042 + assets.researchsquare.com | success | 993 + av.tib.eu | | 205 + av.tib.eu | no-pdf-link | 203 + bibliographie.uni-tuebingen.de | | 213 + bibliographie.uni-tuebingen.de | no-pdf-link | 211 + biorxiv.org | redirect-loop | 217 + biorxiv.org | | 217 + books.openedition.org | | 691 + books.openedition.org | no-pdf-link | 687 + boris.unibe.ch | | 525 + boris.unibe.ch | success | 466 + bridges.monash.edu | | 663 + bridges.monash.edu | no-pdf-link | 647 + brill.com | | 860 + brill.com | success | 434 + chemrxiv.org | | 201 + classiques-garnier.com | | 242 + content.iospress.com | | 325 + content.iospress.com | link-loop | 247 + core.tdar.org | | 216 + core.tdar.org | no-pdf-link | 211 + cyberleninka.ru | | 646 + cyberleninka.ru | success | 620 + d197for5662m48.cloudfront.net | | 263 + d197for5662m48.cloudfront.net | success | 262 + dergipark.org.tr | | 891 + dergipark.org.tr | success | 526 + dergipark.org.tr | no-pdf-link | 261 + digi.ub.uni-heidelberg.de | | 427 + digi.ub.uni-heidelberg.de | no-pdf-link | 427 + direct.mit.edu | | 268 + direct.mit.edu | no-pdf-link | 208 + dl.acm.org | | 1719 + dl.acm.org | success | 829 + dl.acm.org | no-pdf-link | 546 + dl.acm.org | terminal-bad-status | 205 + dlc.library.columbia.edu | | 385 + dlc.library.columbia.edu | terminal-bad-status | 319 + doi.ala.org.au | | 724 + doi.ala.org.au | no-pdf-link | 721 + doi.apa.org | | 214 + doi.org | | 3390 + doi.org | terminal-bad-status | 2938 + doi.org | redirect-loop | 233 + doi.org | spn2-wayback-error | 208 + doi.usp.org | | 325 + doi.usp.org | no-pdf-link | 324 + downloads.hindawi.com | | 1439 + downloads.hindawi.com | success | 1436 + du.diva-portal.org | | 589 + du.diva-portal.org | success | 586 + econtents.bc.unicamp.br | | 310 + econtents.bc.unicamp.br | success | 310 + ediss.uni-goettingen.de | | 728 + ediss.uni-goettingen.de | success | 425 + elibrary.kdpu.edu.ua | | 907 + elibrary.kdpu.edu.ua | bad-redirect | 712 + elibrary.ru | | 925 + elibrary.ru | terminal-bad-status | 492 + elibrary.ru | bad-redirect | 230 + elibrary.vdi-verlag.de | | 393 + elifesciences.org | | 296 + elifesciences.org | success | 276 + europepmc.org | | 3024 + europepmc.org | success | 2541 + europepmc.org | terminal-bad-status | 463 + figshare.com | | 493 + figshare.com | no-pdf-link | 440 + files.osf.io | | 883 + files.osf.io | success | 686 + fjfsdata01prod.blob.core.windows.net | | 3869 + fjfsdata01prod.blob.core.windows.net | success | 3818 + ieeexplore.ieee.org | | 10854 + ieeexplore.ieee.org | gateway-timeout | 5495 + ieeexplore.ieee.org | spn2-backoff | 1662 + ieeexplore.ieee.org | no-pdf-link | 1417 + ieeexplore.ieee.org | success | 1410 + ieeexplore.ieee.org | redirect-loop | 768 + iiif.crossasia.org | | 7608 + iiif.crossasia.org | no-pdf-link | 7568 + ikee.lib.auth.gr | | 450 + ikee.lib.auth.gr | success | 332 + ins.journals.ekb.eg | | 212 + iopscience.iop.org | | 268 + jamanetwork.com | | 333 + journals.aps.org | | 414 + journals.asm.org | | 242 + journals.flvc.org | | 245 + journals.flvc.org | success | 242 + journals.healio.com | | 755 + journals.healio.com | terminal-bad-status | 668 + journals.lincoln.ac.nz | | 244 + journals.lincoln.ac.nz | success | 239 + journals.lww.com | | 1772 + journals.lww.com | link-loop | 1425 + journals.lww.com | spn2-backoff | 209 + journals.openedition.org | | 1192 + journals.openedition.org | redirect-loop | 467 + journals.openedition.org | success | 451 + journals.plos.org | | 771 + journals.plos.org | success | 750 + journals.ub.uni-heidelberg.de | | 787 + journals.ub.uni-heidelberg.de | success | 741 + kazanmedjournal.ru | | 240 + kazanmedjournal.ru | success | 231 + kiss.kstudy.com | | 219 + kiss.kstudy.com | no-pdf-link | 218 + kluwerlawonline.com | | 444 + kluwerlawonline.com | link-loop | 402 + libraetd.lib.virginia.edu | | 362 + libraetd.lib.virginia.edu | no-pdf-link | 361 + link.springer.com | | 305 + linkinghub.elsevier.com | | 568 + linkinghub.elsevier.com | spn2-backoff | 545 + ltu-figshare-repo.s3.aarnet.edu.au | | 269 + ltu-figshare-repo.s3.aarnet.edu.au | success | 268 + mausamjournal.imd.gov.in | | 202 + mdpi-res.com | | 8892 + mdpi-res.com | success | 8863 + mededpublish.org | | 1900 + mededpublish.org | no-pdf-link | 1900 + meetingorganizer.copernicus.org | | 276 + meetingorganizer.copernicus.org | no-pdf-link | 271 + muse.jhu.edu | | 1047 + muse.jhu.edu | terminal-bad-status | 755 + muse.jhu.edu | link-loop | 203 + online.ucpress.edu | | 358 + online.ucpress.edu | link-loop | 212 + onlinelibrary.wiley.com | | 5813 + onlinelibrary.wiley.com | terminal-bad-status | 4587 + onlinelibrary.wiley.com | spn2-wayback-error | 614 + onlinelibrary.wiley.com | blocked-cookie | 381 + open.library.ubc.ca | | 206 + opendata.uni-halle.de | | 1768 + opendata.uni-halle.de | success | 1215 + opendata.uni-halle.de | wrong-mimetype | 260 + opendata2.uni-halle.de | | 206 + opg.optica.org | | 205 + osf.io | | 2949 + osf.io | no-pdf-link | 2404 + osf.io | spn2-backoff | 299 + papers.ssrn.com | | 3962 + papers.ssrn.com | link-loop | 3800 + peerj.com | | 273 + preprints.jmir.org | | 275 + preprints.jmir.org | cdx-error | 255 + publikationen.bibliothek.kit.edu | | 213 + publons.com | | 593 + publons.com | no-pdf-link | 590 + pubs.acs.org | | 2288 + pubs.acs.org | terminal-bad-status | 1841 + pubs.acs.org | spn2-wayback-error | 210 + pubs.rsc.org | | 1698 + pubs.rsc.org | bad-redirect | 811 + pubs.rsc.org | link-loop | 352 + pubs.rsc.org | success | 307 + radiopaedia.org | | 220 + read.dukeupress.edu | | 303 + repositories.lib.utexas.edu | | 1570 + repositories.lib.utexas.edu | bad-redirect | 513 + repositories.lib.utexas.edu | spn2-backoff | 383 + repositories.lib.utexas.edu | gateway-timeout | 379 + repositories.lib.utexas.edu | terminal-bad-status | 282 + repository.uj.ac.za | | 489 + repository.uj.ac.za | no-pdf-link | 365 + repository.unsworks.unsw.edu.au | | 397 + repository.urosario.edu.co | | 2429 + repository.urosario.edu.co | success | 1648 + repository.urosario.edu.co | bad-redirect | 613 + rex.libraries.wsu.edu | no-pdf-link | 241 + rex.libraries.wsu.edu | | 241 + rsdjournal.org | | 208 + rsdjournal.org | success | 208 + s3-ap-southeast-2.amazonaws.com | | 282 + s3-ap-southeast-2.amazonaws.com | success | 277 + s3-eu-west-1.amazonaws.com | | 4615 + s3-eu-west-1.amazonaws.com | success | 4593 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | | 240 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success | 237 + sage.figshare.com | | 415 + sage.figshare.com | no-pdf-link | 385 + scholar.dkyobobook.co.kr | | 512 + scholar.dkyobobook.co.kr | no-pdf-link | 509 + scholarlypublishingcollective.org | | 287 + scholarworks.gsu.edu | | 1132 + scholarworks.gsu.edu | success | 1000 + scholarworks.iupui.edu | | 205 + scholarworks.umass.edu | | 417 + scholarworks.umass.edu | success | 400 + sciencescholar.us | | 404 + secure.jbs.elsevierhealth.com | | 727 + secure.jbs.elsevierhealth.com | terminal-bad-status | 722 + tandf.figshare.com | | 354 + tandf.figshare.com | no-pdf-link | 342 + unsworks.unsw.edu.au | | 408 + unsworks.unsw.edu.au | spn2-cdx-lookup-failure | 342 + valep.vc.univie.ac.at | no-pdf-link | 737 + valep.vc.univie.ac.at | | 737 + watermark.silverchair.com | | 1604 + watermark.silverchair.com | success | 1598 + wayf.switch.ch | | 215 + wayf.switch.ch | no-pdf-link | 213 + www.ahajournals.org | | 438 + www.ahajournals.org | no-pdf-link | 306 + www.ahbps.org | | 316 + www.ahbps.org | success | 312 + www.atenaeditora.com.br | | 390 + www.atenaeditora.com.br | terminal-bad-status | 333 + www.atlantis-press.com | | 914 + www.atlantis-press.com | success | 901 + www.atsjournals.org | | 1245 + www.atsjournals.org | success | 1189 + www.biorxiv.org | | 712 + www.biorxiv.org | success | 670 + www.bloomsburycollections.com | | 982 + www.bloomsburycollections.com | terminal-bad-status | 566 + www.cahiers-clsl.ch | | 305 + www.cahiers-clsl.ch | success | 298 + www.cairn.info | | 1799 + www.cairn.info | no-pdf-link | 662 + www.cairn.info | link-loop | 487 + www.cairn.info | success | 355 + www.cairn.info | terminal-bad-status | 267 + www.cambridge.org | | 3258 + www.cambridge.org | no-pdf-link | 1682 + www.cambridge.org | success | 682 + www.cambridge.org | bad-redirect | 404 + www.cambridge.org | link-loop | 302 + www.dbpia.co.kr | | 763 + www.dbpia.co.kr | no-pdf-link | 443 + www.dbpia.co.kr | redirect-loop | 287 + www.degruyter.com | | 12655 + www.degruyter.com | no-pdf-link | 9112 + www.degruyter.com | success | 2898 + www.degruyter.com | spn2-backoff | 507 + www.dib.ie | | 1381 + www.dib.ie | no-pdf-link | 1378 + www.dovepress.com | | 231 + www.dovepress.com | success | 216 + www.e-manuscripta.ch | | 767 + www.e-manuscripta.ch | success | 399 + www.e-periodica.ch | | 1406 + www.e-periodica.ch | no-pdf-link | 1402 + www.e-rara.ch | no-pdf-link | 251 + www.e-rara.ch | | 251 + www.editoracientifica.org | no-pdf-link | 205 + www.editoracientifica.org | | 205 + www.elgaronline.com | | 427 + www.elibrary.ru | | 616 + www.elibrary.ru | terminal-bad-status | 364 + www.elibrary.ru | no-pdf-link | 216 + www.emerald.com | | 862 + www.emerald.com | no-pdf-link | 724 + www.endocrine-abstracts.org | | 1907 + www.endocrine-abstracts.org | no-pdf-link | 1905 + www.eurekaselect.com | | 285 + www.eurekaselect.com | link-loop | 246 + www.even3.com.br | | 233 + www.frontiersin.org | | 585 + www.frontiersin.org | spn2-backoff | 436 + www.humankineticslibrary.com | no-pdf-link | 207 + www.humankineticslibrary.com | | 207 + www.igi-global.com | | 1600 + www.igi-global.com | no-pdf-link | 1199 + www.igi-global.com | bad-redirect | 258 + www.inderscience.com | | 385 + www.inderscience.com | no-pdf-link | 365 + www.inderscienceonline.com | | 202 + www.ingentaconnect.com | | 450 + www.ingentaconnect.com | no-pdf-link | 260 + www.jstage.jst.go.jp | | 1248 + www.jstage.jst.go.jp | success | 870 + www.karger.com | | 313 + www.liebertpub.com | | 271 + www.liebertpub.com | no-pdf-link | 241 + www.nicecjournal.co.uk | | 274 + www.nicecjournal.co.uk | success | 274 + www.nomos-elibrary.de | | 1771 + www.nomos-elibrary.de | no-pdf-link | 788 + www.nomos-elibrary.de | redirect-loop | 506 + www.nomos-elibrary.de | bad-redirect | 207 + www.osti.gov | | 381 + www.osti.gov | link-loop | 326 + www.persee.fr | | 277 + www.preprints.org | | 225 + www.preprints.org | success | 225 + www.protocols.io | | 770 + www.protocols.io | success | 485 + www.repository.cam.ac.uk | | 510 + www.repository.cam.ac.uk | bad-redirect | 213 + www.research-collection.ethz.ch | | 416 + www.research-collection.ethz.ch | bad-redirect | 249 + www.researchsquare.com | | 1121 + www.researchsquare.com | bad-redirect | 985 + www.scielo.br | | 828 + www.scielo.br | success | 641 + www.sciencedirect.com | | 8567 + www.sciencedirect.com | terminal-bad-status | 5773 + www.sciencedirect.com | spn2-wayback-error | 1590 + www.sciencedirect.com | no-pdf-link | 576 + www.sciencedirect.com | spn2-backoff | 479 + www.sciendo.com | | 257 + www.sciendo.com | success | 222 + www.scitepress.org | | 381 + www.scitepress.org | no-pdf-link | 377 + www.spiedigitallibrary.org | | 1061 + www.spiedigitallibrary.org | bad-redirect | 571 + www.spiedigitallibrary.org | gateway-timeout | 233 + www.tandfonline.com | | 4934 + www.tandfonline.com | no-pdf-link | 2088 + www.tandfonline.com | terminal-bad-status | 1282 + www.tandfonline.com | blocked-cookie | 757 + www.tandfonline.com | redirect-loop | 488 + www.tandfonline.com | spn2-wayback-error | 202 + www.taylorfrancis.com | | 3979 + www.taylorfrancis.com | link-loop | 1928 + www.taylorfrancis.com | no-pdf-link | 1840 + www.techniques-ingenieur.fr | | 354 + www.techniques-ingenieur.fr | no-pdf-link | 353 + www.thieme-connect.de | | 1987 + www.thieme-connect.de | no-pdf-link | 949 + www.thieme-connect.de | link-loop | 869 + www.tib.eu | no-pdf-link | 315 + www.tib.eu | | 315 + www.un-ilibrary.org | no-pdf-link | 352 + www.un-ilibrary.org | | 352 + www.worldscientific.com | | 668 + www.worldscientific.com | no-pdf-link | 629 + www.zora.uzh.ch | | 318 + zenodo.org | | 46585 + zenodo.org | no-pdf-link | 29519 + zenodo.org | success | 14768 + zenodo.org | terminal-bad-status | 810 + zenodo.org | wrong-mimetype | 691 + zenodo.org | spn2-cdx-lookup-failure | 395 + zenodo.org | spn2-backoff | 294 + zivahub.uct.ac.za | | 1909 + zivahub.uct.ac.za | no-pdf-link | 1880 + zop.zb.uzh.ch | | 228 + zop.zb.uzh.ch | success | 217 + | | 365582 + | success | 141497 38.7% + | no-pdf-link | 120852 33.0% + | terminal-bad-status | 31900 8.7% + | spn2-backoff | 16979 4.6% + | link-loop | 13624 3.7% + | bad-redirect | 8736 + | redirect-loop | 7405 + | gateway-timeout | 6997 + | spn2-cdx-lookup-failure | 5146 + | spn2-wayback-error | 3708 + | wrong-mimetype | 2158 + | blocked-cookie | 1942 + | spn2-error:blocked-url | 1733 + | wayback-error | 1063 + | spn2-error | 647 + | spn2-error:500 | 265 + | cdx-error | 257 +(383 rows) + +---- + +365k in 7 days is about 52k a day, which is about expected. Around 5-7% need +retries. + +important changes: +- biorxiv.org: needs fix and then retries +- academic.oup.com: should probably skip +- apps.crossref.org: need to handle this in code +- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/` +- doi.org: investigate redirect-loop and terminal-bad-status +- osf.io: not getting PDFs +- papers.ssrn.com: why are these attempted? +- publons.com: not getting PDFs; special case these? +- www.sciencedirect.com: not working at all? + +smaller: +- bridges.monash.edu: fix, then retry? +- dl.acm.org: some broader retries? +- figshare.com: still some attempts, but almost all no-pdf-link +- onlinelibrary.wiley.com: getting blocked broadly? +- www.endocrine-abstracts.org: HTML content? +- www.igi-global.com: no-pdf-link diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt new file mode 100644 index 0000000..be2b30c --- /dev/null +++ b/sql/stats/2022-09-06_stats.txt @@ -0,0 +1,438 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + + table_name | table_size | indexes_size | total_size + ------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 62 GB | 44 GB | 106 GB + "public"."ingest_request" | 51 GB | 50 GB | 101 GB + "public"."ingest_file_result" | 44 GB | 52 GB | 96 GB + "public"."file_meta" | 39 GB | 39 GB | 78 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 23 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 27 GB | 3089 MB | 30 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7469 MB | 66 MB | 7535 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes + (16 rows) + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + + total_count | total_size + -------------+----------------- + 198175106 | 282695671015403 + (1 row) + + 198 million files, 282 TBytes. + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + + mimetype | count + ---------------------------------------------------------------------------+----------- + application/pdf | 197021437 + text/html | 830331 + application/octet-stream | 186669 + application/xml | 42170 + application/xhtml+xml | 38207 + text/plain | 16471 + application/jats+xml | 10385 + application/gzip | 6681 + | 6032 + application/postscript | 4916 + image/jpeg | 4522 + application/vnd.ms-powerpoint | 1672 + application/msword | 946 + application/x-bzip2 | 891 + image/png | 659 + application/vnd.openxmlformats-officedocument.wordprocessingml.document | 440 + application/x-dosexec | 404 + image/gif | 395 + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet | 382 + application/x-compress | 274 + video/mp4 | 218 + application/zip | 131 + application/CDFV2-unknown | 99 + application/mac-binhex40 | 79 + application/zlib | 68 + text/x-tex | 44 + application/vnd.openxmlformats-officedocument.presentationml.presentation | 39 + text/x-php | 37 + image/g3fax | 35 + text/rtf | 33 + (30 rows) + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + + count + ------- + 12800 + (1 row) + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + + unique_sha1 | total + -------------+----------- + 137283420 | 172140506 + (1 row) + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + + mimetype | count + ----------------------------+----------- + application/pdf | 157465613 + warc/revisit | 11337336 + text/html | 1137208 + application/octet-stream | 950380 + text/xml | 528965 + unk | 253294 + application/postscript | 81130 + application/save | 81069 + binary/octet-stream | 68942 + application/x-download | 42717 + application/download | 40628 + image/pdf | 39904 + text/plain | 36445 + application/force-download | 24148 + multipart/form-data | 10972 + application | 5409 + application/x-octetstream | 5192 + application/x-msdownload | 3854 + .pdf | 3518 + application/x-pdf | 3061 + application/octet | 1792 + pdf | 1757 + application/binary | 1399 + file | 1373 + file/unknown | 1345 + application/pdf' | 1196 + application/octetstream | 1087 + application/unknown | 1005 + 0 | 773 + text/pdf | 729 + (30 rows) + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + + total_files + ------------- + 129001717 + (1 row) + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + + status_code | count + -------------+----------- + 200 | 120797098 + 500 | 8198783 + -4 | 5802 + 503 | 36 + (4 rows) + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + + grobid_version | count + ----------------------+---------- + 0.7.0-131-gdd0251d9f | 60469462 + 0.5.5-fatcat | 47472904 + | 12665498 + 0.7.0-104-gbeebd9a6b | 189243 + (4 rows) + + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + + unique_sha1 | total + -------------+--------- + 2868825 | 2887834 + (1 row) + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + ingest_type | link_source | count + -------------+-----------------+---------- + pdf | oai | 51185088 + pdf | unpaywall | 43932525 + pdf | doi | 43852308 + pdf | mag | 43701948 + pdf | doaj | 6534341 + html | doaj | 3987669 + pdf | arxiv | 2784589 + pdf | pmc | 2439181 + pdf | dblp | 631716 + html | doi | 126699 + xml | doaj | 23066 + pdf | cnki_covid19 | 2034 + pdf | spn | 1026 + pdf | wanfang_covid19 | 975 + html | spn | 65 + xml | spn | 2 + xml | doi | 1 + (17 rows) + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | ingest_request_source | count + -------------+-----------------+-------------------------+---------- + pdf | oai | metha-bulk | 51185088 + pdf | unpaywall | unpaywall | 43932525 + pdf | mag | mag-corpus | 43701948 + pdf | doi | fatcat-changelog | 24742500 + pdf | doi | fatcat-ingest | 15592121 + pdf | doaj | doaj | 6484737 + html | doaj | doaj | 3987468 + pdf | doi | fatcat-ingest-container | 3515873 + pdf | pmc | fatcat-ingest-container | 2028825 + pdf | arxiv | fatcat-ingest | 1984766 + pdf | arxiv | fatcat-changelog | 799793 + pdf | dblp | dblp | 631716 + pdf | pmc | fatcat-ingest | 297980 + html | doi | fatcat-ingest | 121508 + pdf | pmc | fatcat-changelog | 112376 + pdf | doaj | fatcat-changelog | 47181 + xml | doaj | doaj | 23066 + html | doi | fatcat-changelog | 5129 + pdf | doaj | fatcat-ingest | 2423 + pdf | cnki_covid19 | scrape-covid19 | 2034 + pdf | doi | savepapernow-web | 1814 + pdf | spn | savepapernow-web | 1026 + pdf | wanfang_covid19 | scrape-covid19 | 975 + html | doaj | fatcat-ingest | 201 + html | spn | savepapernow-web | 65 + html | doi | savepapernow-web | 62 + pdf | arxiv | fatcat-ingest-container | 26 + pdf | arxiv | savepapernow-web | 4 + xml | spn | savepapernow-web | 2 + xml | doi | savepapernow-web | 1 + (30 rows) + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + + ingest_type | link_source | count + -------------+-------------+-------- + pdf | mag | 167653 + pdf | doaj | 81517 + pdf | oai | 15282 + html | doaj | 1791 + pdf | unpaywall | 270 + pdf | doi | 22 + (6 rows) + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + + ingest_type | link_source | attempts | hits | fraction + -------------+-----------------+----------+----------+---------- + pdf | oai | 51185088 | 16024068 | 0.313 + pdf | unpaywall | 43932525 | 36045446 | 0.820 + pdf | doi | 43852308 | 14956080 | 0.341 + pdf | mag | 43701948 | 32768484 | 0.750 + pdf | doaj | 6534341 | 4704066 | 0.720 + html | doaj | 3987669 | 778165 | 0.195 + pdf | arxiv | 2784589 | 2419941 | 0.869 + pdf | pmc | 2439181 | 1897671 | 0.778 + pdf | dblp | 631716 | 305142 | 0.483 + html | doi | 126699 | 75754 | 0.598 + xml | doaj | 23066 | 10381 | 0.450 + pdf | cnki_covid19 | 2034 | 0 | 0.000 + pdf | spn | 1026 | 778 | 0.758 + pdf | wanfang_covid19 | 975 | 764 | 0.784 + html | spn | 65 | 13 | 0.200 + xml | spn | 2 | 1 | 0.500 + xml | doi | 1 | 0 | 0.000 + (17 rows) + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + + ingest_type | status | count + -------------+-------------------------------+---------- + pdf | success | 94887295 + pdf | no-pdf-link | 33960080 + pdf | no-capture | 20893916 + pdf | terminal-bad-status | 6973765 + pdf | redirect-loop | 5775175 + pdf | link-loop | 4095424 + pdf | skip-url-blocklist | 4037518 + pdf | blocked-cookie | 3508762 + html | wrong-scope | 1783694 + pdf | wrong-mimetype | 1379673 + html | success | 853762 + pdf | gateway-timeout | 635170 + html | no-capture | 381283 + pdf | wayback-content-error | 356694 + pdf | cdx-error | 347700 + pdf | null-body | 336166 + html | unknown-scope | 321874 + html | html-resource-no-capture | 294294 + pdf | forbidden | 291127 + pdf | not-found | 274343 + pdf | too-many-redirects | 264494 + component | wrong-mimetype | 196680 + component | spn2-cdx-lookup-failure | 173615 + component | spn2-backoff | 115840 + html | terminal-bad-status | 106264 + html | null-body | 100296 + pdf | wayback-error | 94748 + html | blocked-cookie | 88537 + component | no-capture | 75278 + pdf | empty-blob | 61157 + pdf | bad-redirect | 58680 + pdf | skip-wall | 57751 + pdf | spn2-error:too-many-redirects | 52873 + html | spn2-backoff | 50577 + pdf | remote-server-error | 41282 + pdf | invalid-host-resolution | 38864 + pdf | read-timeout | 37071 + pdf | spn2-cdx-lookup-failure | 34229 + html | wrong-mimetype | 33643 + pdf | spn2-backoff | 32437 + pdf | petabox-error | 31006 + html | wayback-content-error | 28034 + component | spn2-error | 27044 + pdf | spn2-error:unknown | 25810 + component | gateway-timeout | 25215 + pdf | body-too-large | 21721 + html | petabox-error | 18313 + html | empty-blob | 14393 + html | redirect-loop | 13404 + component | blocked-cookie | 12287 + (50 rows) + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + + ingest_type | terminal_status_code | count + -------------+----------------------+---------- + pdf | 200 | 45052391 + pdf | | 26117481 + pdf | 301 | 4814786 + html | 200 | 2684821 + pdf | 403 | 1871088 + pdf | 404 | 1254259 + pdf | 302 | 898728 + pdf | 503 | 867548 + pdf | 401 | 851205 + pdf | 429 | 741869 + pdf | 400 | 624519 + component | | 456915 + html | | 442051 + pdf | 500 | 283700 + component | 200 | 197510 + pdf | 410 | 120647 + pdf | 303 | 107947 + html | 404 | 80114 + pdf | 420 | 26722 + pdf | 502 | 19500 + pdf | 409 | 15499 + html | 429 | 15208 + pdf | 509 | 15167 + pdf | 999 | 12186 + pdf | 202 | 11535 + html | 301 | 10213 + xml | | 10018 + pdf | 307 | 8657 + pdf | 402 | 8338 + pdf | 412 | 8064 + pdf | 308 | 6479 + html | 500 | 4746 + xml | 200 | 2668 + pdf | 520 | 2496 + html | 302 | 2289 + pdf | 521 | 2257 + html | 202 | 2177 + pdf | 206 | 1961 + html | 403 | 1775 + pdf | 504 | 1187 + pdf | 421 | 1148 + html | 303 | 1112 + pdf | 406 | 1109 + pdf | 204 | 772 + pdf | 432 | 745 + pdf | 405 | 633 + html | 400 | 632 + pdf | 426 | 515 + pdf | 508 | 503 + pdf | 505 | 469 + (50 rows) diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt new file mode 100644 index 0000000..0a6254a --- /dev/null +++ b/sql/stats/2022-11-23_table_sizes.txt @@ -0,0 +1,21 @@ +PostgreSQL 13.2 - wbgrp-svc506.us.archive.org +Size: 1.13T + + table_name | table_size | indexes_size | total_size +------------------------------------+------------+--------------+------------ + "public"."crossref" | 459 GB | 10 GB | 470 GB + "public"."grobid" | 98 GB | 13 GB | 112 GB + "public"."cdx" | 63 GB | 45 GB | 108 GB + "public"."ingest_request" | 53 GB | 52 GB | 105 GB + "public"."ingest_file_result" | 46 GB | 55 GB | 100 GB + "public"."file_meta" | 39 GB | 40 GB | 79 GB + "public"."grobid_shadow" | 67 GB | 5455 MB | 73 GB + "public"."pdf_meta" | 24 GB | 7466 MB | 31 GB + "public"."grobid_refs" | 28 GB | 3306 MB | 31 GB + "public"."fatcat_file" | 13 GB | 7314 MB | 20 GB + "public"."shadow" | 9517 MB | 8026 MB | 17 GB + "public"."html_meta" | 7879 MB | 68 MB | 7947 MB + "public"."petabox" | 403 MB | 461 MB | 864 MB + "public"."pdftrio" | 550 MB | 297 MB | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB | 24 kB + "public"."crossref_with_refs" | 0 bytes | 0 bytes | 0 bytes diff --git a/sql/stats/README.md b/sql/stats/README.md new file mode 100644 index 0000000..3161514 --- /dev/null +++ b/sql/stats/README.md @@ -0,0 +1,109 @@ + +## SQL Table Sizes + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + WHERE table_schema = 'public' + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + +## File Metadata + +Counts and total file size: + + SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + +Top mimetypes: + + SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + +Missing full metadata: + + SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + +## CDX + +Total and unique-by-sha1 counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +mimetype counts: + + SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + +## GROBID + +Counts: + + SELECT COUNT(*) AS total_files FROM grobid; + +Status? + + SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + +What version used? + + SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + +## Petabox + +Counts: + + SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + +## Ingests + +Requests by source: + + SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + + SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + +Uncrawled requests by source: + + # TODO: verify this? + SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + WHERE ingest_file_result.base_url IS NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + +Results by source: + + SELECT + ingest_request.ingest_type, + ingest_request.link_source, + COUNT(*) as attempts, + COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits, + ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_request.base_url = ingest_file_result.base_url + AND ingest_request.ingest_type = ingest_file_result.ingest_type + AND ingest_file_result.ingest_type IS NOT NULL + GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + +Ingest result by status: + + SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + +Failed ingest by terminal status code: + + SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + diff --git a/sql/table_sizes.md b/sql/table_sizes.md new file mode 100644 index 0000000..3596b2b --- /dev/null +++ b/sql/table_sizes.md @@ -0,0 +1,11 @@ + +## September 2019 + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."cdx" | 31 GB | 27 GB | 58 GB + "public"."file_meta" | 13 GB | 6500 MB | 19 GB + "public"."shadow" | 8303 MB | 9216 MB | 17 GB + "public"."grobid" | 4994 MB | 6678 MB | 11 GB + "public"."fatcat_file" | 5206 MB | 2094 MB | 7300 MB + "public"."petabox" | 403 MB | 594 MB | 997 MB |