diff options
Diffstat (limited to 'sql')
| -rw-r--r-- | sql/Makefile | 35 | ||||
| -rw-r--r-- | sql/dump_reingest_bulk.sql | 31 | ||||
| -rw-r--r-- | sql/dump_reingest_old.sql | 36 | ||||
| -rw-r--r-- | sql/dump_reingest_quarterly.sql | 32 | ||||
| -rw-r--r-- | sql/dump_reingest_spn.sql | 25 | ||||
| -rw-r--r-- | sql/dump_reingest_terminalstatus.sql | 34 | ||||
| -rw-r--r-- | sql/dump_reingest_weekly.sql | 27 | ||||
| -rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 1 | ||||
| -rwxr-xr-x | sql/reingest_bulk.sh | 19 | ||||
| -rwxr-xr-x | sql/reingest_old.sh | 19 | ||||
| -rwxr-xr-x | sql/reingest_quarterly.sh | 2 | ||||
| -rwxr-xr-x | sql/reingest_spn.sh | 2 | ||||
| -rwxr-xr-x | sql/reingest_terminalstatus_forcerecrawl.sh | 19 | ||||
| -rwxr-xr-x | sql/reingest_weekly.sh | 4 | ||||
| -rw-r--r-- | sql/stats/2022-04-26_stats.txt | 432 | ||||
| -rw-r--r-- | sql/stats/2022-04-27_crawl_changelog.txt | 191 | ||||
| -rw-r--r-- | sql/stats/2022-05-11_crawl_changelog.txt | 410 | ||||
| -rw-r--r-- | sql/stats/2022-09-06_stats.txt | 438 | ||||
| -rw-r--r-- | sql/stats/2022-11-23_table_sizes.txt | 21 | 
19 files changed, 1751 insertions, 27 deletions
| diff --git a/sql/Makefile b/sql/Makefile new file mode 100644 index 0000000..860addb --- /dev/null +++ b/sql/Makefile @@ -0,0 +1,35 @@ + +SHELL=/bin/bash -euo pipefail +TODAY ?= $(shell date --iso --utc) +DATADIR ?= /srv/sandcrawler/tasks/$(TODAY) +DATESLUG ?= $(shell date +%Y-%m-%d.%H%M%S) +DATABASE_URL ?= sandcrawler + +.PHONY: help +help: ## Print info about all commands +	@echo "Commands:" +	@echo +	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "    \033[01;32m%-20s\033[0m %s\n", $$1, $$2}' + +.PHONY: create_datadir +create_datadir: +	mkdir -p $(DATADIR)/ +	sudo chmod a+rw $(DATADIR)/ + +$(DATADIR)/.DB_DUMP: +	sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=crossref sandcrawler > $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip +	mv $(DATADIR)/sandcrawler_${DATESLUG}.pgdump.wip $(DATADIR)/sandcrawler_${DATESLUG}.pgdump +	touch $@ + +.PHONY: database-snapshot +database-snapshot: create_datadir $(DATADIR)/.DB_DUMP ## Create SQL database snapshot +	@echo + +$(DATADIR)/.DB_UPLOADED: $(DATADIR)/.DB_DUMP +	ia upload --checksum sandcrawler_sqldump_$(TODAY) ia_sqldump_item_readme.md --remote-name=README.md -m collection:webgroup-internal-backups -m mediatype:data -m creator:"Internet Archive Web Group" -m date:$(TODAY) -m title:"Sandcrawler SQL Database Snapshot ($(TODAY))" +	ia upload --checksum sandcrawler_sqldump_$(TODAY) $(DATADIR)/sandcrawler_*.pgdump +	touch $@ + +.PHONY: upload-database-snapshot +upload-database-snapshot: create_datadir database-snapshot $(DATADIR)/.DB_UPLOADED ## Upload database snapshot to archive.org +	@echo diff --git a/sql/dump_reingest_bulk.sql b/sql/dump_reingest_bulk.sql new file mode 100644 index 0000000..698db7a --- /dev/null +++ b/sql/dump_reingest_bulk.sql @@ -0,0 +1,31 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON +        ingest_file_result.base_url = ingest_request.base_url +        AND ingest_file_result.ingest_type = ingest_request.ingest_type +    WHERE +        (ingest_request.ingest_type = 'pdf' +            OR ingest_request.ingest_type = 'html') +        AND ingest_file_result.hit = false +        AND ingest_request.created < NOW() - '24 hour'::INTERVAL +        AND ingest_request.created > NOW() - '181 day'::INTERVAL +        AND (ingest_request.ingest_request_source = 'fatcat-changelog' +             OR ingest_request.ingest_request_source = 'fatcat-ingest') +        AND ( +            ingest_file_result.status like 'spn2-%' +            OR ingest_file_result.status like 'cdx-error' +            OR ingest_file_result.status like 'petabox-error' +        ) +        AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' +        AND ingest_file_result.status != 'spn2-error:filesize-limit' +        AND ingest_file_result.status != 'spn2-error:not-found' +        AND ingest_file_result.status != 'spn2-error:blocked-url' +        AND ingest_file_result.status != 'spn2-error:too-many-redirects' +        AND ingest_file_result.status != 'spn2-error:network-authentication-required' +        AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_bulk_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_old.sql b/sql/dump_reingest_old.sql new file mode 100644 index 0000000..7473420 --- /dev/null +++ b/sql/dump_reingest_old.sql @@ -0,0 +1,36 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON +        ingest_file_result.base_url = ingest_request.base_url +        AND ingest_file_result.ingest_type = ingest_request.ingest_type +    WHERE +        ingest_file_result.hit = false +        AND ingest_request.created < NOW() - '6 day'::INTERVAL +        -- AND ingest_request.created > NOW() - '181 day'::INTERVAL +        AND (ingest_request.ingest_request_source = 'fatcat-changelog' +             OR ingest_request.ingest_request_source = 'fatcat-ingest' +             OR ingest_request.ingest_request_source = 'fatcat-ingest-container' +             OR ingest_request.ingest_request_source = 'unpaywall' +             OR ingest_request.ingest_request_source = 'arxiv' +             OR ingest_request.ingest_request_source = 'pmc' +             OR ingest_request.ingest_request_source = 'doaj' +             OR ingest_request.ingest_request_source = 'dblp') +        AND ( +            ingest_file_result.status like 'spn2-%' +            -- OR ingest_file_result.status like 'no-capture' +            -- OR ingest_file_result.status like 'cdx-error' +            -- OR ingest_file_result.status like 'petabox-error' +        ) +        AND ingest_file_result.status != 'spn2-error:invalid-url-syntax' +        AND ingest_file_result.status != 'spn2-error:filesize-limit' +        AND ingest_file_result.status != 'spn2-error:not-found' +        AND ingest_file_result.status != 'spn2-error:blocked-url' +        AND ingest_file_result.status != 'spn2-error:too-many-redirects' +        AND ingest_file_result.status != 'spn2-error:network-authentication-required' +        AND ingest_file_result.status != 'spn2-error:unknown' +) TO '/srv/sandcrawler/tasks/reingest_old_current.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_quarterly.sql b/sql/dump_reingest_quarterly.sql index 725a404..dbeb199 100644 --- a/sql/dump_reingest_quarterly.sql +++ b/sql/dump_reingest_quarterly.sql @@ -1,20 +1,35 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +  COPY (      SELECT row_to_json(ingest_request.*) FROM ingest_request -    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url -    WHERE ingest_request.ingest_type = 'pdf' +    LEFT JOIN ingest_file_result ON +        ingest_file_result.base_url = ingest_request.base_url +        AND ingest_file_result.ingest_type = ingest_request.ingest_type +    WHERE +        (ingest_request.ingest_type = 'pdf' +            OR ingest_request.ingest_type = 'html' +            OR ingest_request.ingest_type = 'xml' +            OR ingest_request.ingest_type = 'component')          AND ingest_file_result.hit = false          AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '91 day'::INTERVAL          AND (ingest_request.ingest_request_source = 'fatcat-changelog' -             OR ingest_request.ingest_request_source = 'fatcat-ingest') +             OR ingest_request.ingest_request_source = 'fatcat-ingest' +             OR ingest_request.ingest_request_source = 'fatcat-ingest-container' +             OR ingest_request.ingest_request_source = 'unpaywall' +             OR ingest_request.ingest_request_source = 'arxiv' +             OR ingest_request.ingest_request_source = 'pmc' +             OR ingest_request.ingest_request_source = 'doaj' +             OR ingest_request.ingest_request_source = 'dblp')          AND (              ingest_file_result.status like 'spn2-%' -            OR ingest_file_result.status like 'cdx-error' -            OR ingest_file_result.status like 'wayback-error' -            OR ingest_file_result.status like 'wayback-content-error' -            OR ingest_file_result.status like 'petabox-error' -            OR ingest_file_result.status like 'gateway-timeout' +            OR ingest_file_result.status = 'cdx-error' +            OR ingest_file_result.status = 'wayback-error' +            -- OR ingest_file_result.status = 'wayback-content-error' +            OR ingest_file_result.status = 'petabox-error' +            OR ingest_file_result.status = 'gateway-timeout' +            OR ingest_file_result.status = 'no-capture'          )          AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'          AND ingest_file_result.status != 'spn2-error:filesize-limit' @@ -29,3 +44,4 @@ COPY (  --      AND (ingest_request.ingest_request_source != 'fatcat-changelog'  --           AND ingest_request.ingest_request_source != 'fatcat-ingest') +ROLLBACK; diff --git a/sql/dump_reingest_spn.sql b/sql/dump_reingest_spn.sql index 6ef08c1..a83125c 100644 --- a/sql/dump_reingest_spn.sql +++ b/sql/dump_reingest_spn.sql @@ -1,19 +1,28 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +  COPY (      SELECT row_to_json(ingest_request.*) FROM ingest_request -    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url -    WHERE ingest_request.ingest_type = 'pdf' +    LEFT JOIN ingest_file_result ON +        ingest_file_result.base_url = ingest_request.base_url +        AND ingest_file_result.ingest_type = ingest_request.ingest_type +    WHERE +        (ingest_request.ingest_type = 'pdf' +            OR ingest_request.ingest_type = 'html' +            OR ingest_request.ingest_type = 'xml' +            OR ingest_request.ingest_type = 'component')          AND ingest_file_result.hit = false          AND ingest_request.created < NOW() - '6 hour'::INTERVAL          AND ingest_request.created > NOW() - '180 day'::INTERVAL          AND ingest_request.ingest_request_source = 'savepapernow-web'          AND (              ingest_file_result.status like 'spn2-%' -            -- OR ingest_file_result.status like 'cdx-error' -            -- OR ingest_file_result.status like 'wayback-error' -            -- OR ingest_file_result.status like 'wayback-content-error' -            OR ingest_file_result.status like 'petabox-error' -            -- OR ingest_file_result.status like 'gateway-timeout' +            -- OR ingest_file_result.status = 'cdx-error' +            -- OR ingest_file_result.status = 'wayback-error' +            -- OR ingest_file_result.status = 'wayback-content-error' +            OR ingest_file_result.status = 'petabox-error' +            -- OR ingest_file_result.status = 'gateway-timeout' +            OR ingest_file_result.status = 'no-capture'          )          AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'          AND ingest_file_result.status != 'spn2-error:filesize-limit' @@ -23,3 +32,5 @@ COPY (          AND ingest_file_result.status != 'spn2-error:network-authentication-required'          AND ingest_file_result.status != 'spn2-error:unknown'  ) TO '/srv/sandcrawler/tasks/reingest_spn.rows.json'; + +ROLLBACK; diff --git a/sql/dump_reingest_terminalstatus.sql b/sql/dump_reingest_terminalstatus.sql new file mode 100644 index 0000000..b72a096 --- /dev/null +++ b/sql/dump_reingest_terminalstatus.sql @@ -0,0 +1,34 @@ + +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; + +COPY ( +    SELECT row_to_json(ingest_request.*) FROM ingest_request +    LEFT JOIN ingest_file_result ON +        ingest_file_result.base_url = ingest_request.base_url +        AND ingest_file_result.ingest_type = ingest_request.ingest_type +    WHERE +        ingest_file_result.hit = false +        AND ingest_request.created < NOW() - '72 hour'::INTERVAL +        AND ingest_request.created > NOW() - '10 day'::INTERVAL +        AND (ingest_request.ingest_request_source = 'fatcat-changelog' +             OR ingest_request.ingest_request_source = 'fatcat-ingest') +        AND ingest_file_result.status = 'terminal-bad-status' +        AND ( +             ingest_file_result.terminal_status_code = 500 +             OR ingest_file_result.terminal_status_code = 502 +             OR ingest_file_result.terminal_status_code = 503 +             OR ingest_file_result.terminal_status_code = 429 +             OR ingest_file_result.terminal_status_code = 404 +        ) +	AND ( +		ingest_request.base_url LIKE 'https://doi.org/10.3390/%' +		OR ingest_request.base_url LIKE 'https://doi.org/10.1103/%' +		OR ingest_request.base_url LIKE 'https://doi.org/10.1155/%' +	) +) TO '/srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json'; + +-- bulk re-tries would be: +--      AND (ingest_request.ingest_request_source != 'fatcat-changelog' +--           AND ingest_request.ingest_request_source != 'fatcat-ingest') + +ROLLBACK; diff --git a/sql/dump_reingest_weekly.sql b/sql/dump_reingest_weekly.sql index 65800eb..a019938 100644 --- a/sql/dump_reingest_weekly.sql +++ b/sql/dump_reingest_weekly.sql @@ -1,20 +1,30 @@ +BEGIN TRANSACTION ISOLATION LEVEL SERIALIZABLE READ ONLY DEFERRABLE; +  COPY (      SELECT row_to_json(ingest_request.*) FROM ingest_request -    LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url -    WHERE ingest_request.ingest_type = 'pdf' +    LEFT JOIN ingest_file_result ON +        ingest_file_result.base_url = ingest_request.base_url +        AND ingest_file_result.ingest_type = ingest_request.ingest_type +    WHERE +        (ingest_request.ingest_type = 'pdf' +            OR ingest_request.ingest_type = 'html' +            OR ingest_request.ingest_type = 'xml' +            OR ingest_request.ingest_type = 'component')          AND ingest_file_result.hit = false          AND ingest_request.created < NOW() - '8 hour'::INTERVAL          AND ingest_request.created > NOW() - '8 day'::INTERVAL          AND (ingest_request.ingest_request_source = 'fatcat-changelog' -             OR ingest_request.ingest_request_source = 'fatcat-ingest') +             OR ingest_request.ingest_request_source = 'fatcat-ingest' +             OR ingest_request.ingest_request_source = 'fatcat-ingest-container')          AND (              ingest_file_result.status like 'spn2-%' -            -- OR ingest_file_result.status like 'cdx-error' -            -- OR ingest_file_result.status like 'wayback-error' -            -- OR ingest_file_result.status like 'wayback-content-error' -            OR ingest_file_result.status like 'petabox-error' -            -- OR ingest_file_result.status like 'gateway-timeout' +            -- OR ingest_file_result.status = 'cdx-error' +            -- OR ingest_file_result.status = 'wayback-error' +            -- OR ingest_file_result.status = 'wayback-content-error' +            OR ingest_file_result.status = 'petabox-error' +            -- OR ingest_file_result.status = 'gateway-timeout' +            OR ingest_file_result.status = 'no-capture'          )          AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'          AND ingest_file_result.status != 'spn2-error:filesize-limit' @@ -29,3 +39,4 @@ COPY (  --      AND (ingest_request.ingest_request_source != 'fatcat-changelog'  --           AND ingest_request.ingest_request_source != 'fatcat-ingest') +ROLLBACK; diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 23a935e..33dba66 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -149,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (      PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)  );  CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); +CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created);  CREATE TABLE IF NOT EXISTS ingest_file_result (      ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), diff --git a/sql/reingest_bulk.sh b/sql/reingest_bulk.sh new file mode 100755 index 0000000..d39a171 --- /dev/null +++ b/sql/reingest_bulk.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e              # fail on error +set -u              # fail if variable not set in substitution +set -o pipefail     # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_bulk.sql + +cd ../python +sudo -u sandcrawler pipenv run \ +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_bulk_current.rows.json \ +    > /srv/sandcrawler/tasks/reingest_bulk_current.json + +cat /srv/sandcrawler/tasks/reingest_bulk_current.json \ +    | shuf \ +    | head -n1000000 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + diff --git a/sql/reingest_old.sh b/sql/reingest_old.sh new file mode 100755 index 0000000..96e5416 --- /dev/null +++ b/sql/reingest_old.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e              # fail on error +set -u              # fail if variable not set in substitution +set -o pipefail     # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_old.sql + +cd ../python +sudo -u sandcrawler pipenv run \ +    ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/reingest_old_current.rows.json \ +    > /srv/sandcrawler/tasks/reingest_old_current.json + +cat /srv/sandcrawler/tasks/reingest_old_current.json \ +    | shuf \ +    | head -n1000000 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_quarterly.sh b/sql/reingest_quarterly.sh index 20fd82b..8a2996c 100755 --- a/sql/reingest_quarterly.sh +++ b/sql/reingest_quarterly.sh @@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_quarterly_current.json \      | shuf \      | head -n120000 \      | jq . -c \ -    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 diff --git a/sql/reingest_spn.sh b/sql/reingest_spn.sh index 6fb1e4b..c693a64 100755 --- a/sql/reingest_spn.sh +++ b/sql/reingest_spn.sh @@ -15,5 +15,5 @@ cat /srv/sandcrawler/tasks/reingest_spn.json \      | shuf \      | head -n60000 \      | jq . -c \ -    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-priority -p -1 diff --git a/sql/reingest_terminalstatus_forcerecrawl.sh b/sql/reingest_terminalstatus_forcerecrawl.sh new file mode 100755 index 0000000..5cb6d51 --- /dev/null +++ b/sql/reingest_terminalstatus_forcerecrawl.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e              # fail on error +set -u              # fail if variable not set in substitution +set -o pipefail     # fail if part of a '|' command fails + +sudo -u postgres psql sandcrawler < dump_reingest_terminalstatus.sql + +cd ../python +sudo -u sandcrawler pipenv run \ +    ./scripts/ingestrequest_row2json.py --force-recrawl /srv/sandcrawler/tasks/reingest_terminalstatus_current.rows.json \ +    > /srv/sandcrawler/tasks/reingest_terminalstatus_current.json + +cat /srv/sandcrawler/tasks/reingest_terminalstatus_current.json \ +    | shuf \ +    | head -n100000 \ +    | jq . -c \ +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + diff --git a/sql/reingest_weekly.sh b/sql/reingest_weekly.sh index 04ce39d..d2e2444 100755 --- a/sql/reingest_weekly.sh +++ b/sql/reingest_weekly.sh @@ -13,7 +13,7 @@ sudo -u sandcrawler pipenv run \  cat /srv/sandcrawler/tasks/reingest_weekly_current.json \      | shuf \ -    | head -n60000 \ +    | head -n80000 \      | jq . -c \ -    | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 +    | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 diff --git a/sql/stats/2022-04-26_stats.txt b/sql/stats/2022-04-26_stats.txt new file mode 100644 index 0000000..bd20c5c --- /dev/null +++ b/sql/stats/2022-04-26_stats.txt @@ -0,0 +1,432 @@ + +## SQL Table Sizes + +    SELECT +        table_name, +        pg_size_pretty(table_size) AS table_size, +        pg_size_pretty(indexes_size) AS indexes_size, +        pg_size_pretty(total_size) AS total_size +      FROM ( +          SELECT +              table_name, +              pg_table_size(table_name) AS table_size, +              pg_indexes_size(table_name) AS indexes_size, +              pg_total_relation_size(table_name) AS total_size +          FROM ( +              SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name +              FROM information_schema.tables +              WHERE table_schema = 'public' +          ) AS all_tables +          ORDER BY total_size DESC +      ) AS pretty_sizes; + +                 table_name             | table_size | indexes_size | total_size  +    ------------------------------------+------------+--------------+------------ +     "public"."crossref"                | 416 GB     | 10 GB        | 426 GB +     "public"."grobid"                  | 98 GB      | 13 GB        | 112 GB +     "public"."cdx"                     | 58 GB      | 41 GB        | 99 GB +     "public"."ingest_request"          | 50 GB      | 48 GB        | 98 GB +     "public"."ingest_file_result"      | 42 GB      | 48 GB        | 90 GB +     "public"."grobid_shadow"           | 67 GB      | 5455 MB      | 73 GB +     "public"."file_meta"               | 37 GB      | 34 GB        | 71 GB +     "public"."pdf_meta"                | 21 GB      | 7386 MB      | 29 GB +     "public"."grobid_refs"             | 23 GB      | 2516 MB      | 26 GB +     "public"."fatcat_file"             | 13 GB      | 7314 MB      | 20 GB +     "public"."shadow"                  | 9517 MB    | 8026 MB      | 17 GB +     "public"."html_meta"               | 3015 MB    | 31 MB        | 3046 MB +     "public"."petabox"                 | 403 MB     | 461 MB       | 864 MB +     "public"."pdftrio"                 | 550 MB     | 297 MB       | 847 MB +     "public"."ingest_fileset_platform" | 8192 bytes | 16 kB        | 24 kB +     "public"."crossref_with_refs"      | 0 bytes    | 0 bytes      | 0 bytes +    (16 rows) + + +## File Metadata + +Counts and total file size: + +    SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + +     total_count |   total_size +    -------------+----------------- +       192402128 | 271919997557597 +    (1 row) + +    # 271,919,997,557,597 -> ~272 TByte + +Top mimetypes: + +    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + +                                     mimetype                                  |   count    +    ---------------------------------------------------------------------------+----------- +     application/pdf                                                           | 191760695 +     text/html                                                                 |    330351 +     application/octet-stream                                                  |    186696 +     application/xml                                                           |     42170 +     application/xhtml+xml                                                     |     31470 +     text/plain                                                                |     16449 +     application/jats+xml                                                      |      6902 +     application/gzip                                                          |      6681 +                                                                               |      6033 +     application/postscript                                                    |      4916 +     image/jpeg                                                                |      2901 +     application/vnd.ms-powerpoint                                             |      1672 +     application/msword                                                        |       934 +     application/x-bzip2                                                       |       891 +     image/png                                                                 |       476 +     application/x-dosexec                                                     |       404 +     image/gif                                                                 |       395 +     application/vnd.openxmlformats-officedocument.wordprocessingml.document   |       374 +     application/vnd.openxmlformats-officedocument.spreadsheetml.sheet         |       294 +     application/x-compress                                                    |       274 +     video/mp4                                                                 |       150 +     application/zip                                                           |       131 +     application/CDFV2-unknown                                                 |        99 +     application/mac-binhex40                                                  |        79 +     application/zlib                                                          |        68 +     text/x-tex                                                                |        44 +     application/vnd.openxmlformats-officedocument.presentationml.presentation |        39 +     text/x-php                                                                |        37 +     image/g3fax                                                               |        35 +     text/rtf                                                                  |        33 +    (30 rows) + +Missing full metadata: + +    SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + +     count +    ------- +     12831 +    (1 row) + +## CDX + +Total and unique-by-sha1 counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +     unique_sha1 |   total +    -------------+----------- +       130732381 | 162760251 +    (1 row) + +mimetype counts: + +    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + +              mimetype          |   count +    ----------------------------+----------- +     application/pdf            | 149749828 +     warc/revisit               |  10437210 +     application/octet-stream   |    733161 +     text/html                  |    642992 +     text/xml                   |    525483 +     unk                        |    217642 +     application/postscript     |     81127 +     application/save           |     81023 +     binary/octet-stream        |     67938 +     application/x-download     |     41137 +     image/pdf                  |     39712 +     application/download       |     37153 +     text/plain                 |     36342 +     application/force-download |     21496 +     multipart/form-data        |      9792 +     application                |      5366 +     application/x-octetstream  |      5166 +     application/x-msdownload   |      3851 +     .pdf                       |      3445 +     application/x-pdf          |      3018 +     pdf                        |      1618 +     file                       |      1370 +     application/binary         |      1354 +     file/unknown               |      1345 +     application/pdf'           |      1196 +     application/octetstream    |      1047 +     application/unknown        |      1001 +     0                          |       773 +     text/pdf                   |       729 +     application/blob           |       673 +    (30 rows) + +## GROBID + +Counts: + +    SELECT COUNT(*) AS total_files FROM grobid; + +     total_files +    ------------- +       123669603 +    (1 row) + + +Status? + +    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + +     status_code |   count +    -------------+----------- +             200 | 115668412 +             500 |   7995428 +              -4 |      5745 +             503 |        18 +    (4 rows) + + +What version used? + +    SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + +        grobid_version    |  count +    ----------------------+---------- +     0.7.0-131-gdd0251d9f | 54780825 +     0.5.5-fatcat         | 48003940 +                          | 12694404 +     0.7.0-104-gbeebd9a6b |   189243 +    (4 rows) + +## Petabox + +Counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + +     unique_sha1 |  total +    -------------+--------- +         2868825 | 2887834 +    (1 row) + + +## Ingests + +Requests by source: + +    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type |   link_source   |  count +    -------------+-----------------+---------- +     pdf         | oai             | 51185088 +     pdf         | unpaywall       | 43932525 +     pdf         | mag             | 43701948 +     pdf         | doi             | 40044585 +     pdf         | doaj            |  6016771 +     html        | doaj            |  3648181 +     pdf         | arxiv           |  2676200 +     pdf         | pmc             |  2402453 +     html        | doi             |    41492 +     xml         | doaj            |    20638 +     pdf         | cnki_covid19    |     2034 +     pdf         | wanfang_covid19 |      975 +     pdf         | spn             |      829 +     html        | spn             |       52 +     xml         | doi             |        1 +     xml         | spn             |        1 +    (16 rows) + +    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + +     ingest_type |   link_source   |  ingest_request_source  |  count +    -------------+-----------------+-------------------------+---------- +     pdf         | oai             | metha-bulk              | 51185088 +     pdf         | unpaywall       | unpaywall               | 43932525 +     pdf         | mag             | mag-corpus              | 43701948 +     pdf         | doi             | fatcat-changelog        | 20936949 +     pdf         | doi             | fatcat-ingest           | 15590201 +     pdf         | doaj            | doaj                    |  6016771 +     html        | doaj            | doaj                    |  3648181 +     pdf         | doi             | fatcat-ingest-container |  3515873 +     pdf         | pmc             | fatcat-ingest-container |  2028825 +     pdf         | arxiv           | fatcat-ingest           |  1984766 +     pdf         | arxiv           | fatcat-changelog        |   691405 +     pdf         | pmc             | fatcat-ingest           |   297646 +     pdf         | pmc             | fatcat-changelog        |    75982 +     html        | doi             | fatcat-ingest           |    37904 +     xml         | doaj            | doaj                    |    20638 +     html        | doi             | fatcat-changelog        |     3534 +     pdf         | cnki_covid19    | scrape-covid19          |     2034 +     pdf         | doi             | savepapernow-web        |     1562 +     pdf         | wanfang_covid19 | scrape-covid19          |      975 +     pdf         | spn             | savepapernow-web        |      829 +     html        | doi             | savepapernow-web        |       54 +     html        | spn             | savepapernow-web        |       52 +     pdf         | arxiv           | fatcat-ingest-container |       26 +     pdf         | arxiv           | savepapernow-web        |        3 +     xml         | doi             | savepapernow-web        |        1 +     xml         | spn             | savepapernow-web        |        1 +    (26 rows) + +Uncrawled requests by source: + +    # TODO: verify this? +    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +    WHERE ingest_file_result.base_url IS NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + +     ingest_type | link_source |  count +    -------------+-------------+--------- +     pdf         | doaj        | 1619621 +     html        | doaj        | 1208412 +     pdf         | mag         |  167653 +     pdf         | oai         |   15282 +     xml         | doaj        |   11196 +     pdf         | unpaywall   |     270 +     pdf         | doi         |      22 +    (7 rows) + +Results by source: + +    SELECT +        ingest_request.ingest_type, +        ingest_request.link_source, +        COUNT(*) as attempts, +        COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,  +        ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +        AND ingest_file_result.ingest_type IS NOT NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + +     ingest_type |   link_source   | attempts |   hits   | fraction  +    -------------+-----------------+----------+----------+---------- +     pdf         | oai             | 51185088 | 15968290 |    0.312 +     pdf         | unpaywall       | 43932525 | 32618045 |    0.742 +     pdf         | mag             | 43701948 | 32662926 |    0.747 +     pdf         | doi             | 40044738 | 10925369 |    0.273 +     pdf         | doaj            |  6016771 |  3042569 |    0.506 +     html        | doaj            |  3648181 |   344208 |    0.094 +     pdf         | arxiv           |  2676206 |  2269708 |    0.848 +     pdf         | pmc             |  2402453 |  1855679 |    0.772 +     html        | doi             |    41492 |     1739 |    0.042 +     xml         | doaj            |    20638 |     6899 |    0.334 +     pdf         | cnki_covid19    |     2034 |        0 |    0.000 +     pdf         | wanfang_covid19 |      975 |      764 |    0.784 +     pdf         | spn             |      829 |      616 |    0.743 +     html        | spn             |       52 |        7 |    0.135 +     xml         | doi             |        1 |        0 |    0.000 +     xml         | spn             |        1 |        0 |    0.000 +    (16 rows) + +Ingest result by status: + +    SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + +     ingest_type |             status              |  count +    -------------+---------------------------------+---------- +     pdf         | success                         | 85709322 +     pdf         | no-pdf-link                     | 29713304 +     pdf         | no-capture                      | 26632191 +     pdf         | redirect-loop                   | 10979145 +     pdf         | terminal-bad-status             |  4977000 +     pdf         | link-loop                       |  3434877 +     pdf         | skip-url-blocklist              |  3114258 +     pdf         | blocked-cookie                  |  2156835 +     html        | wrong-scope                     |  1126911 +     pdf         | wrong-mimetype                  |   980546 +     pdf         | gateway-timeout                 |   651562 +     pdf         | spn2-cdx-lookup-failure         |   484016 +     pdf         | spn2-backoff                    |   399382 +     pdf         | cdx-error                       |   373964 +     pdf         | wayback-content-error           |   354370 +     html        | success                         |   345860 +     pdf         | null-body                       |   336182 +     pdf         | spn2-error:500                  |   309755 +     pdf         | forbidden                       |   291175 +     pdf         | not-found                       |   275560 +     pdf         | too-many-redirects              |   262312 +     html        | unknown-scope                   |   230352 +     html        | redirect-loop                   |   226596 +     html        | html-resource-no-capture        |   205646 +     html        | no-capture                      |   164014 +     component   | spn2-cdx-lookup-failure         |   148825 +     component   | wrong-mimetype                  |   130344 +     html        | null-body                       |   100296 +     pdf         | wayback-error                   |    94286 +     pdf         | spn2-wayback-error              |    81365 +     component   | no-capture                      |    75278 +     pdf         | spn2-error                      |    69830 +     pdf         | skip-wall                       |    57744 +     pdf         | spn2-error:too-many-redirects   |    53808 +     pdf         | remote-server-error             |    41286 +     pdf         | petabox-error                   |    38800 +     pdf         | invalid-host-resolution         |    37337 +     pdf         | read-timeout                    |    36872 +     component   | spn2-backoff                    |    33217 +     pdf         | empty-blob                      |    27946 +     component   | spn2-error                      |    24078 +     pdf         | spn2-error:unknown              |    23697 +     component   | gateway-timeout                 |    23139 +     html        | wrong-mimetype                  |    22731 +     html        | wayback-content-error           |    20507 +     pdf         | spn2-error:host-crawling-paused |    19900 +     pdf         | bad-redirect                    |    19183 +     html        | terminal-bad-status             |    13354 +     component   | blocked-cookie                  |    12287 +     component   | spn2-error:500                  |    11271 +    (50 rows) + +Failed ingest by terminal status code: + +    SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + +     ingest_type | terminal_status_code |  count +    -------------+----------------------+---------- +     pdf         |                  200 | 38144779 +     pdf         |                      | 32762240 +     pdf         |                  301 |  9433087 +     html        |                  200 |  1716127 +     pdf         |                  403 |  1416632 +     pdf         |                  302 |  1134668 +     pdf         |                  404 |   888853 +     pdf         |                  401 |   746311 +     pdf         |                  503 |   655894 +     pdf         |                  400 |   531479 +     component   |                      |   337603 +     pdf         |                  500 |   247944 +     html        |                  301 |   224237 +     html        |                      |   167194 +     pdf         |                  303 |   135048 +     component   |                  200 |   130663 +     pdf         |                  429 |    93489 +     pdf         |                  410 |    67392 +     pdf         |                  420 |    26722 +     pdf         |                  502 |    18770 +     pdf         |                  409 |    15152 +     pdf         |                  509 |    15113 +     pdf         |                  999 |    11747 +     html        |                  404 |     9879 +     pdf         |                  307 |     8895 +     pdf         |                  412 |     7053 +     pdf         |                  308 |     6627 +     pdf         |                  202 |     5289 +     xml         |                  200 |     2540 +     html        |                  500 |     2480 +     pdf         |                  520 |     2220 +     pdf         |                  521 |     1844 +     pdf         |                  206 |     1739 +     html        |                  302 |     1407 +     pdf         |                  504 |     1146 +     html        |                  303 |     1123 +     pdf         |                  421 |      986 +     pdf         |                  406 |      938 +     pdf         |                  204 |      498 +     pdf         |                  505 |      468 +     pdf         |                  300 |      436 +     pdf         |                  508 |      422 +     pdf         |                  426 |      405 +     html        |                  429 |      402 +     html        |                  403 |      398 +     pdf         |                  432 |      366 +     component   |                  301 |      294 +     pdf         |                  405 |      210 +     pdf         |                  226 |      166 +     component   |                  302 |      128 +    (50 rows) + diff --git a/sql/stats/2022-04-27_crawl_changelog.txt b/sql/stats/2022-04-27_crawl_changelog.txt new file mode 100644 index 0000000..864abd4 --- /dev/null +++ b/sql/stats/2022-04-27_crawl_changelog.txt @@ -0,0 +1,191 @@ +                domain                |         status          | count   +--------------------------------------+-------------------------+-------- + academic.oup.com                     |                         |   1243 + academic.oup.com                     | spn2-cdx-lookup-failure |    990 + aip.scitation.org                    |                         |    313 + aip.scitation.org                    | spn2-cdx-lookup-failure |    224 + ajps.uomustansiriyah.edu.iq          |                         |    235 + apps.crossref.org                    |                         |   1329 + apps.crossref.org                    | spn2-cdx-lookup-failure |    942 + apps.crossref.org                    | no-pdf-link             |    387 + archaeologydataservice.ac.uk         |                         |    422 + archaeologydataservice.ac.uk         | spn2-cdx-lookup-failure |    289 + arxiv.org                            |                         |   3512 + arxiv.org                            | spn2-cdx-lookup-failure |   2319 + arxiv.org                            | success                 |   1177 + assets.researchsquare.com            |                         |    571 + assets.researchsquare.com            | spn2-cdx-lookup-failure |    322 + assets.researchsquare.com            | success                 |    249 + brill.com                            |                         |    397 + brill.com                            | spn2-cdx-lookup-failure |    265 + cla.berkeley.edu                     |                         |    239 + classiques-garnier.com               |                         |    249 + cyberleninka.ru                      |                         |    340 + cyberleninka.ru                      | spn2-cdx-lookup-failure |    244 + dergipark.org.tr                     |                         |    468 + dergipark.org.tr                     | spn2-cdx-lookup-failure |    333 + dl.acm.org                           |                         |    592 + dl.acm.org                           | spn2-cdx-lookup-failure |    470 + doi.ala.org.au                       |                         |    288 + doi.ala.org.au                       | spn2-cdx-lookup-failure |    220 + doi.org                              |                         |   1107 + doi.org                              | terminal-bad-status     |    679 + doi.org                              | spn2-cdx-lookup-failure |    415 + downloads.hindawi.com                |                         |    279 + downloads.hindawi.com                | success                 |    267 + edbs.uomustansiriyah.edu.iq          |                         |    294 + edbs.uomustansiriyah.edu.iq          | spn2-cdx-lookup-failure |    209 + elibrary.kdpu.edu.ua                 |                         |    320 + elibrary.kdpu.edu.ua                 | spn2-cdx-lookup-failure |    233 + elibrary.ru                          |                         |    722 + elibrary.ru                          | spn2-cdx-lookup-failure |    505 + europepmc.org                        |                         |    986 + europepmc.org                        | spn2-cdx-lookup-failure |    681 + europepmc.org                        | success                 |    291 + figshare.com                         |                         |    377 + figshare.com                         | spn2-cdx-lookup-failure |    328 + fjfsdata01prod.blob.core.windows.net |                         |    255 + fjfsdata01prod.blob.core.windows.net | spn2-cdx-lookup-failure |    216 + hammer.purdue.edu                    |                         |    224 + ieeexplore.ieee.org                  |                         |   3904 + ieeexplore.ieee.org                  | spn2-cdx-lookup-failure |   2654 + ieeexplore.ieee.org                  | gateway-timeout         |    792 + ieeexplore.ieee.org                  | spn2-backoff            |    419 + journals.eco-vector.com              |                         |    428 + journals.eco-vector.com              | spn2-cdx-lookup-failure |    306 + journals.lww.com                     |                         |    727 + journals.lww.com                     | spn2-cdx-lookup-failure |    622 + journals.openedition.org             |                         |    806 + journals.openedition.org             | spn2-cdx-lookup-failure |    554 + journals.plos.org                    |                         |    348 + journals.plos.org                    | spn2-cdx-lookup-failure |    244 + kiss.kstudy.com                      |                         |    226 + kluwerlawonline.com                  |                         |    723 + kluwerlawonline.com                  | spn2-cdx-lookup-failure |    489 + kluwerlawonline.com                  | link-loop               |    203 + linkinghub.elsevier.com              |                         |    401 + linkinghub.elsevier.com              | spn2-backoff            |    342 + mdpi-res.com                         |                         |   1463 + mdpi-res.com                         | success                 |   1337 + muse.jhu.edu                         |                         |    346 + muse.jhu.edu                         | spn2-cdx-lookup-failure |    253 + onepetro.org                         |                         |    363 + onepetro.org                         | spn2-cdx-lookup-failure |    284 + online.ucpress.edu                   |                         |   1620 + online.ucpress.edu                   | spn2-cdx-lookup-failure |   1511 + onlinelibrary.wiley.com              |                         |   2913 + onlinelibrary.wiley.com              | spn2-cdx-lookup-failure |   2109 + onlinelibrary.wiley.com              | terminal-bad-status     |    787 + opendata.uni-halle.de                |                         |    519 + opendata.uni-halle.de                | spn2-cdx-lookup-failure |    343 + osf.io                               |                         |   1554 + osf.io                               | spn2-cdx-lookup-failure |   1350 + papers.ssrn.com                      |                         |   2207 + papers.ssrn.com                      | spn2-cdx-lookup-failure |   1727 + papers.ssrn.com                      | link-loop               |    457 + psycharchives.org                    |                         |    384 + psycharchives.org                    | spn2-cdx-lookup-failure |    283 + publons.com                          |                         |    493 + publons.com                          | spn2-cdx-lookup-failure |    348 + pubs.acs.org                         |                         |   1240 + pubs.acs.org                         | spn2-cdx-lookup-failure |    881 + pubs.acs.org                         | terminal-bad-status     |    298 + pubs.rsc.org                         |                         |    603 + pubs.rsc.org                         | spn2-cdx-lookup-failure |    460 + repositories.lib.utexas.edu          |                         |   1861 + repositories.lib.utexas.edu          | spn2-cdx-lookup-failure |   1288 + repositories.lib.utexas.edu          | terminal-bad-status     |    523 + s3-eu-west-1.amazonaws.com           |                         |    216 + sage.figshare.com                    |                         |    374 + sage.figshare.com                    | spn2-cdx-lookup-failure |    309 + scholar.dkyobobook.co.kr             |                         |    220 + scholarworks.gsu.edu                 |                         |    749 + scholarworks.gsu.edu                 | spn2-cdx-lookup-failure |    577 + tandf.figshare.com                   |                         |    214 + www.atlantis-press.com               |                         |    338 + www.atlantis-press.com               | spn2-cdx-lookup-failure |    214 + www.cairn.info                       |                         |    782 + www.cairn.info                       | spn2-cdx-lookup-failure |    541 + www.cambridge.org                    |                         |   2325 + www.cambridge.org                    | spn2-cdx-lookup-failure |   1787 + www.cambridge.org                    | no-pdf-link             |    300 + www.cell.com                         |                         |    213 + www.concrete.org                     |                         |    476 + www.concrete.org                     | spn2-cdx-lookup-failure |    340 + www.dbpia.co.kr                      |                         |    375 + www.dbpia.co.kr                      | spn2-cdx-lookup-failure |    275 + www.degruyter.com                    |                         |   3849 + www.degruyter.com                    | spn2-cdx-lookup-failure |   2969 + www.degruyter.com                    | no-pdf-link             |    712 + www.dib.ie                           |                         |   1100 + www.dib.ie                           | spn2-cdx-lookup-failure |   1038 + www.e-periodica.ch                   |                         |    821 + www.e-periodica.ch                   | spn2-cdx-lookup-failure |    620 + www.e-periodica.ch                   | no-pdf-link             |    201 + www.elibrary.ru                      |                         |    401 + www.elibrary.ru                      | spn2-cdx-lookup-failure |    281 + www.emerald.com                      |                         |    390 + www.emerald.com                      | spn2-cdx-lookup-failure |    275 + www.eurekaselect.com                 |                         |    275 + www.frontiersin.org                  |                         |   1266 + www.frontiersin.org                  | spn2-cdx-lookup-failure |   1025 + www.hanspub.org                      |                         |    229 + www.hindawi.com                      |                         |    604 + www.hindawi.com                      | spn2-cdx-lookup-failure |    594 + www.inderscience.com                 |                         |    201 + www.jstage.jst.go.jp                 |                         |   1094 + www.jstage.jst.go.jp                 | spn2-cdx-lookup-failure |    807 + www.jstage.jst.go.jp                 | success                 |    206 + www.mdpi.com                         |                         |   4340 + www.mdpi.com                         | spn2-cdx-lookup-failure |   4258 + www.nomos-elibrary.de                |                         |   2749 + www.nomos-elibrary.de                | spn2-cdx-lookup-failure |   1909 + www.nomos-elibrary.de                | redirect-loop           |    819 + www.osti.gov                         |                         |    275 + www.oxfordhandbooks.com              |                         |    248 + www.oxfordhandbooks.com              | spn2-cdx-lookup-failure |    224 + www.pdcnet.org                       |                         |    217 + www.researchsquare.com               |                         |    483 + www.researchsquare.com               | spn2-cdx-lookup-failure |    317 + www.scielo.br                        |                         |    319 + www.scielo.br                        | spn2-cdx-lookup-failure |    222 + www.sciencedirect.com                |                         |   3384 + www.sciencedirect.com                | spn2-cdx-lookup-failure |   3267 + www.spiedigitallibrary.org           |                         |    441 + www.spiedigitallibrary.org           | spn2-cdx-lookup-failure |    327 + www.tandfonline.com                  |                         |   2401 + www.tandfonline.com                  | spn2-cdx-lookup-failure |   1552 + www.tandfonline.com                  | no-pdf-link             |    303 + www.tandfonline.com                  | blocked-cookie          |    250 + www.taylorfrancis.com                |                         |   1232 + www.taylorfrancis.com                | spn2-cdx-lookup-failure |    908 + www.thieme-connect.de                |                         |    520 + www.thieme-connect.de                | spn2-cdx-lookup-failure |    366 + www.worldscientific.com              |                         |    383 + www.worldscientific.com              | spn2-cdx-lookup-failure |    276 + zenodo.org                           |                         |  10625 + zenodo.org                           | spn2-cdx-lookup-failure |   7777 + zenodo.org                           | success                 |   1574 + zenodo.org                           | no-pdf-link             |   1160 + zivahub.uct.ac.za                    |                         |   3428 + zivahub.uct.ac.za                    | spn2-cdx-lookup-failure |   2845 + zivahub.uct.ac.za                    | no-pdf-link             |    583 +                                      |                         | 130491 +                                      | spn2-cdx-lookup-failure |  95169 +                                      | success                 |  13354 +                                      | no-pdf-link             |   9621 +                                      | terminal-bad-status     |   3385 +                                      | spn2-backoff            |   2396 +                                      | redirect-loop           |   2216 +                                      | link-loop               |   1850 +                                      | gateway-timeout         |   1061 +                                      | spn2-error:blocked-url  |    428 +                                      | blocked-cookie          |    415 +                                      | spn2-error              |    246 +(182 rows) + +---- + +The overwhelming thing is `spn2-cdx-lookup-failure`. Should check in after a +week or two, when crawling and retries are running smoothly, and see what +things look like then. diff --git a/sql/stats/2022-05-11_crawl_changelog.txt b/sql/stats/2022-05-11_crawl_changelog.txt new file mode 100644 index 0000000..8d98217 --- /dev/null +++ b/sql/stats/2022-05-11_crawl_changelog.txt @@ -0,0 +1,410 @@ +                             domain                              |         status          | count   +-----------------------------------------------------------------+-------------------------+-------- + academic.oup.com                                                |                         |   2210 + academic.oup.com                                                | no-pdf-link             |   1350 + academic.oup.com                                                | bad-redirect            |    510 + academiccommons.columbia.edu                                    |                         |    379 + academiccommons.columbia.edu                                    | success                 |    339 + aip.scitation.org                                               |                         |    762 + aip.scitation.org                                               | terminal-bad-status     |    430 + apps.crossref.org                                               |                         |   9894 + apps.crossref.org                                               | no-pdf-link             |   9886 + apps.euskadi.eus                                                |                         |    242 + apps.euskadi.eus                                                | no-pdf-link             |    240 + arxiv.org                                                       |                         |  44889 + arxiv.org                                                       | success                 |  28781 + arxiv.org                                                       | spn2-backoff            |   7975 + arxiv.org                                                       | terminal-bad-status     |   4508 + arxiv.org                                                       | spn2-cdx-lookup-failure |   2010 + arxiv.org                                                       | redirect-loop           |    619 + arxiv.org                                                       | no-pdf-link             |    242 + arxiv.org                                                       | spn2-error              |    236 + asa.scitation.org                                               |                         |    356 + asa.scitation.org                                               | terminal-bad-status     |    299 + asmedigitalcollection.asme.org                                  |                         |    240 + assets.cureus.com                                               |                         |    336 + assets.cureus.com                                               | success                 |    335 + assets.researchsquare.com                                       |                         |   1042 + assets.researchsquare.com                                       | success                 |    993 + av.tib.eu                                                       |                         |    205 + av.tib.eu                                                       | no-pdf-link             |    203 + bibliographie.uni-tuebingen.de                                  |                         |    213 + bibliographie.uni-tuebingen.de                                  | no-pdf-link             |    211 + biorxiv.org                                                     | redirect-loop           |    217 + biorxiv.org                                                     |                         |    217 + books.openedition.org                                           |                         |    691 + books.openedition.org                                           | no-pdf-link             |    687 + boris.unibe.ch                                                  |                         |    525 + boris.unibe.ch                                                  | success                 |    466 + bridges.monash.edu                                              |                         |    663 + bridges.monash.edu                                              | no-pdf-link             |    647 + brill.com                                                       |                         |    860 + brill.com                                                       | success                 |    434 + chemrxiv.org                                                    |                         |    201 + classiques-garnier.com                                          |                         |    242 + content.iospress.com                                            |                         |    325 + content.iospress.com                                            | link-loop               |    247 + core.tdar.org                                                   |                         |    216 + core.tdar.org                                                   | no-pdf-link             |    211 + cyberleninka.ru                                                 |                         |    646 + cyberleninka.ru                                                 | success                 |    620 + d197for5662m48.cloudfront.net                                   |                         |    263 + d197for5662m48.cloudfront.net                                   | success                 |    262 + dergipark.org.tr                                                |                         |    891 + dergipark.org.tr                                                | success                 |    526 + dergipark.org.tr                                                | no-pdf-link             |    261 + digi.ub.uni-heidelberg.de                                       |                         |    427 + digi.ub.uni-heidelberg.de                                       | no-pdf-link             |    427 + direct.mit.edu                                                  |                         |    268 + direct.mit.edu                                                  | no-pdf-link             |    208 + dl.acm.org                                                      |                         |   1719 + dl.acm.org                                                      | success                 |    829 + dl.acm.org                                                      | no-pdf-link             |    546 + dl.acm.org                                                      | terminal-bad-status     |    205 + dlc.library.columbia.edu                                        |                         |    385 + dlc.library.columbia.edu                                        | terminal-bad-status     |    319 + doi.ala.org.au                                                  |                         |    724 + doi.ala.org.au                                                  | no-pdf-link             |    721 + doi.apa.org                                                     |                         |    214 + doi.org                                                         |                         |   3390 + doi.org                                                         | terminal-bad-status     |   2938 + doi.org                                                         | redirect-loop           |    233 + doi.org                                                         | spn2-wayback-error      |    208 + doi.usp.org                                                     |                         |    325 + doi.usp.org                                                     | no-pdf-link             |    324 + downloads.hindawi.com                                           |                         |   1439 + downloads.hindawi.com                                           | success                 |   1436 + du.diva-portal.org                                              |                         |    589 + du.diva-portal.org                                              | success                 |    586 + econtents.bc.unicamp.br                                         |                         |    310 + econtents.bc.unicamp.br                                         | success                 |    310 + ediss.uni-goettingen.de                                         |                         |    728 + ediss.uni-goettingen.de                                         | success                 |    425 + elibrary.kdpu.edu.ua                                            |                         |    907 + elibrary.kdpu.edu.ua                                            | bad-redirect            |    712 + elibrary.ru                                                     |                         |    925 + elibrary.ru                                                     | terminal-bad-status     |    492 + elibrary.ru                                                     | bad-redirect            |    230 + elibrary.vdi-verlag.de                                          |                         |    393 + elifesciences.org                                               |                         |    296 + elifesciences.org                                               | success                 |    276 + europepmc.org                                                   |                         |   3024 + europepmc.org                                                   | success                 |   2541 + europepmc.org                                                   | terminal-bad-status     |    463 + figshare.com                                                    |                         |    493 + figshare.com                                                    | no-pdf-link             |    440 + files.osf.io                                                    |                         |    883 + files.osf.io                                                    | success                 |    686 + fjfsdata01prod.blob.core.windows.net                            |                         |   3869 + fjfsdata01prod.blob.core.windows.net                            | success                 |   3818 + ieeexplore.ieee.org                                             |                         |  10854 + ieeexplore.ieee.org                                             | gateway-timeout         |   5495 + ieeexplore.ieee.org                                             | spn2-backoff            |   1662 + ieeexplore.ieee.org                                             | no-pdf-link             |   1417 + ieeexplore.ieee.org                                             | success                 |   1410 + ieeexplore.ieee.org                                             | redirect-loop           |    768 + iiif.crossasia.org                                              |                         |   7608 + iiif.crossasia.org                                              | no-pdf-link             |   7568 + ikee.lib.auth.gr                                                |                         |    450 + ikee.lib.auth.gr                                                | success                 |    332 + ins.journals.ekb.eg                                             |                         |    212 + iopscience.iop.org                                              |                         |    268 + jamanetwork.com                                                 |                         |    333 + journals.aps.org                                                |                         |    414 + journals.asm.org                                                |                         |    242 + journals.flvc.org                                               |                         |    245 + journals.flvc.org                                               | success                 |    242 + journals.healio.com                                             |                         |    755 + journals.healio.com                                             | terminal-bad-status     |    668 + journals.lincoln.ac.nz                                          |                         |    244 + journals.lincoln.ac.nz                                          | success                 |    239 + journals.lww.com                                                |                         |   1772 + journals.lww.com                                                | link-loop               |   1425 + journals.lww.com                                                | spn2-backoff            |    209 + journals.openedition.org                                        |                         |   1192 + journals.openedition.org                                        | redirect-loop           |    467 + journals.openedition.org                                        | success                 |    451 + journals.plos.org                                               |                         |    771 + journals.plos.org                                               | success                 |    750 + journals.ub.uni-heidelberg.de                                   |                         |    787 + journals.ub.uni-heidelberg.de                                   | success                 |    741 + kazanmedjournal.ru                                              |                         |    240 + kazanmedjournal.ru                                              | success                 |    231 + kiss.kstudy.com                                                 |                         |    219 + kiss.kstudy.com                                                 | no-pdf-link             |    218 + kluwerlawonline.com                                             |                         |    444 + kluwerlawonline.com                                             | link-loop               |    402 + libraetd.lib.virginia.edu                                       |                         |    362 + libraetd.lib.virginia.edu                                       | no-pdf-link             |    361 + link.springer.com                                               |                         |    305 + linkinghub.elsevier.com                                         |                         |    568 + linkinghub.elsevier.com                                         | spn2-backoff            |    545 + ltu-figshare-repo.s3.aarnet.edu.au                              |                         |    269 + ltu-figshare-repo.s3.aarnet.edu.au                              | success                 |    268 + mausamjournal.imd.gov.in                                        |                         |    202 + mdpi-res.com                                                    |                         |   8892 + mdpi-res.com                                                    | success                 |   8863 + mededpublish.org                                                |                         |   1900 + mededpublish.org                                                | no-pdf-link             |   1900 + meetingorganizer.copernicus.org                                 |                         |    276 + meetingorganizer.copernicus.org                                 | no-pdf-link             |    271 + muse.jhu.edu                                                    |                         |   1047 + muse.jhu.edu                                                    | terminal-bad-status     |    755 + muse.jhu.edu                                                    | link-loop               |    203 + online.ucpress.edu                                              |                         |    358 + online.ucpress.edu                                              | link-loop               |    212 + onlinelibrary.wiley.com                                         |                         |   5813 + onlinelibrary.wiley.com                                         | terminal-bad-status     |   4587 + onlinelibrary.wiley.com                                         | spn2-wayback-error      |    614 + onlinelibrary.wiley.com                                         | blocked-cookie          |    381 + open.library.ubc.ca                                             |                         |    206 + opendata.uni-halle.de                                           |                         |   1768 + opendata.uni-halle.de                                           | success                 |   1215 + opendata.uni-halle.de                                           | wrong-mimetype          |    260 + opendata2.uni-halle.de                                          |                         |    206 + opg.optica.org                                                  |                         |    205 + osf.io                                                          |                         |   2949 + osf.io                                                          | no-pdf-link             |   2404 + osf.io                                                          | spn2-backoff            |    299 + papers.ssrn.com                                                 |                         |   3962 + papers.ssrn.com                                                 | link-loop               |   3800 + peerj.com                                                       |                         |    273 + preprints.jmir.org                                              |                         |    275 + preprints.jmir.org                                              | cdx-error               |    255 + publikationen.bibliothek.kit.edu                                |                         |    213 + publons.com                                                     |                         |    593 + publons.com                                                     | no-pdf-link             |    590 + pubs.acs.org                                                    |                         |   2288 + pubs.acs.org                                                    | terminal-bad-status     |   1841 + pubs.acs.org                                                    | spn2-wayback-error      |    210 + pubs.rsc.org                                                    |                         |   1698 + pubs.rsc.org                                                    | bad-redirect            |    811 + pubs.rsc.org                                                    | link-loop               |    352 + pubs.rsc.org                                                    | success                 |    307 + radiopaedia.org                                                 |                         |    220 + read.dukeupress.edu                                             |                         |    303 + repositories.lib.utexas.edu                                     |                         |   1570 + repositories.lib.utexas.edu                                     | bad-redirect            |    513 + repositories.lib.utexas.edu                                     | spn2-backoff            |    383 + repositories.lib.utexas.edu                                     | gateway-timeout         |    379 + repositories.lib.utexas.edu                                     | terminal-bad-status     |    282 + repository.uj.ac.za                                             |                         |    489 + repository.uj.ac.za                                             | no-pdf-link             |    365 + repository.unsworks.unsw.edu.au                                 |                         |    397 + repository.urosario.edu.co                                      |                         |   2429 + repository.urosario.edu.co                                      | success                 |   1648 + repository.urosario.edu.co                                      | bad-redirect            |    613 + rex.libraries.wsu.edu                                           | no-pdf-link             |    241 + rex.libraries.wsu.edu                                           |                         |    241 + rsdjournal.org                                                  |                         |    208 + rsdjournal.org                                                  | success                 |    208 + s3-ap-southeast-2.amazonaws.com                                 |                         |    282 + s3-ap-southeast-2.amazonaws.com                                 | success                 |    277 + s3-eu-west-1.amazonaws.com                                      |                         |   4615 + s3-eu-west-1.amazonaws.com                                      | success                 |   4593 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com |                         |    240 + s3-euw1-ap-pe-df-pch-content-store-p.s3.eu-west-1.amazonaws.com | success                 |    237 + sage.figshare.com                                               |                         |    415 + sage.figshare.com                                               | no-pdf-link             |    385 + scholar.dkyobobook.co.kr                                        |                         |    512 + scholar.dkyobobook.co.kr                                        | no-pdf-link             |    509 + scholarlypublishingcollective.org                               |                         |    287 + scholarworks.gsu.edu                                            |                         |   1132 + scholarworks.gsu.edu                                            | success                 |   1000 + scholarworks.iupui.edu                                          |                         |    205 + scholarworks.umass.edu                                          |                         |    417 + scholarworks.umass.edu                                          | success                 |    400 + sciencescholar.us                                               |                         |    404 + secure.jbs.elsevierhealth.com                                   |                         |    727 + secure.jbs.elsevierhealth.com                                   | terminal-bad-status     |    722 + tandf.figshare.com                                              |                         |    354 + tandf.figshare.com                                              | no-pdf-link             |    342 + unsworks.unsw.edu.au                                            |                         |    408 + unsworks.unsw.edu.au                                            | spn2-cdx-lookup-failure |    342 + valep.vc.univie.ac.at                                           | no-pdf-link             |    737 + valep.vc.univie.ac.at                                           |                         |    737 + watermark.silverchair.com                                       |                         |   1604 + watermark.silverchair.com                                       | success                 |   1598 + wayf.switch.ch                                                  |                         |    215 + wayf.switch.ch                                                  | no-pdf-link             |    213 + www.ahajournals.org                                             |                         |    438 + www.ahajournals.org                                             | no-pdf-link             |    306 + www.ahbps.org                                                   |                         |    316 + www.ahbps.org                                                   | success                 |    312 + www.atenaeditora.com.br                                         |                         |    390 + www.atenaeditora.com.br                                         | terminal-bad-status     |    333 + www.atlantis-press.com                                          |                         |    914 + www.atlantis-press.com                                          | success                 |    901 + www.atsjournals.org                                             |                         |   1245 + www.atsjournals.org                                             | success                 |   1189 + www.biorxiv.org                                                 |                         |    712 + www.biorxiv.org                                                 | success                 |    670 + www.bloomsburycollections.com                                   |                         |    982 + www.bloomsburycollections.com                                   | terminal-bad-status     |    566 + www.cahiers-clsl.ch                                             |                         |    305 + www.cahiers-clsl.ch                                             | success                 |    298 + www.cairn.info                                                  |                         |   1799 + www.cairn.info                                                  | no-pdf-link             |    662 + www.cairn.info                                                  | link-loop               |    487 + www.cairn.info                                                  | success                 |    355 + www.cairn.info                                                  | terminal-bad-status     |    267 + www.cambridge.org                                               |                         |   3258 + www.cambridge.org                                               | no-pdf-link             |   1682 + www.cambridge.org                                               | success                 |    682 + www.cambridge.org                                               | bad-redirect            |    404 + www.cambridge.org                                               | link-loop               |    302 + www.dbpia.co.kr                                                 |                         |    763 + www.dbpia.co.kr                                                 | no-pdf-link             |    443 + www.dbpia.co.kr                                                 | redirect-loop           |    287 + www.degruyter.com                                               |                         |  12655 + www.degruyter.com                                               | no-pdf-link             |   9112 + www.degruyter.com                                               | success                 |   2898 + www.degruyter.com                                               | spn2-backoff            |    507 + www.dib.ie                                                      |                         |   1381 + www.dib.ie                                                      | no-pdf-link             |   1378 + www.dovepress.com                                               |                         |    231 + www.dovepress.com                                               | success                 |    216 + www.e-manuscripta.ch                                            |                         |    767 + www.e-manuscripta.ch                                            | success                 |    399 + www.e-periodica.ch                                              |                         |   1406 + www.e-periodica.ch                                              | no-pdf-link             |   1402 + www.e-rara.ch                                                   | no-pdf-link             |    251 + www.e-rara.ch                                                   |                         |    251 + www.editoracientifica.org                                       | no-pdf-link             |    205 + www.editoracientifica.org                                       |                         |    205 + www.elgaronline.com                                             |                         |    427 + www.elibrary.ru                                                 |                         |    616 + www.elibrary.ru                                                 | terminal-bad-status     |    364 + www.elibrary.ru                                                 | no-pdf-link             |    216 + www.emerald.com                                                 |                         |    862 + www.emerald.com                                                 | no-pdf-link             |    724 + www.endocrine-abstracts.org                                     |                         |   1907 + www.endocrine-abstracts.org                                     | no-pdf-link             |   1905 + www.eurekaselect.com                                            |                         |    285 + www.eurekaselect.com                                            | link-loop               |    246 + www.even3.com.br                                                |                         |    233 + www.frontiersin.org                                             |                         |    585 + www.frontiersin.org                                             | spn2-backoff            |    436 + www.humankineticslibrary.com                                    | no-pdf-link             |    207 + www.humankineticslibrary.com                                    |                         |    207 + www.igi-global.com                                              |                         |   1600 + www.igi-global.com                                              | no-pdf-link             |   1199 + www.igi-global.com                                              | bad-redirect            |    258 + www.inderscience.com                                            |                         |    385 + www.inderscience.com                                            | no-pdf-link             |    365 + www.inderscienceonline.com                                      |                         |    202 + www.ingentaconnect.com                                          |                         |    450 + www.ingentaconnect.com                                          | no-pdf-link             |    260 + www.jstage.jst.go.jp                                            |                         |   1248 + www.jstage.jst.go.jp                                            | success                 |    870 + www.karger.com                                                  |                         |    313 + www.liebertpub.com                                              |                         |    271 + www.liebertpub.com                                              | no-pdf-link             |    241 + www.nicecjournal.co.uk                                          |                         |    274 + www.nicecjournal.co.uk                                          | success                 |    274 + www.nomos-elibrary.de                                           |                         |   1771 + www.nomos-elibrary.de                                           | no-pdf-link             |    788 + www.nomos-elibrary.de                                           | redirect-loop           |    506 + www.nomos-elibrary.de                                           | bad-redirect            |    207 + www.osti.gov                                                    |                         |    381 + www.osti.gov                                                    | link-loop               |    326 + www.persee.fr                                                   |                         |    277 + www.preprints.org                                               |                         |    225 + www.preprints.org                                               | success                 |    225 + www.protocols.io                                                |                         |    770 + www.protocols.io                                                | success                 |    485 + www.repository.cam.ac.uk                                        |                         |    510 + www.repository.cam.ac.uk                                        | bad-redirect            |    213 + www.research-collection.ethz.ch                                 |                         |    416 + www.research-collection.ethz.ch                                 | bad-redirect            |    249 + www.researchsquare.com                                          |                         |   1121 + www.researchsquare.com                                          | bad-redirect            |    985 + www.scielo.br                                                   |                         |    828 + www.scielo.br                                                   | success                 |    641 + www.sciencedirect.com                                           |                         |   8567 + www.sciencedirect.com                                           | terminal-bad-status     |   5773 + www.sciencedirect.com                                           | spn2-wayback-error      |   1590 + www.sciencedirect.com                                           | no-pdf-link             |    576 + www.sciencedirect.com                                           | spn2-backoff            |    479 + www.sciendo.com                                                 |                         |    257 + www.sciendo.com                                                 | success                 |    222 + www.scitepress.org                                              |                         |    381 + www.scitepress.org                                              | no-pdf-link             |    377 + www.spiedigitallibrary.org                                      |                         |   1061 + www.spiedigitallibrary.org                                      | bad-redirect            |    571 + www.spiedigitallibrary.org                                      | gateway-timeout         |    233 + www.tandfonline.com                                             |                         |   4934 + www.tandfonline.com                                             | no-pdf-link             |   2088 + www.tandfonline.com                                             | terminal-bad-status     |   1282 + www.tandfonline.com                                             | blocked-cookie          |    757 + www.tandfonline.com                                             | redirect-loop           |    488 + www.tandfonline.com                                             | spn2-wayback-error      |    202 + www.taylorfrancis.com                                           |                         |   3979 + www.taylorfrancis.com                                           | link-loop               |   1928 + www.taylorfrancis.com                                           | no-pdf-link             |   1840 + www.techniques-ingenieur.fr                                     |                         |    354 + www.techniques-ingenieur.fr                                     | no-pdf-link             |    353 + www.thieme-connect.de                                           |                         |   1987 + www.thieme-connect.de                                           | no-pdf-link             |    949 + www.thieme-connect.de                                           | link-loop               |    869 + www.tib.eu                                                      | no-pdf-link             |    315 + www.tib.eu                                                      |                         |    315 + www.un-ilibrary.org                                             | no-pdf-link             |    352 + www.un-ilibrary.org                                             |                         |    352 + www.worldscientific.com                                         |                         |    668 + www.worldscientific.com                                         | no-pdf-link             |    629 + www.zora.uzh.ch                                                 |                         |    318 + zenodo.org                                                      |                         |  46585 + zenodo.org                                                      | no-pdf-link             |  29519 + zenodo.org                                                      | success                 |  14768 + zenodo.org                                                      | terminal-bad-status     |    810 + zenodo.org                                                      | wrong-mimetype          |    691 + zenodo.org                                                      | spn2-cdx-lookup-failure |    395 + zenodo.org                                                      | spn2-backoff            |    294 + zivahub.uct.ac.za                                               |                         |   1909 + zivahub.uct.ac.za                                               | no-pdf-link             |   1880 + zop.zb.uzh.ch                                                   |                         |    228 + zop.zb.uzh.ch                                                   | success                 |    217 +                                                                 |                         | 365582 +                                                                 | success                 | 141497  38.7% +                                                                 | no-pdf-link             | 120852  33.0% +                                                                 | terminal-bad-status     |  31900   8.7% +                                                                 | spn2-backoff            |  16979   4.6% +                                                                 | link-loop               |  13624   3.7% +                                                                 | bad-redirect            |   8736 +                                                                 | redirect-loop           |   7405 +                                                                 | gateway-timeout         |   6997 +                                                                 | spn2-cdx-lookup-failure |   5146 +                                                                 | spn2-wayback-error      |   3708 +                                                                 | wrong-mimetype          |   2158 +                                                                 | blocked-cookie          |   1942 +                                                                 | spn2-error:blocked-url  |   1733 +                                                                 | wayback-error           |   1063 +                                                                 | spn2-error              |    647 +                                                                 | spn2-error:500          |    265 +                                                                 | cdx-error               |    257 +(383 rows) + +---- + +365k in 7 days is about 52k a day, which is about expected. Around 5-7% need +retries. + +important changes: +- biorxiv.org: needs fix and then retries +- academic.oup.com: should probably skip +- apps.crossref.org: need to handle this in code +- arxiv.org: should retry `terminal-bad-status` on PDFs; should also add support to extract PDF link from `/abs/` +- doi.org: investigate redirect-loop and terminal-bad-status +- osf.io: not getting PDFs +- papers.ssrn.com: why are these attempted? +- publons.com: not getting PDFs; special case these? +- www.sciencedirect.com: not working at all? + +smaller: +- bridges.monash.edu: fix, then retry? +- dl.acm.org: some broader retries? +- figshare.com: still some attempts, but almost all no-pdf-link +- onlinelibrary.wiley.com: getting blocked broadly? +- www.endocrine-abstracts.org: HTML content? +- www.igi-global.com: no-pdf-link diff --git a/sql/stats/2022-09-06_stats.txt b/sql/stats/2022-09-06_stats.txt new file mode 100644 index 0000000..be2b30c --- /dev/null +++ b/sql/stats/2022-09-06_stats.txt @@ -0,0 +1,438 @@ + +## SQL Table Sizes + +    SELECT +        table_name, +        pg_size_pretty(table_size) AS table_size, +        pg_size_pretty(indexes_size) AS indexes_size, +        pg_size_pretty(total_size) AS total_size +      FROM ( +          SELECT +              table_name, +              pg_table_size(table_name) AS table_size, +              pg_indexes_size(table_name) AS indexes_size, +              pg_total_relation_size(table_name) AS total_size +          FROM ( +              SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name +              FROM information_schema.tables +              WHERE table_schema = 'public' +          ) AS all_tables +          ORDER BY total_size DESC +      ) AS pretty_sizes; + + +                 table_name             | table_size | indexes_size | total_size  +    ------------------------------------+------------+--------------+------------ +     "public"."crossref"                | 459 GB     | 10 GB        | 470 GB +     "public"."grobid"                  | 98 GB      | 13 GB        | 112 GB +     "public"."cdx"                     | 62 GB      | 44 GB        | 106 GB +     "public"."ingest_request"          | 51 GB      | 50 GB        | 101 GB +     "public"."ingest_file_result"      | 44 GB      | 52 GB        | 96 GB +     "public"."file_meta"               | 39 GB      | 39 GB        | 78 GB +     "public"."grobid_shadow"           | 67 GB      | 5455 MB      | 73 GB +     "public"."pdf_meta"                | 23 GB      | 7466 MB      | 31 GB +     "public"."grobid_refs"             | 27 GB      | 3089 MB      | 30 GB +     "public"."fatcat_file"             | 13 GB      | 7314 MB      | 20 GB +     "public"."shadow"                  | 9517 MB    | 8026 MB      | 17 GB +     "public"."html_meta"               | 7469 MB    | 66 MB        | 7535 MB +     "public"."petabox"                 | 403 MB     | 461 MB       | 864 MB +     "public"."pdftrio"                 | 550 MB     | 297 MB       | 847 MB +     "public"."ingest_fileset_platform" | 8192 bytes | 16 kB        | 24 kB +     "public"."crossref_with_refs"      | 0 bytes    | 0 bytes      | 0 bytes +    (16 rows) + + +## File Metadata + +Counts and total file size: + +    SELECT COUNT(*) as total_count, SUM(size_bytes) as total_size FROM file_meta; + +     total_count |   total_size +    -------------+----------------- +       198175106 | 282695671015403 +    (1 row) + +    198 million files, 282 TBytes. + +Top mimetypes: + +    SELECT mimetype, COUNT(*) FROM file_meta GROUP BY mimetype ORDER BY COUNT DESC LIMIT 30; + +                                     mimetype                                  |   count +    ---------------------------------------------------------------------------+----------- +     application/pdf                                                           | 197021437 +     text/html                                                                 |    830331 +     application/octet-stream                                                  |    186669 +     application/xml                                                           |     42170 +     application/xhtml+xml                                                     |     38207 +     text/plain                                                                |     16471 +     application/jats+xml                                                      |     10385 +     application/gzip                                                          |      6681 +                                                                               |      6032 +     application/postscript                                                    |      4916 +     image/jpeg                                                                |      4522 +     application/vnd.ms-powerpoint                                             |      1672 +     application/msword                                                        |       946 +     application/x-bzip2                                                       |       891 +     image/png                                                                 |       659 +     application/vnd.openxmlformats-officedocument.wordprocessingml.document   |       440 +     application/x-dosexec                                                     |       404 +     image/gif                                                                 |       395 +     application/vnd.openxmlformats-officedocument.spreadsheetml.sheet         |       382 +     application/x-compress                                                    |       274 +     video/mp4                                                                 |       218 +     application/zip                                                           |       131 +     application/CDFV2-unknown                                                 |        99 +     application/mac-binhex40                                                  |        79 +     application/zlib                                                          |        68 +     text/x-tex                                                                |        44 +     application/vnd.openxmlformats-officedocument.presentationml.presentation |        39 +     text/x-php                                                                |        37 +     image/g3fax                                                               |        35 +     text/rtf                                                                  |        33 +    (30 rows) + +Missing full metadata: + +    SELECT COUNT(*) FROM file_meta WHERE sha256hex IS NULL; + +     count  +    ------- +     12800 +    (1 row) + +## CDX + +Total and unique-by-sha1 counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM cdx; + +     unique_sha1 |   total +    -------------+----------- +       137283420 | 172140506 +    (1 row) + +mimetype counts: + +    SELECT mimetype, COUNT(*) FROM cdx GROUP BY mimetype ORDER BY COUNT(*) DESC LIMIT 30; + +              mimetype          |   count +    ----------------------------+----------- +     application/pdf            | 157465613 +     warc/revisit               |  11337336 +     text/html                  |   1137208 +     application/octet-stream   |    950380 +     text/xml                   |    528965 +     unk                        |    253294 +     application/postscript     |     81130 +     application/save           |     81069 +     binary/octet-stream        |     68942 +     application/x-download     |     42717 +     application/download       |     40628 +     image/pdf                  |     39904 +     text/plain                 |     36445 +     application/force-download |     24148 +     multipart/form-data        |     10972 +     application                |      5409 +     application/x-octetstream  |      5192 +     application/x-msdownload   |      3854 +     .pdf                       |      3518 +     application/x-pdf          |      3061 +     application/octet          |      1792 +     pdf                        |      1757 +     application/binary         |      1399 +     file                       |      1373 +     file/unknown               |      1345 +     application/pdf'           |      1196 +     application/octetstream    |      1087 +     application/unknown        |      1005 +     0                          |       773 +     text/pdf                   |       729 +    (30 rows) + +## GROBID + +Counts: + +    SELECT COUNT(*) AS total_files FROM grobid; + +     total_files +    ------------- +       129001717 +    (1 row) + +Status? + +    SELECT status_code, COUNT(*) FROM grobid GROUP BY status_code ORDER BY COUNT DESC LIMIT 25; + +     status_code |   count +    -------------+----------- +             200 | 120797098 +             500 |   8198783 +              -4 |      5802 +             503 |        36 +    (4 rows) + +What version used? + +    SELECT grobid_version, COUNT(*) FROM grobid WHERE status_code = 200 GROUP BY grobid_version ORDER BY COUNT DESC LIMIT 25; + +        grobid_version    |  count +    ----------------------+---------- +     0.7.0-131-gdd0251d9f | 60469462 +     0.5.5-fatcat         | 47472904 +                          | 12665498 +     0.7.0-104-gbeebd9a6b |   189243 +    (4 rows) + + +## Petabox + +Counts: + +    SELECT COUNT(DISTINCT sha1hex) as unique_sha1, COUNT(*) as total FROM petabox; + +     unique_sha1 |  total +    -------------+--------- +         2868825 | 2887834 +    (1 row) + +## Ingests + +Requests by source: + +    SELECT ingest_type, link_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source ORDER BY COUNT DESC LIMIT 25; + +     ingest_type |   link_source   |  count +    -------------+-----------------+---------- +     pdf         | oai             | 51185088 +     pdf         | unpaywall       | 43932525 +     pdf         | doi             | 43852308 +     pdf         | mag             | 43701948 +     pdf         | doaj            |  6534341 +     html        | doaj            |  3987669 +     pdf         | arxiv           |  2784589 +     pdf         | pmc             |  2439181 +     pdf         | dblp            |   631716 +     html        | doi             |   126699 +     xml         | doaj            |    23066 +     pdf         | cnki_covid19    |     2034 +     pdf         | spn             |     1026 +     pdf         | wanfang_covid19 |      975 +     html        | spn             |       65 +     xml         | spn             |        2 +     xml         | doi             |        1 +    (17 rows) + +    SELECT ingest_type, link_source, ingest_request_source, COUNT(*) FROM ingest_request GROUP BY ingest_type, link_source, ingest_request_source ORDER BY COUNT DESC LIMIT 35; + + +     ingest_type |   link_source   |  ingest_request_source  |  count    +    -------------+-----------------+-------------------------+---------- +     pdf         | oai             | metha-bulk              | 51185088 +     pdf         | unpaywall       | unpaywall               | 43932525 +     pdf         | mag             | mag-corpus              | 43701948 +     pdf         | doi             | fatcat-changelog        | 24742500 +     pdf         | doi             | fatcat-ingest           | 15592121 +     pdf         | doaj            | doaj                    |  6484737 +     html        | doaj            | doaj                    |  3987468 +     pdf         | doi             | fatcat-ingest-container |  3515873 +     pdf         | pmc             | fatcat-ingest-container |  2028825 +     pdf         | arxiv           | fatcat-ingest           |  1984766 +     pdf         | arxiv           | fatcat-changelog        |   799793 +     pdf         | dblp            | dblp                    |   631716 +     pdf         | pmc             | fatcat-ingest           |   297980 +     html        | doi             | fatcat-ingest           |   121508 +     pdf         | pmc             | fatcat-changelog        |   112376 +     pdf         | doaj            | fatcat-changelog        |    47181 +     xml         | doaj            | doaj                    |    23066 +     html        | doi             | fatcat-changelog        |     5129 +     pdf         | doaj            | fatcat-ingest           |     2423 +     pdf         | cnki_covid19    | scrape-covid19          |     2034 +     pdf         | doi             | savepapernow-web        |     1814 +     pdf         | spn             | savepapernow-web        |     1026 +     pdf         | wanfang_covid19 | scrape-covid19          |      975 +     html        | doaj            | fatcat-ingest           |      201 +     html        | spn             | savepapernow-web        |       65 +     html        | doi             | savepapernow-web        |       62 +     pdf         | arxiv           | fatcat-ingest-container |       26 +     pdf         | arxiv           | savepapernow-web        |        4 +     xml         | spn             | savepapernow-web        |        2 +     xml         | doi             | savepapernow-web        |        1 +    (30 rows) + +Uncrawled requests by source: + +    # TODO: verify this? +    SELECT ingest_request.ingest_type, ingest_request.link_source, COUNT(*) +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +    WHERE ingest_file_result.base_url IS NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY COUNT DESC LIMIT 35; + + +     ingest_type | link_source | count +    -------------+-------------+-------- +     pdf         | mag         | 167653 +     pdf         | doaj        |  81517 +     pdf         | oai         |  15282 +     html        | doaj        |   1791 +     pdf         | unpaywall   |    270 +     pdf         | doi         |     22 +    (6 rows) + +Results by source: + +    SELECT +        ingest_request.ingest_type, +        ingest_request.link_source, +        COUNT(*) as attempts, +        COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) hits,  +        ROUND(1.0 * COUNT(CASE WHEN ingest_file_result.hit THEN 1 END) / COUNT(*), 3) as fraction +    FROM ingest_request +    LEFT JOIN ingest_file_result +        ON ingest_request.base_url = ingest_file_result.base_url +        AND ingest_request.ingest_type = ingest_file_result.ingest_type +        AND ingest_file_result.ingest_type IS NOT NULL +    GROUP BY ingest_request.ingest_type, ingest_request.link_source ORDER BY attempts DESC LIMIT 35; + + +     ingest_type |   link_source   | attempts |   hits   | fraction +    -------------+-----------------+----------+----------+---------- +     pdf         | oai             | 51185088 | 16024068 |    0.313 +     pdf         | unpaywall       | 43932525 | 36045446 |    0.820 +     pdf         | doi             | 43852308 | 14956080 |    0.341 +     pdf         | mag             | 43701948 | 32768484 |    0.750 +     pdf         | doaj            |  6534341 |  4704066 |    0.720 +     html        | doaj            |  3987669 |   778165 |    0.195 +     pdf         | arxiv           |  2784589 |  2419941 |    0.869 +     pdf         | pmc             |  2439181 |  1897671 |    0.778 +     pdf         | dblp            |   631716 |   305142 |    0.483 +     html        | doi             |   126699 |    75754 |    0.598 +     xml         | doaj            |    23066 |    10381 |    0.450 +     pdf         | cnki_covid19    |     2034 |        0 |    0.000 +     pdf         | spn             |     1026 |      778 |    0.758 +     pdf         | wanfang_covid19 |      975 |      764 |    0.784 +     html        | spn             |       65 |       13 |    0.200 +     xml         | spn             |        2 |        1 |    0.500 +     xml         | doi             |        1 |        0 |    0.000 +    (17 rows) + +Ingest result by status: + +    SELECT ingest_type, status, COUNT(*) FROM ingest_file_result GROUP BY ingest_type, status ORDER BY COUNT DESC LIMIT 50; + +     ingest_type |            status             |  count +    -------------+-------------------------------+---------- +     pdf         | success                       | 94887295 +     pdf         | no-pdf-link                   | 33960080 +     pdf         | no-capture                    | 20893916 +     pdf         | terminal-bad-status           |  6973765 +     pdf         | redirect-loop                 |  5775175 +     pdf         | link-loop                     |  4095424 +     pdf         | skip-url-blocklist            |  4037518 +     pdf         | blocked-cookie                |  3508762 +     html        | wrong-scope                   |  1783694 +     pdf         | wrong-mimetype                |  1379673 +     html        | success                       |   853762 +     pdf         | gateway-timeout               |   635170 +     html        | no-capture                    |   381283 +     pdf         | wayback-content-error         |   356694 +     pdf         | cdx-error                     |   347700 +     pdf         | null-body                     |   336166 +     html        | unknown-scope                 |   321874 +     html        | html-resource-no-capture      |   294294 +     pdf         | forbidden                     |   291127 +     pdf         | not-found                     |   274343 +     pdf         | too-many-redirects            |   264494 +     component   | wrong-mimetype                |   196680 +     component   | spn2-cdx-lookup-failure       |   173615 +     component   | spn2-backoff                  |   115840 +     html        | terminal-bad-status           |   106264 +     html        | null-body                     |   100296 +     pdf         | wayback-error                 |    94748 +     html        | blocked-cookie                |    88537 +     component   | no-capture                    |    75278 +     pdf         | empty-blob                    |    61157 +     pdf         | bad-redirect                  |    58680 +     pdf         | skip-wall                     |    57751 +     pdf         | spn2-error:too-many-redirects |    52873 +     html        | spn2-backoff                  |    50577 +     pdf         | remote-server-error           |    41282 +     pdf         | invalid-host-resolution       |    38864 +     pdf         | read-timeout                  |    37071 +     pdf         | spn2-cdx-lookup-failure       |    34229 +     html        | wrong-mimetype                |    33643 +     pdf         | spn2-backoff                  |    32437 +     pdf         | petabox-error                 |    31006 +     html        | wayback-content-error         |    28034 +     component   | spn2-error                    |    27044 +     pdf         | spn2-error:unknown            |    25810 +     component   | gateway-timeout               |    25215 +     pdf         | body-too-large                |    21721 +     html        | petabox-error                 |    18313 +     html        | empty-blob                    |    14393 +     html        | redirect-loop                 |    13404 +     component   | blocked-cookie                |    12287 +    (50 rows) + +Failed ingest by terminal status code: + +    SELECT ingest_type, terminal_status_code, COUNT(*) FROM ingest_file_result WHERE hit = false GROUP BY ingest_type, terminal_status_code ORDER BY COUNT DESC LIMIT 50; + +     ingest_type | terminal_status_code |  count +    -------------+----------------------+---------- +     pdf         |                  200 | 45052391 +     pdf         |                      | 26117481 +     pdf         |                  301 |  4814786 +     html        |                  200 |  2684821 +     pdf         |                  403 |  1871088 +     pdf         |                  404 |  1254259 +     pdf         |                  302 |   898728 +     pdf         |                  503 |   867548 +     pdf         |                  401 |   851205 +     pdf         |                  429 |   741869 +     pdf         |                  400 |   624519 +     component   |                      |   456915 +     html        |                      |   442051 +     pdf         |                  500 |   283700 +     component   |                  200 |   197510 +     pdf         |                  410 |   120647 +     pdf         |                  303 |   107947 +     html        |                  404 |    80114 +     pdf         |                  420 |    26722 +     pdf         |                  502 |    19500 +     pdf         |                  409 |    15499 +     html        |                  429 |    15208 +     pdf         |                  509 |    15167 +     pdf         |                  999 |    12186 +     pdf         |                  202 |    11535 +     html        |                  301 |    10213 +     xml         |                      |    10018 +     pdf         |                  307 |     8657 +     pdf         |                  402 |     8338 +     pdf         |                  412 |     8064 +     pdf         |                  308 |     6479 +     html        |                  500 |     4746 +     xml         |                  200 |     2668 +     pdf         |                  520 |     2496 +     html        |                  302 |     2289 +     pdf         |                  521 |     2257 +     html        |                  202 |     2177 +     pdf         |                  206 |     1961 +     html        |                  403 |     1775 +     pdf         |                  504 |     1187 +     pdf         |                  421 |     1148 +     html        |                  303 |     1112 +     pdf         |                  406 |     1109 +     pdf         |                  204 |      772 +     pdf         |                  432 |      745 +     pdf         |                  405 |      633 +     html        |                  400 |      632 +     pdf         |                  426 |      515 +     pdf         |                  508 |      503 +     pdf         |                  505 |      469 +    (50 rows) diff --git a/sql/stats/2022-11-23_table_sizes.txt b/sql/stats/2022-11-23_table_sizes.txt new file mode 100644 index 0000000..0a6254a --- /dev/null +++ b/sql/stats/2022-11-23_table_sizes.txt @@ -0,0 +1,21 @@ +PostgreSQL 13.2 - wbgrp-svc506.us.archive.org +Size: 1.13T + +             table_name             | table_size | indexes_size | total_size  +------------------------------------+------------+--------------+------------ + "public"."crossref"                | 459 GB     | 10 GB        | 470 GB + "public"."grobid"                  | 98 GB      | 13 GB        | 112 GB + "public"."cdx"                     | 63 GB      | 45 GB        | 108 GB + "public"."ingest_request"          | 53 GB      | 52 GB        | 105 GB + "public"."ingest_file_result"      | 46 GB      | 55 GB        | 100 GB + "public"."file_meta"               | 39 GB      | 40 GB        | 79 GB + "public"."grobid_shadow"           | 67 GB      | 5455 MB      | 73 GB + "public"."pdf_meta"                | 24 GB      | 7466 MB      | 31 GB + "public"."grobid_refs"             | 28 GB      | 3306 MB      | 31 GB + "public"."fatcat_file"             | 13 GB      | 7314 MB      | 20 GB + "public"."shadow"                  | 9517 MB    | 8026 MB      | 17 GB + "public"."html_meta"               | 7879 MB    | 68 MB        | 7947 MB + "public"."petabox"                 | 403 MB     | 461 MB       | 864 MB + "public"."pdftrio"                 | 550 MB     | 297 MB       | 847 MB + "public"."ingest_fileset_platform" | 8192 bytes | 16 kB        | 24 kB + "public"."crossref_with_refs"      | 0 bytes    | 0 bytes      | 0 bytes | 
