aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-25 16:31:55 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-25 16:31:55 -0700
commit93b8d7364359ba7f1a4adbc022286abcdf310834 (patch)
tree72fb9200a88d023b0a53be55f803f1fd213206c3 /sql
parentc4ce91bd78bd5e5144f97dcbb891492c21af0e31 (diff)
downloadsandcrawler-93b8d7364359ba7f1a4adbc022286abcdf310834.tar.gz
sandcrawler-93b8d7364359ba7f1a4adbc022286abcdf310834.zip
SQL commands for re-trying PDF ingests
Diffstat (limited to 'sql')
-rw-r--r--sql/ingest_again.md158
1 files changed, 158 insertions, 0 deletions
diff --git a/sql/ingest_again.md b/sql/ingest_again.md
new file mode 100644
index 0000000..3b4b990
--- /dev/null
+++ b/sql/ingest_again.md
@@ -0,0 +1,158 @@
+
+## re-ingest some broken
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'spn2-%'
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
+ ) TO '/grande/snapshots/reingest_spn2-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/grande/snapshots/reingest_cdx-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'cdx-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source != 'fatcat-changelog'
+ AND ingest_request.ingest_request_source != 'fatcat-ingest')
+ ) TO '/grande/snapshots/reingest_cdx-error_bulk_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'wayback-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ ) TO '/grande/snapshots/reingest_wayback-error_current.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'gateway-timeout'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/grande/snapshots/reingest_gateway-timeout.rows.json';
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.hit = false
+ AND ingest_file_result.status like 'petabox-error'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '12 day'::INTERVAL
+ AND (ingest_request.ingest_request_source = 'fatcat-changelog'
+ OR ingest_request.ingest_request_source = 'fatcat-ingest')
+ ) TO '/grande/snapshots/reingest_petabox-error_current.rows.json';
+
+Transform:
+
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_spn2-error_current.rows.json | shuf > reingest_spn2-error_current.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_current.rows.json | shuf > reingest_cdx-error_current.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_cdx-error_bulk_current.rows.json | shuf > reingest_cdx-error_bulk_current.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_wayback-error_current.rows.json | shuf > reingest_wayback-error_current.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_gateway-timeout.rows.json | shuf > reingest_gateway-timeout.json
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_petabox-error_current.rows.json | shuf > reingest_petabox-error_current.json
+
+Push to kafka (shuffled):
+
+ cat reingest_spn2-error_current.json reingest_cdx-error_current.json reingest_wayback-error_current.json reingest_petabox-error_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+ cat reingest_gateway-timeout.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p 0
+
+ cat reingest_cdx-error_bulk_current.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+Push to kafka (not shuffled):
+
+ cat reingest_spn2-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_cdx-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_cdx-error_bulk_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ cat reingest_wayback-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_gateway-timeout.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+ cat reingest_petabox-error_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## just recent fatcat-ingest
+
+ COPY (
+ SELECT row_to_json(ingest_request.*) FROM ingest_request
+ LEFT JOIN ingest_file_result ON ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_file_result.ingest_type = 'pdf'
+ AND ingest_file_result.updated < NOW() - '1 hour'::INTERVAL
+ -- AND ingest_file_result.updated > NOW() - '24 hour'::INTERVAL
+ AND ingest_file_result.updated > NOW() - '7 day'::INTERVAL
+ AND ingest_file_result.hit = false
+ AND (ingest_file_result.status like 'spn2-%'
+ OR ingest_file_result.status like 'cdx-error'
+ OR ingest_file_result.status like 'gateway-timeout'
+ OR ingest_file_result.status like 'wayback-error'
+ )
+ AND ingest_file_result.status != 'spn2-error:invalid-url-syntax'
+ AND ingest_file_result.status != 'spn2-error:spn2-error:filesize-limit'
+ AND ingest_request.ingest_request_source = 'fatcat-ingest'
+ ) TO '/grande/snapshots/reingest_fatcat_current.rows.json';
+
+ # note: shuf
+ ./scripts/ingestrequest_row2json.py /grande/snapshots/reingest_fatcat_current.rows.json | shuf > reingest_fatcat_current.json
+
+ cat reingest_fatcat_current.json | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1
+
+## specific domains
+
+protocols.io:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url LIKE '%10.17504/protocols.io%'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;
+
+biorxiv/medrxiv:
+
+ SELECT ingest_file_result.ingest_type, ingest_file_result.status, COUNT(*)
+ FROM ingest_file_result
+ LEFT JOIN ingest_request
+ ON ingest_file_result.ingest_type = ingest_request.ingest_type
+ AND ingest_file_result.base_url = ingest_request.base_url
+ WHERE ingest_request.ingest_type = 'pdf'
+ AND ingest_request.base_url LIKE '%10.1101/20%'
+ GROUP BY ingest_file_result.ingest_type, ingest_file_result.status
+ ORDER BY COUNT DESC
+ LIMIT 20;