diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-17 17:12:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-17 17:12:41 -0700 |
commit | c8a7683f1ac1f42bd9e4c312ae54c792b01392ec (patch) | |
tree | 021d4b5dceefcf8aa07af7e5d0a977e48fc74971 /notes/ingest | |
parent | 0c924003291ea1a41cf0543463d2b9ce43d123fd (diff) | |
download | sandcrawler-c8a7683f1ac1f42bd9e4c312ae54c792b01392ec.tar.gz sandcrawler-c8a7683f1ac1f42bd9e4c312ae54c792b01392ec.zip |
notes on 2020-09 re-ingest passes
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2020-09_reingest.md | 197 |
1 files changed, 197 insertions, 0 deletions
diff --git a/notes/ingest/2020-09_reingest.md b/notes/ingest/2020-09_reingest.md new file mode 100644 index 0000000..ec4e536 --- /dev/null +++ b/notes/ingest/2020-09_reingest.md @@ -0,0 +1,197 @@ + +Goal: re-bulk-ingest some older existing crawls which hung on errors like +`cdx-error` or `wayback-error`, indicating that ingest might actually succeed +on retry. + +Sources: +- unpaywall (again) +- doi (ingest, changelog, etc) +- mag +- oai + +## DOI + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 25; + + status | count + -------------------------------------+--------- + no-pdf-link | 8304582 + success | 3461708 + no-capture | 1881269 + redirect-loop | 1851541 + gateway-timeout | 355820 + cdx-error | 341848 + terminal-bad-status | 328650 + skip-url-blocklist | 220474 + spn2-cdx-lookup-failure | 125521 + link-loop | 109352 + wayback-error | 101525 + null-body | 73539 + wrong-mimetype | 53151 + spn-error | 13579 + spn2-error | 6848 + spn2-error:job-failed | 4381 + spn-remote-error | 4180 + other-mimetype | 2305 + petabox-error | 904 + timeout | 710 + spn2-error:soft-time-limit-exceeded | 557 + spn2-error:proxy-error | 437 + spn2-error:browser-running-error | 273 + invalid-host-resolution | 233 + pending | 116 + (25 rows) + +Bulk: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO 
'/grande/snapshots/ingest_doi_errors_2020-09-03.rows.json'; + => 443421 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_doi_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +Additional 27,779 success status? Hard to tell because lots of other ingest +running in parallel. + +Live: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'doi' + AND ( + ingest_file_result.status = 'spn-error' OR + ingest_file_result.status = 'spn2-cdx-lookup-failure' OR + ingest_file_result.status = 'spn2-error:job-failed' OR + ingest_file_result.status = 'spn2-error:proxy-error' + ) + ) TO '/grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json'; + => 143984 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_doi_spn_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_doi_spn_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + +## Unpaywall (again) + +Bulk: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json'; + => 43912 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_unpaywall_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_unpaywall_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +## MAG + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_mag_errors_2020-09-03.rows.json'; + => 188175 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_mag_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_mag_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +## OAI-PMH + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'oai' + AND ( + ingest_file_result.status = 'cdx-error' OR + ingest_file_result.status = 'wayback-error' + ) + ) TO '/grande/snapshots/ingest_oai_errors_2020-09-03.rows.json'; + => 851056 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/ingest_oai_errors_2020-09-03.rows.json | pv -l | shuf > /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json + + cat /grande/snapshots/ingest_oai_errors_2020-09-03.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => done + +--------- + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2020-04-01' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/unpaywall_nocapture_2020-05-04.rows.json'; + |