From d3638a9fd9ed11fb4484038852f8e02b2f5a7b41 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 22 Mar 2022 16:03:46 -0700 Subject: various ingest/task notes --- notes/ingest/2021-12-13_datasets.md | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'notes/ingest/2021-12-13_datasets.md') diff --git a/notes/ingest/2021-12-13_datasets.md b/notes/ingest/2021-12-13_datasets.md index edad789..1df633f 100644 --- a/notes/ingest/2021-12-13_datasets.md +++ b/notes/ingest/2021-12-13_datasets.md @@ -396,3 +396,56 @@ This is after having done a bunch of crawling. | pv -l \ > /srv/sandcrawler/tasks/ingest_dataset_retry_results.json +## Retries (2022-02) + +Finally got things to complete end to end for this batch! + + cat ingest_dataset_retry_results5.json | jq .status -r | sort | uniq -c | sort -nr + 3220 terminal-bad-status + 2120 no-capture + 380 empty-manifest + 264 success-file + 251 success + 126 success-existing + 39 mismatch + 28 error-platform-download + 24 too-many-files + 20 platform-scope + 13 platform-restricted + 13 mismatch-size + 6 too-large-size + 3 transfer-encoding-error + 2 no-platform-match + 2 error-archiveorg-upload + 1 redirect-loop + 1 empty-blob + +Some more URLs to crawl: + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg -v '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | jq .request.base_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.base_url.txt + # 1.00 + # just a single DOI that failed to crawl, for whatever reason + + cat ingest_dataset_retry_results5.json \ + | rg '"no-capture"' \ + | rg '"manifest"' \ + | jq 'select(.status = "no-capture")' -c \ + | rg '"web-' \ + | jq .manifest[].terminal_url -r \ + | pv -l \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt + +These are ready to crawl, in the existing dataset crawl. + + cat /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.txt \ + | sort -u \ + | shuf \ + | awk '{print "F+ " $1}' \ + > /srv/sandcrawler/tasks/dataset_seedlist_retries5.manifest_terminal.schedule + -- cgit v1.2.3