diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-30 09:33:24 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-30 09:33:24 -0700 |
commit | ccb2c72c170d6736af675734906c8957ee176a8b (patch) | |
tree | 3cb039bc11f77e0adc5e4cf384a69faa172bc074 /notes/ingest | |
parent | 899eaa8a03728f0df1390fa5768bbdd4ba559a7f (diff) | |
download | sandcrawler-ccb2c72c170d6736af675734906c8957ee176a8b.tar.gz sandcrawler-ccb2c72c170d6736af675734906c8957ee176a8b.zip |
2021-07 unpaywall crawl wrap-up notes
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2021-07_unpaywall.md | 120 |
1 file changed, 108 insertions, 12 deletions
diff --git a/notes/ingest/2021-07_unpaywall.md b/notes/ingest/2021-07_unpaywall.md index 5826771..8b6ac09 100644 --- a/notes/ingest/2021-07_unpaywall.md +++ b/notes/ingest/2021-07_unpaywall.md @@ -168,10 +168,11 @@ Then run crawl (see `journal-crawls` git repo). ## Post-Crawl Bulk Ingest cat /srv/sandcrawler/tasks/unpaywall_crawl_ingest_2021-07-02.json | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => 1.74M 0:01:59 [14.6k/s] ## Post-Ingest Stats -Overall status (unpaywall, all time): +Only the recent updates: SELECT ingest_file_result.status, COUNT(*) FROM ingest_request @@ -181,12 +182,36 @@ Overall status (unpaywall, all time): WHERE ingest_request.ingest_type = 'pdf' AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' GROUP BY status ORDER BY COUNT DESC LIMIT 20; + status | count + -------------------------+--------- + success | 2690258 + redirect-loop | 227328 + no-capture | 157368 + terminal-bad-status | 118943 + no-pdf-link | 92698 + blocked-cookie | 19478 + link-loop | 9249 + wrong-mimetype | 4918 + cdx-error | 1786 + wayback-error | 1497 + null-body | 1302 + body-too-large | 433 + wayback-content-error | 245 + petabox-error | 171 + gateway-timeout | 138 + invalid-host-resolution | 120 + timeout | 12 + bad-redirect | 4 + | 3 + spn2-cdx-lookup-failure | 1 + (20 rows) -Ingest stats broken down by publication stage: +Only the recent updates, by publication stage: SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) FROM ingest_request @@ -196,14 +221,83 @@ Ingest stats broken down by publication stage: WHERE ingest_request.ingest_type = 'pdf' AND ingest_request.link_source = 'unpaywall' + AND date(ingest_request.created) > '2021-07-01' GROUP BY release_stage, status ORDER BY release_stage, COUNT DESC LIMIT 100; + release_stage | status | count + ---------------+-------------------------+--------- + accepted | 
success | 103144 + accepted | no-pdf-link | 53981 + accepted | terminal-bad-status | 4102 + accepted | link-loop | 2799 + accepted | no-capture | 2315 + accepted | redirect-loop | 2171 + accepted | blocked-cookie | 234 + accepted | cdx-error | 140 + accepted | wayback-error | 101 + accepted | wrong-mimetype | 38 + accepted | null-body | 10 + accepted | petabox-error | 5 + accepted | wayback-content-error | 4 + accepted | gateway-timeout | 2 + accepted | body-too-large | 2 + published | success | 1919100 + published | no-capture | 130104 + published | redirect-loop | 127482 + published | terminal-bad-status | 43118 + published | no-pdf-link | 33505 + published | blocked-cookie | 19034 + published | link-loop | 6241 + published | wrong-mimetype | 4163 + published | null-body | 1195 + published | cdx-error | 1151 + published | wayback-error | 1105 + published | wayback-content-error | 197 + published | body-too-large | 195 + published | petabox-error | 118 + published | gateway-timeout | 35 + published | invalid-host-resolution | 13 + published | timeout | 8 + published | bad-redirect | 2 + published | spn2-cdx-lookup-failure | 1 + published | bad-gzip-encoding | 1 + submitted | success | 668014 + submitted | redirect-loop | 97675 + submitted | terminal-bad-status | 71723 + submitted | no-capture | 24949 + submitted | no-pdf-link | 5212 + submitted | wrong-mimetype | 717 + submitted | cdx-error | 495 + submitted | wayback-error | 291 + submitted | body-too-large | 236 + submitted | blocked-cookie | 210 + submitted | link-loop | 209 + submitted | invalid-host-resolution | 107 + submitted | gateway-timeout | 101 + submitted | null-body | 97 + submitted | petabox-error | 48 + submitted | wayback-content-error | 44 + submitted | timeout | 4 + submitted | | 3 + submitted | bad-redirect | 2 + submitted | remote-server-error | 1 + (55 rows) -Only the recent updates: +In total, this iteration of unpaywall ingest resulted in: - SELECT ingest_file_result.status, COUNT(*) +- 
3,325,954 raw ingest requests (new URLs total) +- 1,743,186 (52% of all) of these had not been seen/crawled from any source yet (?), and were attempted in this crawl +- 1,346,654 (77% of crawled) success from new heritrix crawling +- 2,690,258 (81%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) + +## Live Ingest Follow-Up + +Will run SPN requests on the ~160k `no-capture` URLs: + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request LEFT JOIN ingest_file_result ON ingest_file_result.ingest_type = ingest_request.ingest_type @@ -212,13 +306,15 @@ Only the recent updates: ingest_request.ingest_type = 'pdf' AND ingest_request.link_source = 'unpaywall' AND date(ingest_request.created) > '2021-07-01' - GROUP BY status - ORDER BY COUNT DESC - LIMIT 20; + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json'; + => COPY 157371 -In total, this iteration of unpaywall ingest resulted in: + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json + => 157k 0:00:04 [31.6k/s] + +Enqueue the whole batch: -- XXX raw ingest requests (new URLs total) -- XXX (YY%) of these had not been seen/crawled from any source yet -- XXX (YY%) success from new heritrix crawling -- XXX (YY%) total success (including crawled initially for other reasons; out of all new URLs including those not expected to be success) + cat /srv/sandcrawler/tasks/unpaywall_noingest_2021-07-30.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + => DONE |