diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-28 10:33:28 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-28 10:33:28 -0700 |
commit | 6de98c221cc9fe1e5410c52a08b1d3b7470cd6ea (patch) | |
tree | f8f13297e176c9f3e4cef78094a25341c9411e32 /notes/ingest | |
parent | a8d76d29c23b9aaf32fe531e56244bb3422a23aa (diff) | |
download | sandcrawler-6de98c221cc9fe1e5410c52a08b1d3b7470cd6ea.tar.gz sandcrawler-6de98c221cc9fe1e5410c52a08b1d3b7470cd6ea.zip |
update MAG crawl notes
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2020-03-04_mag.md | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/notes/ingest/2020-03-04_mag.md b/notes/ingest/2020-03-04_mag.md index 97594c8..9b000a3 100644 --- a/notes/ingest/2020-03-04_mag.md +++ b/notes/ingest/2020-03-04_mag.md @@ -406,3 +406,74 @@ Full run: 2020-04-07 12:19 (pacific): 11,703,871 +## Post-bulk-ingest + +Around 2020-04-28, it seems like the main wave of bulk ingest is complete. Will need +to retry things like cdx-error. + +Current status: + + status | count + -------------------------------+---------- + success | 18491799 + redirect-loop | 1968530 + no-capture | 1373657 + no-pdf-link | 1311842 + link-loop | 1296439 + terminal-bad-status | 627577 + cdx-error | 418278 + wrong-mimetype | 50141 + wayback-error | 37159 + petabox-error | 11249 + null-body | 6295 + gateway-timeout | 3051 + spn2-cdx-lookup-failure | 328 + spn2-error:invalid-url-syntax | 93 + bad-redirect | 75 + | 47 + invalid-host-resolution | 28 + spn2-error | 10 + bad-gzip-encoding | 7 + redirects-exceeded | 2 + (20 rows) + +Lots of cdx-error to retry. + +The no-capture links are probably a mix of domain-blocklist and things that +failed in bulk mode. 
Will dump and re-attempt them: + + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_file_result.status = 'no-capture' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + ) TO '/grande/snapshots/mag_nocapture_20200420.rows.json'; + => 859849 + +What domains are these? + + cat mag_nocapture_20200420.rows.json | jq .base_url -r | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n30 + +Let's filter down more: + + cat mag_nocapture_20200420.rows.json | rg -v 'www.researchgate.net' | rg -v 'muse.jhu.edu' | rg -v 'www.omicsonline.org' | rg -v 'link.springer.com' | rg -v 'iopscience.iop.org' | rg -v 'ieeexplore.ieee.org' | shuf > mag_nocapture_20200420.rows.filtered.json + + wc -l mag_nocapture_20200420.rows.filtered.json + 423085 mag_nocapture_20200420.rows.filtered.json + +Ok, enqueue! + + cat mag_nocapture_20200420.rows.filtered.json | shuf | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests -p -1 + |