diff options
Diffstat (limited to 'notes/ingest/2021-08_mag.md')
-rw-r--r-- | notes/ingest/2021-08_mag.md | 400 |
1 files changed, 400 insertions, 0 deletions
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md new file mode 100644 index 0000000..5f92196 --- /dev/null +++ b/notes/ingest/2021-08_mag.md @@ -0,0 +1,400 @@ + +Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest. +Also want to re-ingest some old/failed ingests, now that pipeline/code has +improved. + +Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs. + + +## Persist Ingest Requests + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000}) + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request - + => 22.5M 0:46:00 [8.16k/s] + => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585}) + +Roughly 8.6 million new URLs + +## Pre-Crawl Status Counts + +Status of combined old and new requests, with some large domains removed: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + success | 26123975 + | 6664846 + no-pdf-link | 1859908 + redirect-loop | 1532405 + no-capture | 1199126 + link-loop | 1157010 + terminal-bad-status | 832362 + gateway-timeout | 202158 + spn2-cdx-lookup-failure | 81406 + wrong-mimetype | 69087 + invalid-host-resolution | 37262 + wayback-error | 21340 + petabox-error | 11237 + null-body | 9414 + wayback-content-error | 2199 + cdx-error | 1893 + spn2-error | 1741 + spn2-error:job-failed | 971 + blocked-cookie | 902 + spn2-error:invalid-url-syntax | 336 + (20 rows) + +And just the new URLs (note that domain filter shouldn't be required, but +keeping for consistency): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + | 6664780 + success | 1957844 + redirect-loop | 23357 + terminal-bad-status | 9385 + no-pdf-link | 8315 + no-capture | 6892 + link-loop | 4517 + wrong-mimetype | 3864 + cdx-error | 1749 + blocked-cookie | 842 + null-body | 747 + wayback-error | 688 + wayback-content-error | 570 + gateway-timeout | 367 + petabox-error | 340 + spn2-cdx-lookup-failure | 150 + read-timeout | 122 + not-found | 119 + invalid-host-resolution | 63 + spn2-error | 23 + (20 rows) + +## Dump Initial Bulk Ingest Requests + +Note that this is all-time, not just recent, and will re-process a lot of +"no-pdf-link": + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-pdf-link' + OR ingest_file_result.status = 'cdx-error' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json'; + => COPY 8526647 + +Transform to ingest requests: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json + => 8.53M 0:03:40 + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + +Updated stats after running initial bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 5184994 + no-capture | 3284416 + redirect-loop | 98685 + terminal-bad-status | 28733 + link-loop | 28518 + blocked-cookie | 22338 + no-pdf-link | 19073 + wrong-mimetype | 9122 + null-body | 2793 + wayback-error | 2128 + wayback-content-error | 1233 + cdx-error | 1198 + petabox-error | 617 + gateway-timeout | 395 + not-found | 130 + read-timeout | 128 + | 111 + invalid-host-resolution | 63 + spn2-cdx-lookup-failure | 24 + spn2-error | 20 + (20 rows) + +## Generate Seedlist + +For crawling, do a similar (but not identical) dump: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json'; + => COPY 4599519 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json + => 4.60M 0:02:55 [26.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + => DONE + + wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt + 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + +## Post-Crawl Bulk Re-Ingest + +Got about 1.8 million new PDFs from crawl, and a sizable fraction of dupes (by +hash, URL agnostic). + +Enqueue for buik re-ingest: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => Thu 19 Aug 2021 09:10:59 PM UTC + + +## Post-Ingest Stats + +Just the new stuff (compare against above for delta): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 7748241 89.2% + no-capture | 429688 4.9% + redirect-loop | 172831 2.0% + terminal-bad-status | 94029 1.1% + no-pdf-link | 86437 1.0% + blocked-cookie | 67903 0.8% + link-loop | 50622 + wrong-mimetype | 21064 + null-body | 6650 + cdx-error | 3313 + wayback-error | 2630 + gateway-timeout | 399 + petabox-error | 268 + wayback-content-error | 170 + not-found | 130 + read-timeout | 128 + | 109 + invalid-host-resolution | 63 + bad-redirect | 39 + spn2-error | 20 + (20 rows) + +New success due to crawl (new batch only): 7748241 - 1957844 = 5,790,397 + +Overall success of new batch: 7748241. / 8686315 = 89.2% + +And combined (old and new) status again: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------------+---------- + success | 31990062 + redirect-loop | 1704717 + no-capture | 1263462 + link-loop | 1218280 + blocked-cookie | 1213838 + no-pdf-link | 1096664 + terminal-bad-status | 960070 + gateway-timeout | 202190 + wrong-mimetype | 86557 + invalid-host-resolution | 37262 + null-body | 15443 + wayback-error | 12839 + cdx-error | 4047 + spn2-error | 1731 + spn2-error:job-failed | 962 + petabox-error | 463 + wayback-content-error | 379 + spn2-error:invalid-url-syntax | 336 + spn2-error:soft-time-limit-exceeded | 203 + | 175 + (20 rows) + +New success total: 31990062 - 26123975 = 5,866,087 + +A full 1,263,462 no-capture that could be attempted... though many of those may +be excluded for a specific reason. |