diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-08-13 13:57:57 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-08-13 13:57:57 -0700 |
commit | eab9b929a05da3fa25f4bfaffd84bb0d7b219c73 (patch) | |
tree | ec02f3d675f81779f217df3ddd9322fa15b5863f /notes/ingest/2021-08_mag.md | |
parent | ccb2c72c170d6736af675734906c8957ee176a8b (diff) | |
download | sandcrawler-eab9b929a05da3fa25f4bfaffd84bb0d7b219c73.tar.gz sandcrawler-eab9b929a05da3fa25f4bfaffd84bb0d7b219c73.zip |
MAG and OAI-PMH crawl/processing notes
Diffstat (limited to 'notes/ingest/2021-08_mag.md')
-rw-r--r-- | notes/ingest/2021-08_mag.md | 276 |
1 files changed, 276 insertions, 0 deletions
diff --git a/notes/ingest/2021-08_mag.md b/notes/ingest/2021-08_mag.md new file mode 100644 index 0000000..5bab4f0 --- /dev/null +++ b/notes/ingest/2021-08_mag.md @@ -0,0 +1,276 @@ + +Using 2021-06-07 upstream MAG snapshot to run a crawl and do some re-ingest. +Also want to re-ingest some old/failed ingests, now that pipeline/code has +improved. + +Ran munging from `scratch:ingest/mag` notes first. Yielded 22.5M PDF URLs. + + +## Persist Ingest Requests + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | head -n1000 | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 1000, 'insert-requests': 276, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 1000, 'pushed': 1000}) + + zcat /srv/sandcrawler/tasks/ingest_requests_mag-2021-06-07.json.gz | pv -l | ./persist_tool.py ingest-request - + => 22.5M 0:46:00 [8.16k/s] + => Worker: Counter({'total': 22527585, 'insert-requests': 8686315, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 22527585, 'pushed': 22527585}) + +Roughly 8.6 million new URLs + +## Pre-Crawl Status Counts + +Status of combined old and new requests, with some large domains removed: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + -- AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------------+---------- + success | 26123975 + | 6664846 + no-pdf-link | 1859908 + redirect-loop | 1532405 + no-capture | 1199126 + link-loop | 1157010 + terminal-bad-status | 832362 + gateway-timeout | 202158 + spn2-cdx-lookup-failure | 81406 + wrong-mimetype | 69087 + invalid-host-resolution | 37262 + wayback-error | 21340 + petabox-error | 11237 + null-body | 9414 + wayback-content-error | 2199 + cdx-error | 1893 + spn2-error | 1741 + spn2-error:job-failed | 971 + blocked-cookie | 902 + spn2-error:invalid-url-syntax | 336 + (20 rows) + +And just the new URLs (note that domain filter shouldn't be required, but +keeping for consistency): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + | 6664780 + success | 1957844 + redirect-loop | 23357 + terminal-bad-status | 9385 + no-pdf-link | 8315 + no-capture | 6892 + link-loop | 4517 + wrong-mimetype | 3864 + cdx-error | 1749 + blocked-cookie | 842 + null-body | 747 + wayback-error | 688 + wayback-content-error | 570 + gateway-timeout | 367 + petabox-error | 340 + spn2-cdx-lookup-failure | 150 + read-timeout | 122 + not-found | 119 + invalid-host-resolution | 63 + spn2-error | 23 + (20 rows) + +## Dump Initial Bulk Ingest Requests + +Note that this is all-time, not just recent, and will re-process a lot of +"no-pdf-link": + + COPY ( + SELECT row_to_json(ingest_request.*) FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-pdf-link' + OR ingest_file_result.status = 'cdx-error' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json'; + => COPY 8526647 + +Transform to ingest requests: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json + => 8.53M 0:03:40 + +Enqueue the whole batch: + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-03.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + => DONE + +Updated stats after running initial bulk ingest: + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + AND ingest_request.created > '2021-06-01' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+--------- + success | 5184994 + no-capture | 3284416 + redirect-loop | 98685 + terminal-bad-status | 28733 + link-loop | 28518 + blocked-cookie | 22338 + no-pdf-link | 19073 + wrong-mimetype | 9122 + null-body | 2793 + wayback-error | 2128 + wayback-content-error | 1233 + cdx-error | 1198 + petabox-error | 617 + gateway-timeout | 395 + not-found | 130 + read-timeout | 128 + | 111 + invalid-host-resolution | 63 + spn2-cdx-lookup-failure | 24 + spn2-error | 20 + (20 rows) + +## Generate Seedlist + +For crawling, do a similar (but not identical) dump: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'mag' + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'wayback-content-error' + OR ingest_file_result.status = 'petabox-error' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%researchgate.net%' + AND ingest_request.base_url NOT LIKE '%muse.jhu.edu%' + AND ingest_request.base_url NOT LIKE '%omicsonline.org%' + AND ingest_request.base_url NOT LIKE '%link.springer.com%' + AND ingest_request.base_url NOT LIKE '%ieeexplore.ieee.org%' + ) t1 + ) TO '/srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json'; + => COPY 4599519 + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | pv -l > /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.ingest_request.json + => 4.60M 0:02:55 [26.2k/s] + +And actually dump seedlist(s): + + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | jq -r .base_url | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + cat /srv/sandcrawler/tasks/mag_ingest_request_2021-08-11.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt + cat /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt | sort -u -S 4G > /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + => DONE + + wc -l /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.*.txt + 4593238 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.base_url.txt + 4632911 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.combined.txt + 3294710 /srv/sandcrawler/tasks/mag_seedlist_2021-08-11.terminal_url.txt |