diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-03-11 12:24:19 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-03-11 12:24:19 -0800 |
commit | 257f41b174e04957aecf298b3ecdaae0ab44a1d2 (patch) | |
tree | 93a5ba9fafe6823474d995118d0626ed2200ea27 | |
parent | 9d096b26e35802553263d6472a534deb381e65da (diff) | |
download | sandcrawler-257f41b174e04957aecf298b3ecdaae0ab44a1d2.tar.gz sandcrawler-257f41b174e04957aecf298b3ecdaae0ab44a1d2.zip |
DOAJ ingest/crawl notes
-rw-r--r-- | notes/ingest/2022-03_doaj.md | 266 |
1 files changed, 266 insertions, 0 deletions
diff --git a/notes/ingest/2022-03_doaj.md b/notes/ingest/2022-03_doaj.md new file mode 100644 index 0000000..bace480 --- /dev/null +++ b/notes/ingest/2022-03_doaj.md @@ -0,0 +1,266 @@ + +plan: +- usual setup and dump ingest requests +- filter ingest requests to targetted ccTLDs, and add those to crawl first + +## Transform and Load + + # on sandcrawler-vm + mkdir -p /srv/sandcrawler/tasks/doaj + cd /srv/sandcrawler/tasks/doaj + wget 'https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2022-03-07_all.json.gz' + + # in pipenv, in python directory + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l | gzip > /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz + # 9.08M 0:37:38 [4.02k/s] + + zcat /srv/sandcrawler/tasks/doaj/doaj_article_data_2022-03-07_all.ingest_request.json.gz | pv -l | ./persist_tool.py ingest-request - + # Worker: Counter({'total': 9082373, 'insert-requests': 2982535, 'update-requests': 0}) + # JSON lines pushed: Counter({'total': 9082373, 'pushed': 9082373}) + + +## Check Pre-Crawl Status + +2022-03-09, before the above load: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 2919808 + html | wrong-scope | 1098998 + pdf | no-pdf-link | 481532 + pdf | redirect-loop | 429006 + html | success | 342501 + html | unknown-scope | 225390 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187762 + html | no-capture | 185418 + pdf | no-capture | 171273 + pdf | null-body | 129028 + html | null-body | 100296 + pdf | terminal-bad-status | 91551 + pdf | link-loop | 25447 + html | wrong-mimetype | 22640 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + pdf | wrong-mimetype | 7688 + xml | success | 6897 + html | petabox-error | 5529 + pdf | wayback-error | 2706 + xml | null-body | 2353 + pdf | | 2063 + pdf | wayback-content-error | 1349 + html | cdx-error | 1169 + pdf | cdx-error | 1130 + pdf | petabox-error | 679 + html | | 620 + pdf | empty-blob | 562 + html | blocked-cookie | 545 + (30 rows) + +After the above load: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3036457 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108132 + pdf | no-pdf-link | 485703 + pdf | redirect-loop | 436085 + html | success | 342594 + html | unknown-scope | 225412 + html | redirect-loop | 223927 + html | html-resource-no-capture | 187999 + html | no-capture | 187310 + pdf | no-capture | 172033 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91799 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19028 + html | terminal-bad-status | 13327 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6897 + html | petabox-error | 5530 + pdf | wayback-error | 2707 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 771 + pdf | empty-blob | 562 + (30 rows) + +Dump ingest requests for crawling (or bulk ingest first?): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json'; + => COPY 353819 + +Not that many! Guess the filters are important? + + SELECT COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ); + => 3202164 + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json + => 353k 0:00:16 [21.0k/s] + +Bulk ingest: + + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-09.requests.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Dump seeds again (for crawling): + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + -- AND (ingest_request.ingest_type = 'pdf' + -- OR ingest_request.ingest_type = 'xml') + AND ( + ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json'; + # COPY 350661 + +And stats again: + + ingest_type | status | count + -------------+--------------------------+--------- + pdf | success | 3037059 + pdf | | 1623208 + html | | 1208412 + html | wrong-scope | 1108476 + pdf | no-pdf-link | 485705 + pdf | redirect-loop | 436850 + html | success | 342762 + html | unknown-scope | 225412 + html | redirect-loop | 224683 + html | html-resource-no-capture | 188058 + html | no-capture | 185734 + pdf | no-capture | 170452 + pdf | null-body | 129266 + html | null-body | 100296 + pdf | terminal-bad-status | 91875 + pdf | link-loop | 26933 + html | wrong-mimetype | 22643 + html | wayback-content-error | 19042 + html | terminal-bad-status | 13333 + xml | | 11196 + pdf | wrong-mimetype | 7929 + xml | success | 6898 + html | petabox-error | 5535 + pdf | wayback-error | 2711 + xml | null-body | 2353 + pdf | wayback-content-error | 1353 + pdf | cdx-error | 1177 + html | cdx-error | 1172 + pdf | petabox-error | 772 + html | blocked-cookie | 769 + (30 rows) + +Transform: + + ./scripts/ingestrequest_row2json.py /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.rows.json | pv -l | shuf > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json + +Create seedlist: + + cat /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.requests.json \ + | jq -r .base_url \ + | sort -u -S 4G \ + > /srv/sandcrawler/tasks/doaj_seedlist_2022-03-10.txt + +Send off an added to `TARGETED-ARTICLE-CRAWL-2022-03` heritrix crawl, will +re-ingest when that completes (a week or two?). |