diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-12-31 12:17:00 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-12-31 12:17:00 -0800 |
commit | b0d4ac285f101a641cee0e87da80d47a468c08aa (patch) | |
tree | f9aabc05b6ca43528c376b56ed3ae7351c15f2b4 /notes/ingest | |
parent | 476fa2ff8c5e561287390505c17caf1888d6b9f4 (diff) | |
download | sandcrawler-b0d4ac285f101a641cee0e87da80d47a468c08aa.tar.gz sandcrawler-b0d4ac285f101a641cee0e87da80d47a468c08aa.zip |
DOAJ crawl ingest stats
Diffstat (limited to 'notes/ingest')
-rw-r--r-- | notes/ingest/2020-11_doaj.md | 295 |
1 files changed, 295 insertions, 0 deletions
diff --git a/notes/ingest/2020-11_doaj.md b/notes/ingest/2020-11_doaj.md new file mode 100644 index 0000000..473dd0d --- /dev/null +++ b/notes/ingest/2020-11_doaj.md @@ -0,0 +1,295 @@ + +This is the first ingest (and crawl) of URLs from DOAJ article-level metadata. +It will include at least 'pdf' and 'html' ingest requests, not just 'pdf' as in +the past. + +Working off a 2020-11-13 snapshot. + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json + => 6.7M 0:24:28 [4.57k/s] + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => ran into an error with blank `base_url` + +Second try after patches: + + zcat /schnell/DOAJ-CRAWL-2020-11/doaj_article_data_2020-11-13_all.json.gz | ./scripts/doaj2ingestrequest.py - | pv -l > /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json + => 6.7M 0:24:29 [4.56k/s] + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => Worker: Counter({'total': 6703036, 'insert-requests': 163854, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 6703036, 'pushed': 6703036}) + +## Check Pre-Crawl Status + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + -- next time include ingest_type in sort + ORDER BY COUNT DESC + LIMIT 30; + + + ingest_type | status | count + -------------+-------------------------+--------- + pdf | | 3711532 + html | | 2429003 + pdf | success | 454403 + pdf | redirect-loop | 48587 + pdf | no-pdf-link | 
24901 + pdf | no-capture | 11569 + xml | | 9442 + pdf | link-loop | 8466 + pdf | terminal-bad-status | 2015 + pdf | wrong-mimetype | 1441 + pdf | null-body | 1057 + pdf | petabox-error | 299 + pdf | cdx-error | 124 + pdf | gateway-timeout | 114 + pdf | wayback-error | 77 + pdf | spn2-cdx-lookup-failure | 20 + pdf | invalid-host-resolution | 4 + pdf | spn2-error | 1 + (18 rows) + +## Dump new URLs, Transform, Bulk Ingest (PDF and XML only) + +Dump: + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_request.link_source = 'doaj' + -- AND date(ingest_request.created) > '2020-12-01' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/doaj_noingest_2020-11-19.rows.json'; + => COPY 3732543 + +Transform: + + ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_noingest_2020-11-19.rows.json | pv -l | shuf > /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json + => 3.73M 0:02:18 [26.9k/s] + +Definitely some non-URL strings in there; should try to filter those out +earlier in the transform process. And/or have a constraint on the URL column in +the database. + +Enqueue the whole batch: + + cat /grande/snapshots/doaj_noingest_2020-11-19.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Started this batch off at 2020-11-19 18:10 (Pacific time) + +Stats after run: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 30; + +## Dump Seedlist + +After preliminary bulk ingest attempts, dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.base_url = ingest_request.base_url + AND ingest_file_result.ingest_type = ingest_request.ingest_type + WHERE + ingest_request.link_source = 'doaj' + AND (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_file_result.status != 'success' + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + 
AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + ) t1 + ) TO '/grande/snapshots/doaj_seedlist_2020-11-19.rows.json'; + => 1,899,555 + +TODO: filter for valid URLs + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | pv -l > /grande/snapshots/doaj_crawl_ingest_2020-11-19.json + +And actually dump seedlist(s): + + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.url.txt + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.terminal_url.txt + cat /grande/snapshots/doaj_seedlist_2020-11-19.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/doaj_seedlist_2020-11-19.no_terminal_url.txt + + wc -l doaj_seedlist_2020-11-19.*.txt + +## Post-Crawl Ingest + +Re-run all ingests, from original batch (pdf, xml, and html), now that DOAJ +identifiers are all in fatcat: + + cat /schnell/DOAJ-CRAWL-2020-11/doaj_20201113.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + # started 2020-12-23 15:05 (Pacific) + # finished around 2020-12-31, after one long/slow partition + +Stats again after everything: + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 50; + + ingest_type | status | count + -------------+--------------------------+--------- + html | wrong-scope | 1089423 + html | no-capture | 423917 + html | redirect-loop | 212910 + html | unknown-scope | 204069 + html | html-resource-no-capture | 165587 + html | success | 122937 + html | null-body | 100296 + html | wayback-content-error | 53918 + html | wrong-mimetype | 18908 + html | terminal-bad-status | 14059 + html | petabox-error | 13520 + html | cdx-error | 6823 + html | wayback-error | 890 + html | | 620 + html | blocked-cookie | 543 + html | blocked-captcha | 250 + html | redirects-exceeded | 135 + html | too-many-resources | 111 + html | max-hops-exceeded | 84 + html | bad-redirect | 3 + pdf | success | 2851324 + pdf | no-pdf-link | 529914 + pdf | redirect-loop | 349494 + pdf | no-capture | 272202 + pdf | null-body | 129027 + pdf | terminal-bad-status | 91796 + pdf | link-loop | 25267 + pdf | wrong-mimetype | 6504 + pdf | wayback-error | 2968 + pdf | | 2068 + pdf | wayback-content-error | 1548 + pdf | cdx-error | 1095 + pdf | petabox-error | 1024 + pdf | bad-redirect | 203 + pdf | redirects-exceeded | 135 + pdf | timeout | 20 + pdf | max-hops-exceeded | 19 + pdf | bad-gzip-encoding | 2 + xml | success | 6897 + xml | null-body | 2353 + xml | wrong-mimetype | 184 + xml | no-capture | 5 + xml | cdx-error | 3 + (43 rows) + + +And 
on filtered subset that we actually crawled (note: the `terminal_url NOT LIKE` conditions evaluate to NULL when `terminal_url` is NULL, so rows with no result or no terminal URL are also excluded from these counts): + + SELECT ingest_request.ingest_type, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.link_source = 'doaj' + AND (ingest_request.ingest_type = 'pdf' + OR ingest_request.ingest_type = 'xml') + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_request.base_url NOT LIKE '%://archive.org/%' + AND ingest_request.base_url NOT LIKE '%://web.archive.org/%' + AND ingest_request.base_url NOT LIKE '%://www.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%://archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://web.archive.org/%' + AND ingest_file_result.terminal_url NOT LIKE '%://www.archive.org/%' + GROUP BY ingest_request.ingest_type, status + ORDER BY ingest_request.ingest_type, COUNT DESC + LIMIT 50; + + ingest_type | status | count + -------------+-----------------------+--------- + pdf | success | 2851286 + pdf | no-pdf-link | 527495 + pdf | redirect-loop | 345138 + 
pdf | no-capture | 268140 + pdf | null-body | 129027 + pdf | terminal-bad-status | 91125 + pdf | link-loop | 25267 + pdf | wrong-mimetype | 6504 + pdf | wayback-error | 2907 + pdf | petabox-error | 363 + pdf | wayback-content-error | 242 + pdf | bad-redirect | 203 + pdf | redirects-exceeded | 135 + pdf | max-hops-exceeded | 19 + pdf | cdx-error | 15 + pdf | bad-gzip-encoding | 2 + xml | success | 6897 + xml | null-body | 2353 + xml | wrong-mimetype | 184 + xml | no-capture | 5 + (20 rows) + |