diff options
Diffstat (limited to 'notes/ingest/2020-10_unpaywall.md')
-rw-r--r-- | notes/ingest/2020-10_unpaywall.md | 286 |
1 file changed, 286 insertions, 0 deletions
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md new file mode 100644 index 0000000..a991025 --- /dev/null +++ b/notes/ingest/2020-10_unpaywall.md @@ -0,0 +1,286 @@ + +New snapshot released 2020-10-09. Want to do a mostly straightforward +load/ingest/crawl. + +Proposed changes this time around: + +- have bulk ingest store missing URLs in a new sandcrawler-db for `no-capture` + status, and include those URLs in the heritrix3 crawl +- tweak heritrix3 config for additional PDF URL extraction patterns, + particularly to improve OJS yield + + +## Transform and Load + + # in sandcrawler pipenv on aitio + zcat /schnell/unpaywall/unpaywall_snapshot_2020-10-09T153852.jsonl.gz | ./scripts/unpaywall2ingestrequest.py - | pv -l > /grande/snapshots/unpaywall_snapshot_2020-10-09.ingest_request.json + => 28.3M 3:19:03 [2.37k/s] + + cat /grande/snapshots/unpaywall_snapshot_2020-04-27.ingest_request.json | pv -l | ./persist_tool.py ingest-request - + => 28.3M 1:11:29 [ 6.6k/s] + => Worker: Counter({'total': 28298500, 'insert-requests': 4119939, 'update-requests': 0}) + => JSON lines pushed: Counter({'total': 28298500, 'pushed': 28298500}) + +## Dump new URLs, Transform, Bulk Ingest + + COPY ( + SELECT row_to_json(ingest_request.*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + -- AND date(ingest_request.created) > '2020-10-09' + AND (ingest_file_result.status IS NULL + OR ingest_file_result.status = 'no-capture') + ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json'; + => COPY 4216339 + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json + => 4.22M 0:02:48 [ 25k/s] + +Start small, to test 
no-capture behavior: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +`no-capture` change looks good. Enqueue the whole batch: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 23661282 + no-capture | 3015447 + no-pdf-link | 2302102 + redirect-loop | 1542566 + terminal-bad-status | 1044676 + wrong-mimetype | 114315 + link-loop | 36358 + cdx-error | 20150 + null-body | 14513 + wayback-error | 13644 + gateway-timeout | 3776 + spn2-cdx-lookup-failure | 1260 + petabox-error | 1171 + redirects-exceeded | 752 + invalid-host-resolution | 464 + spn2-error | 147 + bad-redirect | 131 + spn2-error:job-failed | 91 + wayback-content-error | 45 + timeout | 19 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 
'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + ) t1 + ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json'; + => 2,936,404 + + # TODO: in the future also exclude "www.archive.org" + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json + +And actually dump seedlist(s): + + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt + + wc -l 
unpaywall_seedlist_2020-11-02.*.txt + 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt + 2713866 unpaywall_seedlist_2020-11-02.url.txt + +With things like jsessionid, suspect that crawling just the terminal URLs is +going to work better than both full and terminal. + +Finding a fraction of `no-capture` which have partial/stub URLs as terminal. + +TODO: investigate scale of partial/stub `terminal_url` (eg, not HTTP/S or FTP). + + +## Bulk Ingest and Status + +Note, removing archive.org links: + + cat /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json | rg -v www.archive.org | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +Overall status (checked 2020-12-08): + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + status | count + -------------------------+---------- + success | 25004559 + no-pdf-link | 2531841 + redirect-loop | 1671375 + terminal-bad-status | 1389463 + no-capture | 893880 + wrong-mimetype | 119332 + link-loop | 66508 + wayback-content-error | 30339 + cdx-error | 21790 + null-body | 20710 + wayback-error | 13976 + gateway-timeout | 3775 + petabox-error | 2420 + spn2-cdx-lookup-failure | 1218 + redirects-exceeded | 889 + invalid-host-resolution | 464 + bad-redirect | 147 + spn2-error | 112 + spn2-error:job-failed | 91 + timeout | 21 + (20 rows) + +Ingest stats broken down by publication stage: + + SELECT ingest_request.release_stage, ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + 
ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY release_stage, status + ORDER BY release_stage, COUNT DESC + LIMIT 100; + + + release_stage | status | count + ---------------+-------------------------------------+---------- + accepted | success | 1101090 + accepted | no-pdf-link | 28590 + accepted | redirect-loop | 10923 + accepted | no-capture | 9540 + accepted | terminal-bad-status | 6339 + accepted | cdx-error | 952 + accepted | wrong-mimetype | 447 + accepted | link-loop | 275 + accepted | wayback-error | 202 + accepted | petabox-error | 177 + accepted | redirects-exceeded | 122 + accepted | null-body | 27 + accepted | wayback-content-error | 14 + accepted | spn2-cdx-lookup-failure | 5 + accepted | gateway-timeout | 4 + accepted | bad-redirect | 1 + published | success | 18595278 + published | no-pdf-link | 2434935 + published | redirect-loop | 1364110 + published | terminal-bad-status | 1185328 + published | no-capture | 718792 + published | wrong-mimetype | 112923 + published | link-loop | 63874 + published | wayback-content-error | 30268 + published | cdx-error | 17302 + published | null-body | 15209 + published | wayback-error | 10782 + published | gateway-timeout | 1966 + published | petabox-error | 1611 + published | spn2-cdx-lookup-failure | 879 + published | redirects-exceeded | 760 + published | invalid-host-resolution | 453 + published | bad-redirect | 115 + published | spn2-error:job-failed | 77 + published | spn2-error | 75 + published | timeout | 21 + published | bad-gzip-encoding | 5 + published | spn2-error:soft-time-limit-exceeded | 4 + published | spn2-error:pending | 1 + published | blocked-cookie | 1 + published | | 1 + published | pending | 1 + submitted | success | 5308166 + submitted | redirect-loop | 296322 + submitted | terminal-bad-status | 197785 + submitted | no-capture | 165545 + submitted | no-pdf-link | 68274 + submitted | wrong-mimetype | 5962 + submitted | null-body | 5474 + submitted 
| cdx-error | 3536 + submitted | wayback-error | 2992 + submitted | link-loop | 2359 + submitted | gateway-timeout | 1805 + submitted | petabox-error | 632 + submitted | spn2-cdx-lookup-failure | 334 + submitted | wayback-content-error | 57 + submitted | spn2-error | 37 + submitted | bad-redirect | 31 + submitted | spn2-error:job-failed | 14 + submitted | | 12 + submitted | invalid-host-resolution | 11 + submitted | redirects-exceeded | 7 + submitted | spn2-error:soft-time-limit-exceeded | 5 + submitted | bad-gzip-encoding | 1 + submitted | skip-url-blocklist | 1 + | no-pdf-link | 42 + | success | 25 + | redirect-loop | 20 + | terminal-bad-status | 11 + | no-capture | 3 + (70 rows) |