diff options
Diffstat (limited to 'notes')
-rw-r--r-- | notes/ingest/2020-10_unpaywall.md | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md index d474176..7b97d08 100644 --- a/notes/ingest/2020-10_unpaywall.md +++ b/notes/ingest/2020-10_unpaywall.md @@ -36,7 +36,117 @@ Proposed changes this time around: AND (ingest_file_result.status IS NULL OR ingest_file_result.status = 'no-capture') ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json'; + => COPY 4216339 ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json + => 4.22M 0:02:48 [ 25k/s] +Start small, to test no-capture behavior: + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + +`no-capture` change looks good. Enqueue the whole batch: + + cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + + +## Check Pre-Crawl Status + + SELECT ingest_file_result.status, COUNT(*) + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + GROUP BY status + ORDER BY COUNT DESC + LIMIT 20; + + + status | count + -------------------------+---------- + success | 23661282 + no-capture | 3015447 + no-pdf-link | 2302102 + redirect-loop | 1542566 + terminal-bad-status | 1044676 + wrong-mimetype | 114315 + link-loop | 36358 + cdx-error | 20150 + null-body | 14513 + wayback-error | 13644 + gateway-timeout | 3776 + spn2-cdx-lookup-failure | 1260 + petabox-error | 1171 + redirects-exceeded | 752 + invalid-host-resolution | 464 + spn2-error | 147 + bad-redirect | 131 + spn2-error:job-failed | 91 + wayback-content-error | 45 + timeout | 19 + (20 rows) + +## Dump Seedlist + +Dump rows: + + COPY ( + SELECT row_to_json(t1.*) + FROM ( + SELECT ingest_request.*, ingest_file_result as result + FROM ingest_request + LEFT JOIN ingest_file_result + ON ingest_file_result.ingest_type = ingest_request.ingest_type + AND ingest_file_result.base_url = ingest_request.base_url + WHERE + ingest_request.ingest_type = 'pdf' + AND ingest_request.link_source = 'unpaywall' + AND (ingest_file_result.status = 'no-capture' + OR ingest_file_result.status = 'cdx-error' + OR ingest_file_result.status = 'wayback-error' + OR ingest_file_result.status = 'gateway-timeout' + OR ingest_file_result.status = 'spn2-cdx-lookup-failure' + ) + AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%' + AND ingest_request.base_url NOT LIKE '%pubs.acs.org%' + AND ingest_request.base_url NOT LIKE '%ahajournals.org%' + AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_request.base_url NOT 
LIKE '%aip.scitation.org%' + AND ingest_request.base_url NOT LIKE '%academic.oup.com%' + AND ingest_request.base_url NOT LIKE '%tandfonline.com%' + AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%' + AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%' + AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%' + AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%' + AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%' + AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%' + AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%' + ) t1 + ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json'; + => 2,936,404 + + # TODO: in the future also exclude "www.archive.org" + +Prep ingest requests (for post-crawl use): + + ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json + +And actually dump seedlist(s): + + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v ^null$ | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt + cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt + + wc -l unpaywall_seedlist_2020-11-02.*.txt + 2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt + 2713866 unpaywall_seedlist_2020-11-02.url.txt + +With things like jsessionid, suspect that crawling just the terminal URLs is +going to work better than both full and terminal. + +Finding a fraction of `no-capture` which have partial/stub URLs as terminal. 
+ +TODO: investigate the scale of partial/stub `terminal_url` values (e.g., not HTTP/S or FTP). |