 kafka/topics.md                   |  20 +
 notes/ingest/2020-10_unpaywall.md | 110 +
 2 files changed, 130 insertions(+), 0 deletions(-)
diff --git a/kafka/topics.md b/kafka/topics.md
index fa1bd6d..06faf8e 100644
--- a/kafka/topics.md
+++ b/kafka/topics.md
@@ -122,6 +122,22 @@ retention (on both a size and time basis).
     fatcat-ENV.file-updates
         => key: fcid
         => 4x partitions
 
+    fatcat-ENV.work-ident-updates
+        => work identifiers, when updated and needing re-indexing (eg, in scholar)
+        => 6x partitions
+        => key: doc ident ("work_{ident}")
+        => key compaction possible; long retention
+
+    scholar-ENV.sim-updates
+        => 6x partitions
+        => key: "sim_item_{}"
+        => key compaction possible; long retention
+    scholar-ENV.update-docs
+        => 12x partitions
+        => key: scholar doc identifier
+        => gzip compression
+        => key compaction possible
+        => short time-based retention (2 months?)
 
 ### Deprecated/Unused Topics
@@ -169,6 +185,7 @@ exists`; this seems safe, and the settings won't be over-ridden.
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.work-updates
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.file-updates
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 4 --topic fatcat-qa.container-updates
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic fatcat-qa.work-ident-updates
 
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-crossref
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 8 --topic fatcat-qa.api-datacite --config cleanup.policy=compact
@@ -187,6 +204,9 @@ exists`; this seems safe, and the settings won't be over-ridden.
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic sandcrawler-qa.pdf-thumbnail-180px-jpg --config cleanup.policy=compact
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 24 --topic sandcrawler-qa.unextracted
 
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic scholar-qa.sim-updates
+    ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 12 --topic scholar-qa.update-docs --config compression.type=gzip --config cleanup.policy=compact --config retention.ms=7889400000
+
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.xml-doc --config compression.type=gzip --config cleanup.policy=compact
     ./kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 2 --partitions 6 --topic sandcrawler-qa.html-teixml --config compression.type=gzip --config cleanup.policy=compact
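Aside (not from the patch itself): `retention.ms=7889400000` on `scholar-qa.update-docs` works out to ~91.3 days, i.e. one quarter year, a bit longer than the "2 months?" in the topic description. A quick sketch to check the arithmetic and inspect the applied config, assuming the same `--zookeeper`-era tooling as the commands above:

    # retention.ms is in milliseconds; convert to days
    echo '7889400000 / 1000 / 60 / 60 / 24' | bc -l
    # => ~91.31 days (about 3 months)

    # verify the per-topic config overrides actually applied
    ./kafka-topics.sh --describe --zookeeper localhost:2181 --topic scholar-qa.update-docs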
diff --git a/notes/ingest/2020-10_unpaywall.md b/notes/ingest/2020-10_unpaywall.md
index d474176..7b97d08 100644
--- a/notes/ingest/2020-10_unpaywall.md
+++ b/notes/ingest/2020-10_unpaywall.md
@@ -36,7 +36,117 @@ Proposed changes this time around:
         AND (ingest_file_result.status IS NULL
             OR ingest_file_result.status = 'no-capture')
     ) TO '/grande/snapshots/unpaywall_noingest_2020-10-09.rows.json';
+    => COPY 4216339
 
     ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_noingest_2020-10-09.rows.json | pv -l | shuf > /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json
+    => 4.22M 0:02:48 [ 25k/s]
+
+Start small, to test no-capture behavior:
+
+    cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | head -n1000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+`no-capture` change looks good. Enqueue the whole batch:
+
+    cat /grande/snapshots/unpaywall_noingest_2020-10-09.ingest_request.json | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc263.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+
+
+## Check Pre-Crawl Status
+
+    SELECT ingest_file_result.status, COUNT(*)
+    FROM ingest_request
+    LEFT JOIN ingest_file_result
+        ON ingest_file_result.ingest_type = ingest_request.ingest_type
+        AND ingest_file_result.base_url = ingest_request.base_url
+    WHERE
+        ingest_request.ingest_type = 'pdf'
+        AND ingest_request.link_source = 'unpaywall'
+    GROUP BY status
+    ORDER BY COUNT DESC
+    LIMIT 20;
+
+              status         |  count
+    -------------------------+----------
+     success                 | 23661282
+     no-capture              |  3015447
+     no-pdf-link             |  2302102
+     redirect-loop           |  1542566
+     terminal-bad-status     |  1044676
+     wrong-mimetype          |   114315
+     link-loop               |    36358
+     cdx-error               |    20150
+     null-body               |    14513
+     wayback-error           |    13644
+     gateway-timeout         |     3776
+     spn2-cdx-lookup-failure |     1260
+     petabox-error           |     1171
+     redirects-exceeded      |      752
+     invalid-host-resolution |      464
+     spn2-error              |      147
+     bad-redirect            |      131
+     spn2-error:job-failed   |       91
+     wayback-content-error   |       45
+     timeout                 |       19
+    (20 rows)
+
+## Dump Seedlist
+
+Dump rows:
+
+    COPY (
+        SELECT row_to_json(t1.*)
+        FROM (
+            SELECT ingest_request.*, ingest_file_result as result
+            FROM ingest_request
+            LEFT JOIN ingest_file_result
+                ON ingest_file_result.ingest_type = ingest_request.ingest_type
+                AND ingest_file_result.base_url = ingest_request.base_url
+            WHERE
+                ingest_request.ingest_type = 'pdf'
+                AND ingest_request.link_source = 'unpaywall'
+                AND (ingest_file_result.status = 'no-capture'
+                    OR ingest_file_result.status = 'cdx-error'
+                    OR ingest_file_result.status = 'wayback-error'
+                    OR ingest_file_result.status = 'gateway-timeout'
+                    OR ingest_file_result.status = 'spn2-cdx-lookup-failure'
+                )
+                AND ingest_request.base_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_request.base_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_request.base_url NOT LIKE '%ahajournals.org%'
+                AND ingest_request.base_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_request.base_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_request.base_url NOT LIKE '%academic.oup.com%'
+                AND ingest_request.base_url NOT LIKE '%tandfonline.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%journals.sagepub.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%pubs.acs.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%ahajournals.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%www.journal.csj.jp%'
+                AND ingest_file_result.terminal_url NOT LIKE '%aip.scitation.org%'
+                AND ingest_file_result.terminal_url NOT LIKE '%academic.oup.com%'
+                AND ingest_file_result.terminal_url NOT LIKE '%tandfonline.com%'
+        ) t1
+    ) TO '/grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json';
+    => 2,936,404
+
+    # TODO: in the future also exclude "www.archive.org"
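As a sanity check on the dump at this point (a sketch, not in the original notes; the `.result.status` field comes from the `ingest_file_result as result` alias in the query above):

    # status distribution in the seedlist dump; should be mostly no-capture
    cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json \
        | jq -r .result.status \
        | sort | uniq -c | sort -rn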
+
+Prep ingest requests (for post-crawl use):
+
+    ./scripts/ingestrequest_row2json.py /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | pv -l > /grande/snapshots/unpaywall_crawl_ingest_2020-11-02.json
+
+And actually dump seedlist(s):
+
+    cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.url.txt
+    cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg '"no-capture"' | jq -r .result.terminal_url | rg -v '^null$' | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.terminal_url.txt
+    cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json | rg -v '"no-capture"' | jq -r .base_url | sort -u -S 4G > /grande/snapshots/unpaywall_seedlist_2020-11-02.no_terminal_url.txt
+
+    wc -l unpaywall_seedlist_2020-11-02.*.txt
+     2701178 unpaywall_seedlist_2020-11-02.terminal_url.txt
+     2713866 unpaywall_seedlist_2020-11-02.url.txt
+
+With things like jsessionid in URLs, I suspect that crawling just the terminal
+URLs is going to work better than crawling both the base and terminal URLs.
+
+A fraction of the `no-capture` rows have partial/stub URLs as the terminal URL.
+
+TODO: investigate the scale of partial/stub `terminal_url` values (eg, not HTTP/S or FTP).
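One way that last TODO could be scoped out (a sketch; not something actually run as part of these notes):

    # count partial/stub terminal URLs: anything not absolute HTTP/S or FTP
    cat /grande/snapshots/unpaywall_seedlist_2020-11-02.rows.json \
        | jq -r .result.terminal_url \
        | rg -v '^null$' \
        | rg -v '^(https?|ftp)://' \
        | sort -u -S 4G \
        | wc -l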