## Bulk Fetch

Implemented HTTP sessions (persistent connections) for postgrest updates,
which could be a significant performance improvement.

    export JOBDIR=/kubwa/scholar/2021-12-08
    mkdir -p $JOBDIR
    cd $JOBDIR
    zcat /kubwa/fatcat/2021-12-01/release_export_expanded.json.gz | split --lines 8000000 - release_export_expanded.split_ -d --additional-suffix .json

    cd /fast/fatcat-scholar
    git pull
    pipenv shell
    export TMPDIR=/sandcrawler-db/tmp
    # possibly re-export JOBDIR from above?

    # fetch
    set -u -o pipefail
    for SHARD in {00..21}; do
        cat $JOBDIR/release_export_expanded.split_$SHARD.json \
            | parallel -j8 --line-buffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.work_pipeline run_releases \
            | pv -l \
            | pigz \
            > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP \
            && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz
    done

    # dump refs
    set -u -o pipefail
    for SHARD in {00..21}; do
        zcat $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz \
            | pv -l \
            | parallel -j12 --linebuffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.transform run_refs \
            | pigz \
            > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP \
            && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz
    done

This entire process took almost three weeks, from 2021-12-09 to 2021-12-28.
There were some delays (a datacenter power outage) which cost a couple of
days. Reference dumping took about two days; the fetching probably could have
completed in 14 days if it had run smoothly (no interruptions). Overall not
bad though. This shard size seems to work well.

## Upload to archive.org

    export JOBDIR=/kubwa/scholar/2021-12-08
    export BASENAME=scholar_corpus_bundle_2021-12-08
    for SHARD in {00..21}; do
        ia upload ${BASENAME}_split-${SHARD} $JOBDIR/fatcat_scholar_work_fulltext.split_${SHARD}.json.gz -m collection:"scholarly-tdm" --checksum
    done

    ia upload scholar_corpus_refs_2021-12-08 fatcat_scholar_work_fulltext.split_*.refs.json.gz -m collection:"scholarly-tdm" --checksum

## Indexing (including SIM pages)

Where and how are we going to index? The total size of the new scholar index
is estimated at 2.5 TByte (the current index is 2.3 TByte). Remember that we
split scholar across `svc500` and `svc097`: `svc500` has the scholar primary
shards and `svc097` has the replica shards.

One proposal is to drop the replicas from `svc097` and start indexing there;
the machine would then have no other indices, so disruption would be minimal.
We might also point the load balancer at `svc500` as the primary.
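
Dropping the replicas should just be an index settings change. A minimal
sketch, assuming the replica shards on `svc097` belong to the current
`scholar_fulltext_v01_20210128` index:

    # assumption: the replicas to drop are those of the current (old) index
    http put ":9200/scholar_fulltext_v01_20210128/_settings" index.number_of_replicas=0

That would free disk on `svc097` before shards of the new index get allocated
there.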

Steps:

- stop `scholar-index-docs-worker@*` on `svc097`
- update haproxy config to have `svc500` as scholar primary, `svc097` as backup
- create scholar indexes

Running these commands on `wbgrp-svc097`:

    http put ":9200/scholar_fulltext_v01_20211208?include_type_name=true" < schema/scholar_fulltext.v01.json

    http put ":9200/scholar_fulltext_v01_20211208/_settings" index.routing.allocation.include._name=wbgrp-svc097

    # first SIM pages
    ssh aitio.us.archive.org cat /kubwa/scholar/2021-12-01/sim_intermediate.2021-12-01.json.gz \
        | gunzip \
        | sudo -u fatcat parallel -j8 --compress --tmpdir /srv/tmp/ --line-buffer --round-robin --pipe pipenv run python -m fatcat_scholar.transform run_transform \
        | pv -l \
        | esbulk -verbose -size 100 -id key -w 4 -index scholar_fulltext_v01_20211208 -type _doc \
        2> /tmp/error.txt 1> /tmp/output.txt
    => 41.8M 16:37:13 [ 698 /s]

    # then works
    ssh aitio.us.archive.org cat /kubwa/scholar/2021-12-08/fatcat_scholar_work_fulltext.split_{00..21}.json.gz \
        | gunzip \
        | sudo -u fatcat parallel -j8 --compress --tmpdir /srv/tmp/ --line-buffer --round-robin --pipe pipenv run python -m fatcat_scholar.transform run_transform \
        | pv -l \
        | esbulk -verbose -size 100 -id key -w 4 -index scholar_fulltext_v01_20211208 -type _doc \
        2> /tmp/error.txt 1> /tmp/output.txt

Part way through there was a power outage, and indexing had to be resumed:

    # 2022/01/14 17:07:59 indexing failed with 503 Service Unavailable: {"error":{"root_cause":[{"type":"cluster_block_exception","reason":"blocked by: [SERVICE_UNAVAILABLE/2/no master];"}],"type":"cluster_block_exception","reason":"blocked by: [SERVICE_UNAVAILABLE/2/no master];"},"status":503}
    # 112M 72:11:17 [ 433 /s]
    # this means 00..13 finished successfully

    ssh aitio.us.archive.org cat /kubwa/scholar/2021-12-08/fatcat_scholar_work_fulltext.split_{14..21}.json.gz \
        | gunzip \
        | sudo -u fatcat parallel -j8 --compress --tmpdir /srv/tmp/ --line-buffer --round-robin --pipe pipenv run python -m fatcat_scholar.transform run_transform \
        | pv -l \
        | esbulk -verbose -size 100 -id key -w 4 -index scholar_fulltext_v01_20211208 -type _doc \
        2> /tmp/error2.txt 1> /tmp/output2.txt
    # ... following the above ...
    # 61.0M 40:51:18 [ 414 /s]

Changes while indexing:

- the SIM indexing command failed at just 70k docs the first time, because of
  an issue with multiple publishers in item metadata. Updated the transform,
  ran a million pages through (to /dev/null) as a test, then restarted
- the power outage happened, as noted above

Index size at this point (bulk indexing complete):

    http get :9200/_cat/indices | rg scholar
    green open scholar_fulltext_v01_20210128 OGyck2ppQhaTh6N-u87xSg 12 0 183923045 19144541 2.3tb 2.3tb
    green open scholar_fulltext_v01_20211208 _u2PE-oTRcSktI5mxDrQPg 12 0 212298463   553388   2tb   2tb

## Before/After Stats

Brainstorming:

- total, works, and `sim_pages` docs in the index
- for works, breakdown of access types (SIM, web/archive.org)
- total public domain and public domain with access (new pre-1927 wall)
- sitemap size

Note: added commas to the output below, and summarized as "old" / "new" for
the two indices. Remember that the "old" index at this point had a couple of
months of additional daily index updates.
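
Presumably each old/new pair below comes from running the same `_count` query
against both index names (`scholar_fulltext_v01_20210128` as "old",
`scholar_fulltext_v01_20211208` as "new"), even where the command only shows
one of them. A minimal sketch for pulling one pair side by side (the
`doc_type:work` query is just an example):

    # illustrative only: run the same query against the old and new index
    for INDEX in scholar_fulltext_v01_20210128 scholar_fulltext_v01_20211208; do
        echo -n "$INDEX: "
        http get ":9200/$INDEX/_count" q=="doc_type:work" | jq .count
    done

The actual queries and (comma-formatted) counts follow.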

    http get :9200/scholar_fulltext_v01_20211208/_count | jq .count
    old: 183,923,045
    new: 212,298,463
    +28,375,418

    http get :9200/scholar_fulltext_v01_20210128/_count q=="doc_type:sim_page" | jq .count
    old: 10,448,586
    new: 40,559,249

    http get :9200/scholar_fulltext_v01_20210128/_count q=="doc_type:work" | jq .count
    old: 173,474,459
    new: 171,739,214

    http get :9200/scholar_fulltext_v01_20210128/_count q=="fulltext.access_type:*" | jq .count
    old: 44,357,232
    new: 74,840,284

    http get :9200/scholar_fulltext_v01_20210128/_count q=="fulltext.access_type:wayback" | jq .count
    old: 31,693,599
    new: 30,926,112
    new (final): 31,832,082

    http get :9200/scholar_fulltext_v01_20210128/_count q=="fulltext.access_type:ia_sim AND doc_type:work" | jq .count
    old: 51,118
    new: 1,189,974

    http get :9200/scholar_fulltext_v01_20210128/_count q=="fulltext.access_type:* AND year:<=1925" | jq .count
    old: 3,707,248
    new: 18,361,450

    http get :9200/scholar_fulltext_v01_20210128/_count q=="fulltext.access_type:* AND year:<=1927" | jq .count
    old: 3,837,502
    new: 18,850,927

    http get :9200/scholar_fulltext_v01_20211208/_count q=="fulltext.access_type:* AND doc_type:work AND year:<=1925" | jq .count
    old: 2,261,426
    new: 2,222,627

    http get :9200/scholar_fulltext_v01_20211208/_count q=="fulltext.access_type:* AND doc_type:work AND year:<=1927" | jq .count
    old: 2,288,760
    new: 2,268,425

    http get :9200/scholar_fulltext_v01_20210128/_count q=="fulltext.access_type:* AND doc_type:work" | jq .count
    old: 33,908,646
    new (final): 35,190,311

Sitemap size:

    cat sitemap-access-00*.txt | wc -l
    2021-06-23: 17,900,935
    2022-01-20: 23.9M 6:27:27 [1.03k/s]

    works 2022-01-20: 23.9M 6:28:50 [1.03k/s]