diff options
author | Bryan Newbold <bnewbold@archive.org> | 2023-01-04 21:24:49 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2023-01-04 21:28:26 -0800 |
commit | 38ba3d6a9c5138afbd82a1a5025f43e08bbba6a2 (patch) | |
tree | 189cfc0845a2dd8c44b546ddaaf9c4e92a5a570e /notes/2022-12_bulk_index.md | |
parent | 011f309d5a0056f633b88de2b33e5714548eba56 (diff) | |
download | fatcat-scholar-38ba3d6a9c5138afbd82a1a5025f43e08bbba6a2.tar.gz fatcat-scholar-38ba3d6a9c5138afbd82a1a5025f43e08bbba6a2.zip |
commit old notes
Diffstat (limited to 'notes/2022-12_bulk_index.md')
-rw-r--r-- | notes/2022-12_bulk_index.md | 72 |
1 files changed, 72 insertions, 0 deletions
diff --git a/notes/2022-12_bulk_index.md b/notes/2022-12_bulk_index.md new file mode 100644 index 0000000..b2c5cac --- /dev/null +++ b/notes/2022-12_bulk_index.md @@ -0,0 +1,72 @@ + +Running through a full end-to-end re-indexing. + + +## Fatcat Metadata Dumps + +Run following fatcat notes (elsewhere). + +Download to working machine: + + export JOBDIR=/kubwa/fatcat/2022-11-24 + mkdir -p $JOBDIR + cd $JOBDIR + wget -c https://archive.org/download/fatcat_bulk_exports_2022-11-24/release_export_expanded.json.gz + +## Microfilm + +Working directory: `aitio:/fast/fatcat-scholar`. + +Pulled latest git (`00d80752b7d83ae5a165540fbad641ddfc78b5f3`), and ran `make +dep`. + +Run: + + TODAY=2022-12-08 make issue-db + +Then, the SIM dump job, in parallel: + + export JOBDIR=/kubwa/scholar/2022-12-08 + mkdir -p $JOBDIR + pipenv shell + python -m fatcat_scholar.sim_pipeline run_print_issues \ + | shuf \ + | parallel -j16 --colsep "\t" python -m fatcat_scholar.sim_pipeline run_fetch_issue {1} {2} \ + | pv -l \ + | pigz \ + > $JOBDIR/sim_intermediate.2022-12-08.json.gz + => 45.4M 42:09:42 [ 298 /s] + +TODO: there were some old publications that should not be included... gazetteer? registers? + "Daily Gazetteer" (sim_daily-gazetteer) + +## Works Bulk Fetch + +First split up the release dump into chunks: + + export JOBDIR=/kubwa/scholar/2022-12-08 + mkdir -p $JOBDIR + cd $JOBDIR + zcat /kubwa/fatcat/2022-11-24/release_export_expanded.json.gz | split --lines 8000000 - release_export_expanded.split_ -d --additional-suffix .json + => done + +Note: more shards this time around (up to 23, not 21). + +Starting the below commands on 2022-12-21. + + export JOBDIR=/kubwa/scholar/2022-12-08 + cd /fast/fatcat-scholar + pipenv shell + export TMPDIR=/sandcrawler-db/tmp + # possibly re-export JOBDIR from above? + + # fetch + set -u -o pipefail + for SHARD in {00..23}; do + cat $JOBDIR/release_export_expanded.split_$SHARD.json \ + | parallel -j8 --line-buffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.work_pipeline run_releases \ + | pv -l \ + | pigz \ + > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP \ + && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz + done |