#!/usr/bin/env bash # run this as 'fatcat' user on a production machine #export FATCAT_API_HOST="https://api.fatcat.wiki/v0" set -e -u -o pipefail # ensure deps #alias fd=fdfind fd -h > /dev/null fatcat-cli -h > /dev/null pipenv -h > /dev/null # ensure pipenv is ready pipenv install pipenv run true wget -c 'https://dblp.org/xml/dblp.dtd' wget -c 'https://dblp.org/xml/dblp.xml.gz' zcat dblp.xml.gz > dblp.xml cd ../../python pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json cd ../extra/dblp/ cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt mkdir -p journals mkdir -p conf mkdir -p series shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html # clean up any failed/empty files, then re-run the above parallel/wget command find . -empty -type f -delete shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html find . -empty -type f -delete fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz