diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-19 12:29:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2022-07-19 12:29:37 -0700 |
commit | 519733b77832ccbf97491a794e7f10884e39acdb (patch) | |
tree | 9fef34e691b4370e372cdf9d01457017835defdf /extra/dblp/prep_metadata.sh | |
parent | 94cce3ebe325f7601feee7dbe7ab8b24aa2492ee (diff) | |
download | fatcat-519733b77832ccbf97491a794e7f10884e39acdb.tar.gz fatcat-519733b77832ccbf97491a794e7f10884e39acdb.zip |
dblp: updated ingest pipeline
Diffstat (limited to 'extra/dblp/prep_metadata.sh')
-rwxr-xr-x | extra/dblp/prep_metadata.sh | 48 |
1 files changed, 48 insertions, 0 deletions
#!/usr/bin/env bash
#
# extra/dblp/prep_metadata.sh (mode 100755)
#
# Prepares dblp metadata for ingest:
#   1. downloads the dblp XML dump and DTD
#   2. dumps partial release JSON via fatcat_import.py
#   3. scrapes the dblp index page for every journals/conf/series prefix
#   4. extracts container metadata from the scraped HTML
#   5. dumps existing dblp containers from the fatcat search index
#   6. generates sandcrawler ingest requests
#
# Expects to be started from the extra/dblp/ directory (all paths below are
# relative to it).
#
# run this as 'fatcat' user on a production machine
#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"

set -e -u -o pipefail

# ensure deps
#alias fd=fdfind
fd -h > /dev/null
fatcat-cli -h > /dev/null
pipenv -h > /dev/null

# Also verify the remaining command-line tools up front, so that a missing
# tool is reported before the (large) dblp download instead of halfway through.
for tool in wget zcat jq grep rg sort shuf pv parallel find gzip; do
  if ! command -v "$tool" > /dev/null; then
    printf 'missing required tool: %s\n' "$tool" >&2
    exit 1
  fi
done

# ensure pipenv is ready
pipenv install
pipenv run true

# Download every dblp index page listed in prefix_list.txt into
# <prefix>.html, then delete empty files left behind by failed downloads.
#
# Individual wget failures (e.g. HTTP 404 for a prefix without an index page)
# are expected. GNU parallel reports failed jobs through its exit status,
# which under 'set -e -o pipefail' would abort the whole script before the
# cleanup and retry passes could run, so the pipeline status is recorded and
# reported as a warning instead.
# NOTE(review): wget appears to also exit non-zero when -nc finds the -O
# target already present, which would make the retry pass "fail" for every
# page fetched earlier -- confirm against the installed wget version.
fetch_index_pages() {
  local rv=0
  shuf prefix_list.txt \
    | pv -l \
    | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html \
    || rv=$?
  if (( rv != 0 )); then
    printf 'warning: index page fetch pipeline exited with status %d; continuing\n' "$rv" >&2
  fi

  # clean up any failed/empty files; only the download directories are
  # touched, so unrelated empty files in this directory are left alone
  find conf/ journals/ series/ -empty -type f -delete
}

wget -c 'https://dblp.org/xml/dblp.dtd'
wget -c 'https://dblp.org/xml/dblp.xml.gz'

zcat dblp.xml.gz > dblp.xml

cd ../../python
pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode \
  | pv -l \
  > ../extra/dblp/dblp_releases_partial.json

cd ../extra/dblp/

# unique list of container prefixes (journals/..., conf/..., series/...)
jq -r ._dblp_prefix dblp_releases_partial.json \
  | grep -v '^null' \
  | rg '^(journals|conf|series)' \
  | sort -u \
  > prefix_list.txt

mkdir -p journals
mkdir -p conf
mkdir -p series

# first pass, then a second pass to retry whatever failed the first time
fetch_index_pages
fetch_index_pages

fd -I html conf/ journals/ series/ \
  | pipenv run ./dblp_html_extract.py \
  | pv -l \
  > dblp_container_meta.json

# the query is quoted so the shell never glob-expands the trailing '*'
fatcat-cli search containers 'dblp_prefix:*' -n 0 --index-json \
  | jq "[.dblp_prefix, .ident] | @tsv" -r \
  | pv -l \
  > existing_dblp_containers.tsv

pipenv run ./dblp2ingestrequest.py - < dblp_releases_partial.json \
  | pv -l \
  | gzip \
  > dblp_sandcrawler_ingest_requests.json.gz