diff options
author | bnewbold <bnewbold@archive.org> | 2022-07-25 21:09:25 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2022-07-25 21:09:25 +0000 |
commit | 5ecf72cbb488a9a50eb869ea55b4c2bfc1440731 (patch) | |
tree | 88b2a3a2ad2919cbef4f6acfdd5b986bda0baa72 /extra/dblp/prep_metadata.sh | |
parent | b3eddfc398129f2fdcf4737849d436327a67a74a (diff) | |
parent | b12d4f0bde96bfe39df1cc94a993da4b25e53304 (diff) | |
download | fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.tar.gz fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.zip |
Merge branch 'bnewbold-dblp-iteration' into 'master'
dblp import iteration
See merge request webgroup/fatcat!141
Diffstat (limited to 'extra/dblp/prep_metadata.sh')
-rwxr-xr-x | extra/dblp/prep_metadata.sh | 48 |
1 files changed, 48 insertions, 0 deletions
#!/usr/bin/env bash
#
# extra/dblp/prep_metadata.sh (new file, mode 100755)
#
# Downloads the dblp XML dump and derives the intermediate files used for a
# dblp import:
#
#   dblp.dtd, dblp.xml.gz, dblp.xml          raw dblp dump
#   dblp_releases_partial.json               release JSON dumped by fatcat_import.py
#   prefix_list.txt                          unique journals/conf/series prefixes
#   journals/ conf/ series/                  per-prefix dblp index.html pages
#   dblp_container_meta.json                 output of dblp_html_extract.py
#   existing_dblp_containers.tsv             (dblp_prefix, ident) from fatcat search
#   dblp_sandcrawler_ingest_requests.json.gz output of dblp2ingestrequest.py
#
# Run this as the 'fatcat' user on a production machine, from the extra/dblp/
# directory of the checkout: the relative paths below (../../python,
# ../extra/dblp/) depend on that working directory.
#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"

set -e -u -o pipefail

# ensure deps: every external tool used below must be on PATH
#alias fd=fdfind
for tool in fd fatcat-cli pipenv wget zcat jq rg pv parallel shuf gzip; do
  if ! command -v "$tool" > /dev/null; then
    printf 'missing required tool: %s\n' "$tool" >&2
    exit 1
  fi
done

# ensure pipenv is ready
pipenv install
pipenv run true

# -c resumes a partial download instead of starting over
wget -c 'https://dblp.org/xml/dblp.dtd'
wget -c 'https://dblp.org/xml/dblp.xml.gz'

zcat dblp.xml.gz > dblp.xml

# dump release entities as JSON using the importer in ../../python
cd ../../python
pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode \
  | pv -l > ../extra/dblp/dblp_releases_partial.json

cd ../extra/dblp/

# unique list of container prefixes (journals/..., conf/..., series/...)
jq -r ._dblp_prefix dblp_releases_partial.json \
  | grep -v '^null' \
  | rg '^(journals|conf|series)' \
  | sort -u > prefix_list.txt

mkdir -p journals
mkdir -p conf
mkdir -p series

# Fetch the dblp index page for every prefix, one request at a time, in random
# order. With -nc, pages that already exist on disk are not downloaded again.
fetch_prefix_pages() {
  shuf prefix_list.txt \
    | pv -l \
    | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O '{}.html'
}

# Remove zero-length files left behind by failed downloads. Only the download
# directories are touched, so unrelated empty files are never deleted.
remove_empty_pages() {
  find conf journals series -empty -type f -delete
}

# parallel exits non-zero when any wget job does (a failed download; possibly
# also a page skipped by -nc -- not verified here). With 'set -e -o pipefail'
# that would abort the script before the clean-up/retry below, so per-URL
# failures are tolerated in both passes and reported afterwards instead.
fetch_prefix_pages || true

# clean up any failed/empty files, then re-run the above parallel/wget command
remove_empty_pages

fetch_prefix_pages || true

remove_empty_pages

# report how many prefixes still have no (non-empty) index page
missing=0
while IFS= read -r prefix; do
  [[ -s "${prefix}.html" ]] || missing=$((missing + 1))
done < prefix_list.txt
printf 'dblp index pages still missing after retry: %d\n' "$missing" >&2

fd -I html conf/ journals/ series/ \
  | pipenv run ./dblp_html_extract.py \
  | pv -l > dblp_container_meta.json

# the query is quoted so the shell never glob-expands 'dblp_prefix:*'
fatcat-cli search containers 'dblp_prefix:*' -n 0 --index-json \
  | jq -r '[.dblp_prefix, .ident] | @tsv' \
  | pv -l > existing_dblp_containers.tsv

pipenv run ./dblp2ingestrequest.py - < dblp_releases_partial.json \
  | pv -l \
  | gzip > dblp_sandcrawler_ingest_requests.json.gz