aboutsummaryrefslogtreecommitdiffstats
path: root/extra/dblp/prep_metadata.sh
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 12:29:37 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 12:29:37 -0700
commit 519733b77832ccbf97491a794e7f10884e39acdb (patch)
tree 9fef34e691b4370e372cdf9d01457017835defdf /extra/dblp/prep_metadata.sh
parent 94cce3ebe325f7601feee7dbe7ab8b24aa2492ee (diff)
download fatcat-519733b77832ccbf97491a794e7f10884e39acdb.tar.gz
fatcat-519733b77832ccbf97491a794e7f10884e39acdb.zip
dblp: updated ingest pipeline
Diffstat (limited to 'extra/dblp/prep_metadata.sh')
-rwxr-xr-x extra/dblp/prep_metadata.sh 48
1 files changed, 48 insertions, 0 deletions
diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh
new file mode 100755
index 00000000..21a50ab0
--- /dev/null
+++ b/extra/dblp/prep_metadata.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"
+
+set -e -u -o pipefail
+
+# ensure deps
+#alias fd=fdfind
+fd -h > /dev/null
+fatcat-cli -h > /dev/null
+pipenv -h > /dev/null
+
+# ensure pipenv is ready
+pipenv install
+pipenv run true
+
+
+wget -c 'https://dblp.org/xml/dblp.dtd'
+wget -c 'https://dblp.org/xml/dblp.xml.gz'
+
+zcat dblp.xml.gz > dblp.xml
+
+cd ../../python
+pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json
+
+cd ../extra/dblp/
+
+cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt
+
+mkdir -p journals
+mkdir -p conf
+mkdir -p series
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+# clean up any failed/empty files, then re-run the above parallel/wget command
+find . -empty -type f -delete
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+find . -empty -type f -delete
+
+fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv
+
+cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz