aboutsummaryrefslogtreecommitdiffstats
path: root/extra/dblp/prep_metadata.sh
diff options
context:
space:
mode:
authorbnewbold <bnewbold@archive.org>2022-07-25 21:09:25 +0000
committerbnewbold <bnewbold@archive.org>2022-07-25 21:09:25 +0000
commit5ecf72cbb488a9a50eb869ea55b4c2bfc1440731 (patch)
tree88b2a3a2ad2919cbef4f6acfdd5b986bda0baa72 /extra/dblp/prep_metadata.sh
parentb3eddfc398129f2fdcf4737849d436327a67a74a (diff)
parentb12d4f0bde96bfe39df1cc94a993da4b25e53304 (diff)
downloadfatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.tar.gz
fatcat-5ecf72cbb488a9a50eb869ea55b4c2bfc1440731.zip
Merge branch 'bnewbold-dblp-iteration' into 'master'
dblp import iteration See merge request webgroup/fatcat!141
Diffstat (limited to 'extra/dblp/prep_metadata.sh')
-rwxr-xr-xextra/dblp/prep_metadata.sh48
1 files changed, 48 insertions, 0 deletions
diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh
new file mode 100755
index 00000000..21a50ab0
--- /dev/null
+++ b/extra/dblp/prep_metadata.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"
+
+set -e -u -o pipefail
+
+# ensure deps
+#alias fd=fdfind
+fd -h > /dev/null
+fatcat-cli -h > /dev/null
+pipenv -h > /dev/null
+
+# ensure pipenv is ready
+pipenv install
+pipenv run true
+
+
+wget -c 'https://dblp.org/xml/dblp.dtd'
+wget -c 'https://dblp.org/xml/dblp.xml.gz'
+
+zcat dblp.xml.gz > dblp.xml
+
+cd ../../python
+pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json
+
+cd ../extra/dblp/
+
+cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt
+
+mkdir -p journals
+mkdir -p conf
+mkdir -p series
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+# clean up any failed/empty files, then re-run the above parallel/wget command
+find . -empty -type f -delete
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+find . -empty -type f -delete
+
+fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv
+
+cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz