#!/usr/bin/env bash

# run this as 'fatcat' user on a production machine
#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"

set -e -u -o pipefail

# ensure deps
# on debian/ubuntu, 'fd' is packaged as 'fdfind'; symlink or rename it to 'fd'
# (shell aliases are not expanded in non-interactive scripts like this one)
fd -h > /dev/null
fatcat-cli -h > /dev/null
pipenv -h > /dev/null
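
# the steps below also shell out to these tools; cheap version checks so the
# script fails fast if any of them are missing
jq --version > /dev/null
rg --version > /dev/null
pv --version > /dev/null
parallel --version > /dev/null
wget --version > /dev/null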

# ensure pipenv is ready
pipenv install
pipenv run true


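# download the dblp DTD (referenced by the dump for entity definitions) and the
# full XML dump; wget -c resumes partial downloads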
wget -c 'https://dblp.org/xml/dblp.dtd'
wget -c 'https://dblp.org/xml/dblp.xml.gz'

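# decompress; the importer below is pointed at the plain dblp.xml file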
zcat dblp.xml.gz > dblp.xml

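# run the fatcat dblp-release importer in --dump-json-mode, which (as the name
# suggests) dumps release entities as JSON lines for the later steps here
# rather than importing them directly; pv -l shows line-count progress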
cd ../../python
pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json

cd ../extra/dblp/

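# build a deduplicated list of dblp key prefixes (journals/*, conf/*, series/*)
# from the release dump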
cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt

mkdir -p journals
mkdir -p conf
mkdir -p series

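# fetch the dblp index page for each prefix (e.g. journals/cacm ->
# journals/cacm.html); shuf randomizes the order, -j1 keeps it to one request
# at a time, and -nc skips files already downloaded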
# "|| true": individual fetches may fail (e.g. missing pages or timeouts); don't
# let set -e abort the script here, since failed/empty files are cleaned up and
# re-fetched below
shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html || true

# clean up any failed/empty files, then re-run the above parallel/wget command
find . -empty -type f -delete

shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html || true

find . -empty -type f -delete

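# extract container-level metadata from the downloaded dblp HTML pages
# (file paths are passed on stdin; output is JSON lines)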
fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json

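# list containers already in fatcat that have a dblp_prefix set, as
# (prefix, ident) TSV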
fatcat-cli search containers 'dblp_prefix:*' -n 0 --index-json | jq '[.dblp_prefix, .ident] | @tsv' -r | pv -l > existing_dblp_containers.tsv

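# transform the release dump into sandcrawler ingest requests
# (gzip-compressed JSON lines)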
cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz