From 038b3a318440798df8ff8498454dbd251c571ff6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 3 Jun 2021 10:57:04 -0700 Subject: update dblp pre-import notes and pipenv python version (3.8) --- extra/dblp/Pipfile | 2 +- extra/dblp/README.md | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'extra/dblp') diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile index a191e76f..dbf86ac0 100644 --- a/extra/dblp/Pipfile +++ b/extra/dblp/Pipfile @@ -9,4 +9,4 @@ selectolax = "*" [dev-packages] [requires] -python_version = "3.7" +python_version = "3.8" diff --git a/extra/dblp/README.md b/extra/dblp/README.md index f2fd02da..e6ccce4f 100644 --- a/extra/dblp/README.md +++ b/extra/dblp/README.md @@ -10,15 +10,20 @@ necessary. ## Quick Bootstrap Commands +Set up a working directory somewhere: + + export DBLP_DIR=/data/dblp + Starting with a complete dblp.xml (and dblp.dtd) dump, do a dry-run transform and dump release entities in JSON; this takes some time: - ./fatcat_import.py dblp-release /data/dblp/dblp.xml --dump-json-mode > /data/dblp/dblp_releases.json + export FATCAT_API_AUTH_TOKEN=[...] + ./fatcat_import.py dblp-release $DBLP_DIR/dblp.xml --dump-json-mode | pv -l > $DBLP_DIR/dblp_releases.json Next extract the unique set of dblp identifier prefixes, which will be used as container identifiers: - cat /data/dblp/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > /data/dblp/prefix_list.txt + cat $DBLP_DIR/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > $DBLP_DIR/prefix_list.txt Then fetch HTML documents from dblp.org for each prefix. Note that currently only single-level containers will download successfully, and only journals, @@ -29,15 +34,15 @@ the future. mkdir -p conf mkdir -p series - shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + shuf $DBLP_DIR/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html # clean up any failed/empty files, then re-run the above parallel/wget command find . -empty -type f -delete Using the python script in this directory, extract metadata from these HTML documents: - fd html conf/ journals/ series/ | ./dblp_html_extract.py | pv -l > dblp_container_meta.json + fd html conf/ journals/ series/ | /srv/fatcat/src/extra/dblp/dblp_html_extract.py | pv -l > dblp_container_meta.json This can be imported into fatcat using the dblp-container importer: - ./fatcat_import.py dblp-container --issn-map-file /data/issn/20200323.ISSN-to-ISSN-L.txt --dblp-container-map-file /data/dblp/existing_dblp_containers.tsv --dblp-container-map-output /data/dblp/all_dblp_containers.tsv dblp_container_meta.json + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file $DBLP_DIR/existing_dblp_containers.tsv --dblp-container-map-output $DBLP_DIR/all_dblp_containers.tsv $DBLP_DIR/dblp_container_meta.json -- cgit v1.2.3