author    Bryan Newbold <bnewbold@robocracy.org>  2021-06-03 10:57:04 -0700
committer Bryan Newbold <bnewbold@robocracy.org>  2021-06-03 10:57:04 -0700
commit    038b3a318440798df8ff8498454dbd251c571ff6 (patch)
tree      92ab994602d6c83f5bb193cb1cc1e199801ec088 /extra
parent    02f19fb116f3948f9741a981ff1aea21ef9cbe67 (diff)
update dblp pre-import notes and pipenv python version (3.8)
Diffstat (limited to 'extra')
-rw-r--r--  extra/dblp/Pipfile   |  2 +-
-rw-r--r--  extra/dblp/README.md | 15 ++++++++++-----
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile
index a191e76f..dbf86ac0 100644
--- a/extra/dblp/Pipfile
+++ b/extra/dblp/Pipfile
@@ -9,4 +9,4 @@ selectolax = "*"
[dev-packages]
[requires]
-python_version = "3.7"
+python_version = "3.8"
diff --git a/extra/dblp/README.md b/extra/dblp/README.md
index f2fd02da..e6ccce4f 100644
--- a/extra/dblp/README.md
+++ b/extra/dblp/README.md
@@ -10,15 +10,20 @@ necessary.
## Quick Bootstrap Commands
+Set up a working directory somewhere:
+
+ export DBLP_DIR=/data/dblp
+
Starting with a complete dblp.xml (and dblp.dtd) dump, do a dry-run transform
and dump release entities in JSON; this takes some time:
- ./fatcat_import.py dblp-release /data/dblp/dblp.xml --dump-json-mode > /data/dblp/dblp_releases.json
+ export FATCAT_API_AUTH_TOKEN=[...]
+ ./fatcat_import.py dblp-release $DBLP_DIR/dblp.xml --dump-json-mode | pv -l > $DBLP_DIR/dblp_releases.json
Next extract the unique set of dblp identifier prefixes, which will be used as
container identifiers:
- cat /data/dblp/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > /data/dblp/prefix_list.txt
+ cat $DBLP_DIR/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > $DBLP_DIR/prefix_list.txt
Then fetch HTML documents from dblp.org for each prefix. Note that currently
only single-level containers will download successfully, and only journals,
@@ -29,15 +34,15 @@ the future.
mkdir -p conf
mkdir -p series
- shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+ shuf $DBLP_DIR/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
# clean up any failed/empty files, then re-run the above parallel/wget command
find . -empty -type f -delete
Using the python script in this directory, extract metadata from these HTML documents:
- fd html conf/ journals/ series/ | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+ fd html conf/ journals/ series/ | /srv/fatcat/src/extra/dblp/dblp_html_extract.py | pv -l > dblp_container_meta.json
This can be imported into fatcat using the dblp-container importer:
- ./fatcat_import.py dblp-container --issn-map-file /data/issn/20200323.ISSN-to-ISSN-L.txt --dblp-container-map-file /data/dblp/existing_dblp_containers.tsv --dblp-container-map-output /data/dblp/all_dblp_containers.tsv dblp_container_meta.json
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file $DBLP_DIR/existing_dblp_containers.tsv --dblp-container-map-output $DBLP_DIR/all_dblp_containers.tsv $DBLP_DIR/dblp_container_meta.json
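
As a note on the prefix-extraction step in the README above (`jq ._dblp_prefix -r | grep -v ^null | sort -u`), the same transform can be sketched in pure Python. The `_dblp_prefix` key and the newline-delimited JSON input format come from the README; the function name and CLI wrapper here are illustrative assumptions, not part of the fatcat codebase.

```python
import json
import sys


def unique_prefixes(lines):
    """Collect the sorted, de-duplicated set of non-null _dblp_prefix
    values from newline-delimited JSON release records. Mirrors the
    jq/grep/sort -u pipeline in the README."""
    prefixes = set()
    for line in lines:
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)
        prefix = record.get("_dblp_prefix")
        if prefix:  # skips null/missing, like `grep -v ^null`
            prefixes.add(prefix)
    return sorted(prefixes)


if __name__ == "__main__":
    # Usage: python extract_prefixes.py < dblp_releases.json > prefix_list.txt
    for p in unique_prefixes(sys.stdin):
        print(p)
```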