From 8c2394a5e0ae73c5d534bed30e339ab5004d11e1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 17 Dec 2020 13:55:57 -0800
Subject: dblp: script and notes on container metadata generation

---
 extra/dblp/README.md | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 extra/dblp/README.md

diff --git a/extra/dblp/README.md b/extra/dblp/README.md
new file mode 100644
index 00000000..d74f8bf9
--- /dev/null
+++ b/extra/dblp/README.md
@@ -0,0 +1,34 @@

This file describes hacks used to import dblp container metadata.


## Quick Bootstrap Commands

Starting with a complete dblp.xml (and dblp.dtd) dump, do a dry-run transform
and dump release entities in JSON; this takes some time:

    ./fatcat_import.py dblp-release /data/dblp/dblp.xml --dump-json-mode > /data/dblp/dblp_releases.json

Next, extract the unique set of dblp identifier prefixes, which will be used
as container identifiers:

    cat /data/dblp/dblp_releases.json | jq ._dblp_prefix | grep -v ^none | sort -u > /data/dblp/prefix_list.txt

Then fetch HTML documents from dblp.org for each prefix:

    mkdir -p journals
    mkdir -p conf
    mkdir -p series

    shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html

    # clean up any failed/empty files, then re-run the above parallel/wget
    # command; a looped version is sketched at the end of this file
    find . -empty -type f -delete

Using the Python script in this directory, extract metadata from these HTML
documents:

    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json

This can be imported into fatcat using the dblp-container importer:

    ./fatcat_import.py dblp-container --issn-map-file /data/issn/20200323.ISSN-to-ISSN-L.txt --dblp-container-map-file /data/dblp/existing_dblp_containers.tsv --dblp-container-map-output /data/dblp/all_dblp_containers.tsv dblp_container_meta.json
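
For orientation, the dblp prefixes extracted above are the key namespaces
dblp uses in its record keys and URLs; each one corresponds (roughly) to a
single journal, conference series, or book series, which is why they work as
container identifiers. Actual contents depend on the dump, but lines in
`prefix_list.txt` should look something like this (values here are
illustrative):

    head -n3 /data/dblp/prefix_list.txt
    # illustrative output:
    #   conf/aaai
    #   journals/cacm
    #   series/lncs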
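
The fetch step leaves an empty file behind when a download fails, which is
what the `find ... -delete` plus re-run cycle cleans up. A minimal sketch of
looping that cycle until everything has been fetched, assuming the same
working directory, layout, and prefix list as above (note it will spin
forever if some prefix always fails, so cap the iterations in practice):

    # hypothetical retry wrapper around the fetch step above;
    # wget -nc skips files that were already fetched successfully
    shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
    while [ -n "$(find . -empty -type f | head -n1)" ]; do
        find . -empty -type f -delete
        shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
    done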
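
Before running the final import, it is worth spot-checking the extracted
metadata. The field names are whatever `dblp_html_extract.py` actually emits,
so these checks are deliberately generic:

    # count records and eyeball a few of them
    wc -l dblp_container_meta.json
    head -n5 dblp_container_meta.json | jq .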