dblp: script and notes on container metadata generation

author: Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 13:55:57 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-12-17 23:03:08 -0800
commit: 8c2394a5e0ae73c5d534bed30e339ab5004d11e1 (patch)
tree: 8e3f1564529abb5f3a3fd5890f1e222d787b581f
parent: 9451b3063c2d446748db74027c40c13ee69c24fb (diff)
download: fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.tar.gz
fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.zip
4 files changed, 134 insertions, 0 deletions
diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore
new file mode 100644
index 00000000..8847a157
--- /dev/null
+++ b/extra/dblp/.gitignore
@@ -0,0 +1,3 @@
+conf/
+journals/
+series/
diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile
new file mode 100644
index 00000000..b9ba84f6
--- /dev/null
+++ b/extra/dblp/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.7"
diff --git a/extra/dblp/README.md b/extra/dblp/README.md
new file mode 100644
index 00000000..d74f8bf9
--- /dev/null
+++ b/extra/dblp/README.md
@@ -0,0 +1,34 @@
+
+This file describes hacks used to import dblp container metadata.
+
+
+## Quick Bootstrap Commands
+
+Starting with a complete dblp.xml (and dblp.dtd) dump, do a dry-run transform
+and dump release entities in JSON; this takes some time:
+
+    ./fatcat_import.py dblp-release /data/dblp/dblp.xml --dump-json-mode > /data/dblp/dblp_releases.json
+
+Next extract the unique set of dblp identifier prefixes, which will be used as
+container identifiers:
+
+    cat /data/dblp/dblp_releases.json | jq -r ._dblp_prefix | grep -v ^null | sort -u > /data/dblp/prefix_list.txt
+
+Then fetch HTML documents from dblp.org for each prefix:
+
+    mkdir -p journals
+    mkdir -p conf
+    mkdir -p series
+
+    shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+    # clean up any failed/empty files, then re-run the above parallel/wget command
+    find . -empty -type f -delete
+
+Using the python script in this directory, extract metadata from these HTML documents:
+
+    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+This can be imported into fatcat using the dblp-container importer:
+
+    ./fatcat_import.py dblp-container --issn-map-file /data/issn/20200323.ISSN-to-ISSN-L.txt --dblp-container-map-file /data/dblp/existing_dblp_containers.tsv --dblp-container-map-output /data/dblp/all_dblp_containers.tsv dblp_container_meta.json
diff --git a/extra/dblp/dblp_html_extract.py b/extra/dblp/dblp_html_extract.py
new file mode 100755
index 00000000..369eac1a
--- /dev/null
+++ b/extra/dblp/dblp_html_extract.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+"""
+Pass a list of filenames (one per line, with or without .html) on stdin, and
+this script will print one JSON object per parsed file to stdout. Eg:
+
+    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+Requires virtualenv with selectolax.
+"""
+
+import sys
+import json
+
+from selectolax.parser import HTMLParser
+
+
+def parse_html(path: str) -> dict:
+    """
+    Parses from HTML:
+
+    - key
+    - title
+    - issns (list)
+    - wikidata_qid
+    - homepage_url
+    - acronym (?)
+
+    TODO: publisher?
+    """
+    key = path.replace('.html', '')
+    if not len(key.split('/')) == 2:
+        print(key, file=sys.stderr)
+        return {}
+    meta = dict(dblp_prefix=key, issns=[])
+
+    try:
+        with open(path, 'r') as html_file:
+            doc = HTMLParser(html_file.read())
+    except FileNotFoundError:
+        return {}
+
+    elem = doc.css_first('header#headline h1')
+    if elem and elem.text():
+        meta['title'] = elem.text()
+        if meta['title'].endswith(')') and meta['title'].count('(') == 1:
+            meta['acronym'] = meta['title'].split('(')[-1][:-1]
+            meta['title'] = meta['title'].split('(')[0].strip()
+
+    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
+    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
+    elems = doc.css('header#headline a[itemprop="sameAs"]') or []
+    for elem in elems:
+        if not elem.attributes.get('href'):
+            continue
+        url = elem.attributes['href']
+        if "://portal.issn.org/" in url:
+            issn = url.split('/')[-1].strip()
+            if len(issn) == 9:
+                meta['issns'].append(issn)
+            else:
+                print(issn, file=sys.stderr)
+        elif "://www.wikidata.org/entity/Q" in url:
+            meta['wikidata_qid'] = url.split('/')[-1]
+            assert 'Q' in meta['wikidata_qid']
+
+    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt="" src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
+    elem = doc.css_first('header#headline a[itemprop="url"]')
+    if elem and elem.attributes.get('href'):
+        meta['homepage_url'] = elem.attributes['href']
+            
+    return meta
+
+def run() -> None:
+    for path in sys.stdin:
+        path = path.strip()
+        if not path:
+            continue
+        if not path.endswith(".html"):
+            path += ".html"
+        obj = parse_html(path)
+        if obj:
+            print(json.dumps(obj, sort_keys=True))
+
+if __name__=='__main__':
+    run()
author	Bryan Newbold <bnewbold@robocracy.org>	2020-12-17 13:55:57 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2020-12-17 23:03:08 -0800
commit	8c2394a5e0ae73c5d534bed30e339ab5004d11e1 (patch)
tree	8e3f1564529abb5f3a3fd5890f1e222d787b581f
parent	9451b3063c2d446748db74027c40c13ee69c24fb (diff)
download	fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.tar.gz fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.zip