diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 13:55:57 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 23:03:08 -0800 |
commit | 8c2394a5e0ae73c5d534bed30e339ab5004d11e1 (patch) | |
tree | 8e3f1564529abb5f3a3fd5890f1e222d787b581f /extra/dblp/dblp_html_extract.py | |
parent | 9451b3063c2d446748db74027c40c13ee69c24fb (diff) | |
download | fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.tar.gz fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.zip |
dblp: script and notes on container metadata generation
Diffstat (limited to 'extra/dblp/dblp_html_extract.py')
-rwxr-xr-x | extra/dblp/dblp_html_extract.py | 86 |
1 file changed, 86 insertions, 0 deletions
#!/usr/bin/env python3

"""
Run this script and pass a list of filenames (with or without .html) to stdin,
and this will output JSON objects to stdout. Eg:

    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json

Requires virtualenv with selectolax.
"""

import json
import sys

from selectolax.parser import HTMLParser


def parse_html(path: str) -> dict:
    """
    Parse DBLP container metadata out of a single saved DBLP HTML page.

    Extracted fields:

    - dblp_prefix (derived from the file path, eg "journals/blah")
    - title
    - issns (list)
    - wikidata_qid
    - acronym (best-effort, from a trailing parenthetical in the title)
    - homepage_url

    Returns an empty dict when the path doesn't yield a two-component DBLP
    key, or when the file is missing. Malformed ISSNs/QIDs are logged to
    stderr and skipped rather than aborting the batch.

    TODO: publisher?
    """
    # Strip only a trailing ".html" suffix. A bare str.replace('.html', '')
    # would also clobber ".html" appearing anywhere else in the path.
    key = path[:-len('.html')] if path.endswith('.html') else path
    # DBLP keys look like "journals/blah" (exactly two path components)
    if key.count('/') != 1:
        print(key, file=sys.stderr)
        return {}
    meta = dict(dblp_prefix=key, issns=[])

    try:
        with open(path, 'r') as html_file:
            doc = HTMLParser(html_file.read())
    except FileNotFoundError:
        # best-effort batch processing: missing file is skipped, not fatal
        return {}

    elem = doc.css_first('header#headline h1')
    if elem and elem.text():
        meta['title'] = elem.text()
        # Titles like "Journal of Stuff (JOS)" -> title + acronym
        if meta['title'].endswith(')') and meta['title'].count('(') == 1:
            meta['acronym'] = meta['title'].split('(')[-1][:-1]
            meta['title'] = meta['title'].split('(')[0].strip()

    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
    elems = doc.css('header#headline a[itemprop="sameAs"]') or []
    for elem in elems:
        url = elem.attributes.get('href')
        if not url:
            continue
        if "://portal.issn.org/" in url:
            issn = url.split('/')[-1].strip()
            # ISSNs are always 9 chars: "NNNN-NNNN"
            if len(issn) == 9:
                meta['issns'].append(issn)
            else:
                print(issn, file=sys.stderr)
        elif "://www.wikidata.org/entity/Q" in url:
            qid = url.split('/')[-1]
            # Validate instead of `assert`: asserts are stripped under -O,
            # and one bad page shouldn't crash the whole batch run.
            if qid.startswith('Q'):
                meta['wikidata_qid'] = qid
            else:
                print(qid, file=sys.stderr)

    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt=""
    # src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
    elem = doc.css_first('header#headline a[itemprop="url"]')
    if elem and elem.attributes.get('href'):
        meta['homepage_url'] = elem.attributes['href']

    return meta


def run() -> None:
    """Read file paths from stdin (one per line) and print JSON lines to stdout."""
    for path in sys.stdin:
        path = path.strip()
        if not path:
            continue
        # allow passing keys with or without the .html extension
        if not path.endswith(".html"):
            path += ".html"
        obj = parse_html(path)
        if obj:
            print(json.dumps(obj, sort_keys=True))


if __name__ == '__main__':
    run()