diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 13:55:57 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-17 23:03:08 -0800 |
commit | 8c2394a5e0ae73c5d534bed30e339ab5004d11e1 (patch) | |
tree | 8e3f1564529abb5f3a3fd5890f1e222d787b581f /extra/dblp/dblp_html_extract.py | |
parent | 9451b3063c2d446748db74027c40c13ee69c24fb (diff) | |
download | fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.tar.gz fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.zip |
dblp: script and notes on container metadata generation
Diffstat (limited to 'extra/dblp/dblp_html_extract.py')
-rwxr-xr-x | extra/dblp/dblp_html_extract.py | 86 |
1 file changed, 86 insertions, 0 deletions
#!/usr/bin/env python3

"""
Run this script and pass a list of filenames (with or without .html) to stdin,
and this will output JSON objects to stdout. Eg:

    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json

Requires virtualenv with selectolax.
"""

import json
import sys

from selectolax.parser import HTMLParser


def parse_html(path: str) -> dict:
    """
    Parse DBLP container metadata out of a single saved DBLP HTML page.

    Extracted fields:

    - dblp_prefix (derived from the file path, eg "journals/blah")
    - title
    - issns (list)
    - wikidata_qid
    - acronym (best-effort, from a trailing parenthetical in the title)
    - homepage_url

    Returns an empty dict when the path doesn't yield a two-component DBLP
    key, or when the file is missing. Malformed ISSNs/QIDs are logged to
    stderr and skipped rather than aborting the batch.

    TODO: publisher?
    """
    # Strip only a trailing ".html" suffix. A bare str.replace('.html', '')
    # would also clobber ".html" appearing anywhere else in the path.
    key = path[:-len('.html')] if path.endswith('.html') else path
    # DBLP keys look like "journals/blah" (exactly two path components)
    if key.count('/') != 1:
        print(key, file=sys.stderr)
        return {}
    meta = dict(dblp_prefix=key, issns=[])

    try:
        with open(path, 'r') as html_file:
            doc = HTMLParser(html_file.read())
    except FileNotFoundError:
        # best-effort batch processing: missing file is skipped, not fatal
        return {}

    elem = doc.css_first('header#headline h1')
    if elem and elem.text():
        meta['title'] = elem.text()
        # Titles like "Journal of Stuff (JOS)" -> title + acronym
        if meta['title'].endswith(')') and meta['title'].count('(') == 1:
            meta['acronym'] = meta['title'].split('(')[-1][:-1]
            meta['title'] = meta['title'].split('(')[0].strip()

    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
    elems = doc.css('header#headline a[itemprop="sameAs"]') or []
    for elem in elems:
        url = elem.attributes.get('href')
        if not url:
            continue
        if "://portal.issn.org/" in url:
            issn = url.split('/')[-1].strip()
            # ISSNs are always 9 chars: "NNNN-NNNN"
            if len(issn) == 9:
                meta['issns'].append(issn)
            else:
                print(issn, file=sys.stderr)
        elif "://www.wikidata.org/entity/Q" in url:
            qid = url.split('/')[-1]
            # Validate instead of `assert`: asserts are stripped under -O,
            # and one bad page shouldn't crash the whole batch run.
            if qid.startswith('Q'):
                meta['wikidata_qid'] = qid
            else:
                print(qid, file=sys.stderr)

    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt=""
    # src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
    elem = doc.css_first('header#headline a[itemprop="url"]')
    if elem and elem.attributes.get('href'):
        meta['homepage_url'] = elem.attributes['href']

    return meta


def run() -> None:
    """Read file paths from stdin (one per line) and print JSON lines to stdout."""
    for path in sys.stdin:
        path = path.strip()
        if not path:
            continue
        # allow passing keys with or without the .html extension
        if not path.endswith(".html"):
            path += ".html"
        obj = parse_html(path)
        if obj:
            print(json.dumps(obj, sort_keys=True))


if __name__ == '__main__':
    run()