#!/usr/bin/env python3 """ Run this script and pass a list of filenames (with or without .html) to stdin, and this will output JSON objects to stdout. Eg: fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json Requires virtualenv with selectolax. """ import sys import json from selectolax.parser import HTMLParser def parse_html(path: str) -> dict: """ Parses from HTML: - key - title - issns (list) - wikidata_qid - homepage_url - acronym (?) TODO: publisher? """ key = path.replace('.html', '') if not len(key.split('/')) == 2: print(key, file=sys.stderr) return {} meta = dict(dblp_prefix=key, issns=[]) try: with open(path, 'r') as html_file: doc = HTMLParser(html_file.read()) except FileNotFoundError: return {} elem = doc.css_first('header#headline h1') if elem and elem.text(): meta['title'] = elem.text() if meta['title'].endswith(')') and meta['title'].count('(') == 1: meta['acronym'] = meta['title'].split('(')[-1][:-1] meta['title'] = meta['title'].split('(')[0].strip() # # elems = doc.css('header#headline a[itemprop="sameAs"]') or [] for elem in elems: if not elem.attributes.get('href'): continue url = elem.attributes['href'] if "://portal.issn.org/" in url: issn = url.split('/')[-1].strip() if len(issn) == 9: meta['issns'].append(issn) else: print(issn, file=sys.stderr) elif "://www.wikidata.org/entity/Q" in url: meta['wikidata_qid'] = url.split('/')[-1] assert 'Q' in meta['wikidata_qid'] # elem = doc.css_first('header#headline a[itemprop="url"]') if elem and elem.attributes.get('href'): meta['homepage_url'] = elem.attributes['href'] return meta def run() -> None: for path in sys.stdin: path = path.strip() if not path: continue if not path.endswith(".html"): path += ".html" obj = parse_html(path) if obj: print(json.dumps(obj, sort_keys=True)) if __name__=='__main__': run()