extra/dblp/dblp_html_extract.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

#!/usr/bin/env python3

"""
Run this script and pass a list of filenames (with or without .html) to stdin,
and it will output one JSON object per successfully parsed file to stdout. E.g.:

    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json

Requires virtualenv with selectolax.
"""

import sys
import json

from selectolax.parser import HTMLParser


def parse_html(path: str) -> dict:
    """
    Extract container metadata from a saved dblp HTML page.

    Parses from HTML:

    - key (stored as ``dblp_prefix``; the path minus a trailing ``.html``)
    - title
    - issns (list)
    - wikidata_qid
    - homepage_url
    - acronym (only when the title ends with its single parenthesized part)

    Returns an empty dict when the key does not consist of exactly two
    slash-separated parts (the key is then printed to stderr), or when the
    file at ``path`` does not exist.

    TODO: publisher?
    """
    suffix = '.html'
    # Strip only a trailing ".html"; str.replace() would also remove the
    # substring from the middle of the path.
    key = path[:-len(suffix)] if path.endswith(suffix) else path
    if len(key.split('/')) != 2:
        print(key, file=sys.stderr)
        return {}
    meta = dict(dblp_prefix=key, issns=[])

    try:
        # Explicit encoding so decoding does not depend on the locale.
        with open(path, 'r', encoding='utf-8') as html_file:
            doc = HTMLParser(html_file.read())
    except FileNotFoundError:
        return {}

    elem = doc.css_first('header#headline h1')
    title = elem.text() if elem else None
    if title:
        meta['title'] = title
        # "Some Name (ACRO)" -> title "Some Name", acronym "ACRO"
        if title.endswith(')') and title.count('(') == 1:
            name, _, rest = title.partition('(')
            meta['acronym'] = rest[:-1]
            meta['title'] = name.strip()

    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
    elems = doc.css('header#headline a[itemprop="sameAs"]') or []
    for elem in elems:
        url = elem.attributes.get('href')
        if not url:
            continue
        if "://portal.issn.org/" in url:
            issn = url.split('/')[-1].strip()
            # ISSNs are written as NNNN-NNNN, i.e. 9 characters
            if len(issn) == 9:
                meta['issns'].append(issn)
            else:
                print(issn, file=sys.stderr)
        elif "://www.wikidata.org/entity/Q" in url:
            qid = url.split('/')[-1]
            # Report and skip an unexpected value instead of asserting: an
            # assert would abort the whole batch, and is stripped under -O.
            if 'Q' in qid:
                meta['wikidata_qid'] = qid
            else:
                print(qid, file=sys.stderr)

    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt="" src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
    elem = doc.css_first('header#headline a[itemprop="url"]')
    if elem and elem.attributes.get('href'):
        meta['homepage_url'] = elem.attributes['href']

    return meta

def run() -> None:
    """
    Read file paths from stdin, one per line, and print the metadata parsed
    from each file to stdout as a JSON object with sorted keys.

    Blank lines are ignored, a missing ".html" suffix is appended, and files
    for which parse_html() returns nothing are skipped.
    """
    for raw_line in sys.stdin:
        filename = raw_line.strip()
        if filename == "":
            continue
        html_path = filename if filename.endswith(".html") else filename + ".html"
        record = parse_html(html_path)
        if not record:
            continue
        print(json.dumps(record, sort_keys=True))

# Only process stdin when this file is executed as a script, not on import.
if __name__=='__main__':
    run()