blob: 369eac1a411de200080741879e01b32244fea229 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
#!/usr/bin/env python3
"""
Reads a list of filenames (with or without the .html suffix) from stdin, one per
line, and writes a JSON object to stdout for each file that yields metadata. E.g.:
fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
Requires virtualenv with selectolax.
"""
import sys
import json
from selectolax.parser import HTMLParser
def parse_html(path: str) -> dict:
    """
    Extract container metadata from a single saved dblp HTML page.

    Parses from HTML:

    - key (stored as ``dblp_prefix``: the path without its ``.html`` suffix)
    - title
    - issns (list)
    - wikidata_qid
    - homepage_url
    - acronym (?)

    TODO: publisher?

    Args:
        path: Path of the HTML file to read. Once the trailing ``.html`` is
            removed it must consist of exactly two '/'-separated components.

    Returns:
        A dict that always contains ``dblp_prefix`` and ``issns`` plus
        whichever optional fields were found in the page. An empty dict is
        returned when the path does not have the expected two-component
        shape (the key is printed to stderr) or when the file does not exist.
    """
    suffix = '.html'
    # Strip only a trailing ".html"; str.replace() would also delete any
    # occurrence of ".html" in the middle of the path.
    key = path[:-len(suffix)] if path.endswith(suffix) else path
    if len(key.split('/')) != 2:
        print(key, file=sys.stderr)
        return {}

    meta = dict(dblp_prefix=key, issns=[])

    try:
        # Explicit encoding so decoding does not depend on the locale
        # (assumes the saved pages are UTF-8 -- TODO confirm).
        with open(path, 'r', encoding='utf-8') as html_file:
            doc = HTMLParser(html_file.read())
    except FileNotFoundError:
        return {}

    elem = doc.css_first('header#headline h1')
    title = elem.text() if elem else None
    if title:
        meta['title'] = title
        # A headline of the form "Some Name (ACRO)" is split into the title
        # and the acronym; only done when there is exactly one "(".
        if title.endswith(')') and title.count('(') == 1:
            name, _, rest = title.partition('(')
            meta['acronym'] = rest[:-1]
            meta['title'] = name.strip()

    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
    for elem in doc.css('header#headline a[itemprop="sameAs"]') or []:
        url = elem.attributes.get('href')
        if not url:
            continue
        if "://portal.issn.org/" in url:
            issn = url.split('/')[-1].strip()
            # Only values of length 9 (e.g. "2624-8212") are kept.
            if len(issn) == 9:
                meta['issns'].append(issn)
            else:
                print(issn, file=sys.stderr)
        elif "://www.wikidata.org/entity/Q" in url:
            qid = url.split('/')[-1]
            # Explicit check rather than `assert`: asserts are stripped under
            # `python -O`, and one odd link should not abort a whole batch.
            # Rejected values are reported on stderr like bad ISSNs.
            if 'Q' in qid:
                meta['wikidata_qid'] = qid
            else:
                print(qid, file=sys.stderr)

    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt="" src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
    elem = doc.css_first('header#headline a[itemprop="url"]')
    if elem and elem.attributes.get('href'):
        meta['homepage_url'] = elem.attributes['href']

    return meta
def run() -> None:
    """
    Read file names from stdin, one per line, and print one JSON object per
    successfully parsed file to stdout.

    Blank lines are ignored. A name that does not already end in ".html" gets
    that suffix appended before it is handed to parse_html(). Files for which
    parse_html() returns an empty result produce no output line.
    """
    for raw_line in sys.stdin:
        filename = raw_line.strip()
        if filename:
            html_path = filename if filename.endswith(".html") else filename + ".html"
            record = parse_html(html_path)
            if record:
                # Sorted keys keep the output stable from run to run.
                print(json.dumps(record, sort_keys=True))
# Script entry point: process stdin only when executed directly, not on import.
if __name__ == "__main__":
    run()
|