From 8c2394a5e0ae73c5d534bed30e339ab5004d11e1 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 17 Dec 2020 13:55:57 -0800
Subject: dblp: script and notes on container metadata generation

---
 extra/dblp/.gitignore           |  3 ++
 extra/dblp/Pipfile              | 11 ++++++
 extra/dblp/README.md            | 34 ++++++++++++++++
 extra/dblp/dblp_html_extract.py | 86 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 134 insertions(+)
 create mode 100644 extra/dblp/.gitignore
 create mode 100644 extra/dblp/Pipfile
 create mode 100644 extra/dblp/README.md
 create mode 100755 extra/dblp/dblp_html_extract.py

(limited to 'extra/dblp')

diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore
new file mode 100644
index 00000000..8847a157
--- /dev/null
+++ b/extra/dblp/.gitignore
@@ -0,0 +1,3 @@
+conf/
+journals/
+series/
diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile
new file mode 100644
index 00000000..b9ba84f6
--- /dev/null
+++ b/extra/dblp/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.7"
diff --git a/extra/dblp/README.md b/extra/dblp/README.md
new file mode 100644
index 00000000..d74f8bf9
--- /dev/null
+++ b/extra/dblp/README.md
@@ -0,0 +1,34 @@
+
+This file describes hacks used to import dblp container metadata.
+
+
+## Quick Bootstrap Commands
+
+Starting with a complete dblp.xml (and dblp.dtd) dump, do a dry-run transform
+and dump release entities in JSON; this takes some time:
+
+    ./fatcat_import.py dblp-release /data/dblp/dblp.xml --dump-json-mode > /data/dblp/dblp_releases.json
+
+Next extract the unique set of dblp identifier prefixes, which will be used as
+container identifiers:
+
+    cat /data/dblp/dblp_releases.json | jq ._dblp_prefix | grep -v ^none | sort -u > /data/dblp/prefix_list.txt
+
+Then fetch HTML documents from dblp.org for each prefix:
+
+    mkdir -p journals
+    mkdir -p conf
+    mkdir -p series
+
+    shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+    # clean up any failed/empty files, then re-run the above parallel/wget command
+    find . -empty -type f -delete
+
+Using the python script in this directory, extract metadata from these HTML documents:
+
+    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+This can be imported into fatcat using the dblp-container importer:
+
+    ./fatcat_import.py dblp-container --issn-map-file /data/issn/20200323.ISSN-to-ISSN-L.txt --dblp-container-map-file /data/dblp/existing_dblp_containers.tsv --dblp-container-map-output /data/dblp/all_dblp_containers.tsv dblp_container_meta.json
diff --git a/extra/dblp/dblp_html_extract.py b/extra/dblp/dblp_html_extract.py
new file mode 100755
index 00000000..369eac1a
--- /dev/null
+++ b/extra/dblp/dblp_html_extract.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+"""
+Run this script and pass a list of filenames (with or without .html) to stdin,
+and this will output JSON objects to stdout. Eg:
+
+    fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+Requires virtualenv with selectolax.
+"""
+
+import sys
+import json
+
+from selectolax.parser import HTMLParser
+
+
+def parse_html(path: str) -> dict:
+    """
+    Parses from HTML:
+
+    - key
+    - title
+    - issns (list)
+    - wikidata_qid
+    - homepage_url
+    - acronym (?)
+
+    TODO: publisher?
+    """
+    key = path.replace('.html', '')
+    if not len(key.split('/')) == 2:
+        print(key, file=sys.stderr)
+        return {}
+    meta = dict(dblp_prefix=key, issns=[])
+
+    try:
+        with open(path, 'r') as html_file:
+            doc = HTMLParser(html_file.read())
+    except FileNotFoundError:
+        return {}
+
+    elem = doc.css_first('header#headline h1')
+    if elem and elem.text():
+        meta['title'] = elem.text()
+        if meta['title'].endswith(')') and meta['title'].count('(') == 1:
+            meta['acronym'] = meta['title'].split('(')[-1][:-1]
+            meta['title'] = meta['title'].split('(')[0].strip()
+
+    # ISSN and wikidata identifiers are linked from the page header with
+    # itemprop="sameAs" (portal.issn.org and www.wikidata.org URLs)
+    elems = doc.css('header#headline a[itemprop="sameAs"]') or []
+    for elem in elems:
+        if not elem.attributes.get('href'):
+            continue
+        url = elem.attributes['href']
+        if "://portal.issn.org/" in url:
+            issn = url.split('/')[-1].strip()
+            if len(issn) == 9:
+                meta['issns'].append(issn)
+            else:
+                print(issn, file=sys.stderr)
+        elif "://www.wikidata.org/entity/Q" in url:
+            meta['wikidata_qid'] = url.split('/')[-1]
+            assert 'Q' in meta['wikidata_qid']
+
+    # the container homepage is linked with itemprop="url"
+    elem = doc.css_first('header#headline a[itemprop="url"]')
+    if elem and elem.attributes.get('href'):
+        meta['homepage_url'] = elem.attributes['href']
+
+    return meta
+
+def run() -> None:
+    for path in sys.stdin:
+        path = path.strip()
+        if not path:
+            continue
+        if not path.endswith(".html"):
+            path += ".html"
+        obj = parse_html(path)
+        if obj:
+            print(json.dumps(obj, sort_keys=True))
+
+if __name__=='__main__':
+    run()
--
cgit v1.2.3
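
A note on the extraction logic in dblp_html_extract.py above: the page structure it relies on (a `header#headline` element whose links carry itemprop="sameAs" and itemprop="url" attributes) can be exercised offline. The following is a minimal sketch, assuming selectolax is installed; the HTML fragment and its identifiers are a synthetic stand-in shaped around those selectors, not a verbatim dblp page.

    #!/usr/bin/env python3
    """Sanity-check the CSS selectors used by dblp_html_extract.py against
    a synthetic header fragment (not a real dblp page)."""

    from selectolax.parser import HTMLParser

    SAMPLE_HTML = """
    <html><body>
    <header id="headline">
      <h1>Journal of Example Research (JER)</h1>
      <a itemprop="sameAs" href="https://portal.issn.org/resource/ISSN/1234-5678">ISSN portal</a>
      <a itemprop="sameAs" href="https://www.wikidata.org/entity/Q1234567">wikidata</a>
      <a itemprop="url" href="https://example.org/journal">home page</a>
    </header>
    </body></html>
    """

    doc = HTMLParser(SAMPLE_HTML)

    # title and acronym, using the same split logic as parse_html()
    title = doc.css_first('header#headline h1').text()
    acronym = None
    if title.endswith(')') and title.count('(') == 1:
        acronym = title.split('(')[-1][:-1]
        title = title.split('(')[0].strip()

    # sameAs links carry ISSN portal and wikidata URLs
    issns = []
    wikidata_qid = None
    for a in doc.css('header#headline a[itemprop="sameAs"]'):
        href = a.attributes.get('href') or ''
        if "://portal.issn.org/" in href:
            issns.append(href.split('/')[-1])
        elif "://www.wikidata.org/entity/Q" in href:
            wikidata_qid = href.split('/')[-1]

    # homepage link
    homepage = doc.css_first('header#headline a[itemprop="url"]').attributes.get('href')

    print(title, acronym, issns, wikidata_qid, homepage)
    # expected: Journal of Example Research JER ['1234-5678'] Q1234567 https://example.org/journal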
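
Similarly, before running the fatcat-import step from the README, it can be worth spot-checking the generated dblp_container_meta.json. The sketch below is one way to do that; it assumes the line-delimited JSON fields written by dblp_html_extract.py (dblp_prefix, issns, wikidata_qid), and the ISSN check-digit helper is a generic mod-11 validation, not something the importer itself requires.

    #!/usr/bin/env python3
    """Spot-check dblp_container_meta.json: count records and flag ISSNs
    with a bad check digit before importing into fatcat."""

    import json
    import sys

    def issn_checksum_ok(issn: str) -> bool:
        # ISSN check digit: weights 8..2 over the first seven digits, mod 11;
        # a check value of 10 is written as 'X'
        digits = issn.replace('-', '')
        if len(digits) != 8 or not digits[:7].isdigit():
            return False
        total = sum(int(d) * w for d, w in zip(digits[:7], range(8, 1, -1)))
        check = (11 - total % 11) % 11
        expected = 'X' if check == 10 else str(check)
        return digits[7].upper() == expected

    counts = dict(total=0, with_issn=0, with_qid=0, bad_issn=0)
    with open('dblp_container_meta.json', 'r') as f:
        for line in f:
            meta = json.loads(line)
            counts['total'] += 1
            if meta.get('issns'):
                counts['with_issn'] += 1
            if meta.get('wikidata_qid'):
                counts['with_qid'] += 1
            for issn in meta.get('issns', []):
                if not issn_checksum_ok(issn):
                    counts['bad_issn'] += 1
                    print("bad ISSN checksum: %s (%s)" % (issn, meta.get('dblp_prefix')), file=sys.stderr)

    print(json.dumps(counts))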