summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-12-17 13:55:57 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-12-17 23:03:08 -0800
commit8c2394a5e0ae73c5d534bed30e339ab5004d11e1 (patch)
tree8e3f1564529abb5f3a3fd5890f1e222d787b581f
parent9451b3063c2d446748db74027c40c13ee69c24fb (diff)
downloadfatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.tar.gz
fatcat-8c2394a5e0ae73c5d534bed30e339ab5004d11e1.zip
dblp: script and notes on container metadata generation
-rw-r--r--extra/dblp/.gitignore3
-rw-r--r--extra/dblp/Pipfile11
-rw-r--r--extra/dblp/README.md34
-rwxr-xr-xextra/dblp/dblp_html_extract.py86
4 files changed, 134 insertions, 0 deletions
diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore
new file mode 100644
index 00000000..8847a157
--- /dev/null
+++ b/extra/dblp/.gitignore
@@ -0,0 +1,3 @@
+conf/
+journals/
+series/
diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile
new file mode 100644
index 00000000..b9ba84f6
--- /dev/null
+++ b/extra/dblp/Pipfile
@@ -0,0 +1,11 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+
+[dev-packages]
+
+[requires]
+python_version = "3.7"
diff --git a/extra/dblp/README.md b/extra/dblp/README.md
new file mode 100644
index 00000000..d74f8bf9
--- /dev/null
+++ b/extra/dblp/README.md
@@ -0,0 +1,34 @@
+
+This file describes hacks used to import dblp container metadata.
+
+
+## Quick Bootstrap Commands
+
+Starting with a complete dblp.xml (and dblp.dtd) dump, do a dry-run transform
+and dump release entities in JSON; this takes some time:
+
+ ./fatcat_import.py dblp-release /data/dblp/dblp.xml --dump-json-mode > /data/dblp/dblp_releases.json
+
+Next extract the unique set of dblp identifier prefixes, which will be used as
+container identifiers:
+
+ cat /data/dblp/dblp_releases.json | jq -r ._dblp_prefix | grep -v ^null | sort -u > /data/dblp/prefix_list.txt
+
+Then fetch HTML documents from dblp.org for each prefix:
+
+ mkdir -p journals
+ mkdir -p conf
+ mkdir -p series
+
+ shuf /data/dblp/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+ # clean up any failed/empty files, then re-run the above parallel/wget command
+ find . -empty -type f -delete
+
+Using the python script in this directory, extract metadata from these HTML documents:
+
+ fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+This can be imported into fatcat using the dblp-container importer:
+
+ ./fatcat_import.py dblp-container --issn-map-file /data/issn/20200323.ISSN-to-ISSN-L.txt --dblp-container-map-file /data/dblp/existing_dblp_containers.tsv --dblp-container-map-output /data/dblp/all_dblp_containers.tsv dblp_container_meta.json
diff --git a/extra/dblp/dblp_html_extract.py b/extra/dblp/dblp_html_extract.py
new file mode 100755
index 00000000..369eac1a
--- /dev/null
+++ b/extra/dblp/dblp_html_extract.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+"""
+Run this script and pass a list of filenames (with or without .html) to stdin,
+and this will output JSON objects to stdout. Eg:
+
+ fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+Requires virtualenv with selectolax.
+"""
+
+import sys
+import json
+
+from selectolax.parser import HTMLParser
+
+
+def parse_html(path: str) -> dict:
+    """
+    Extracts container metadata from one saved dblp.org index page.
+
+    `path` is the location of an HTML file on disk, expected to look like
+    "<type>/<name>.html" (eg, "journals/acta.html"). The path with any leading
+    "./" and the trailing ".html" removed is used as the dblp prefix.
+
+    Returns a dict with these keys:
+
+    - dblp_prefix (always present)
+    - issns (list; always present, possibly empty)
+    - title
+    - acronym (only when the title ends with a single parenthesized part)
+    - wikidata_qid
+    - homepage_url
+
+    Returns an empty dict if the prefix does not have exactly two
+    slash-separated parts (the prefix is printed to stderr), or if the file
+    does not exist. Malformed ISSN or wikidata identifiers are printed to
+    stderr and skipped.
+
+    TODO: publisher?
+    """
+    key = path
+    # some tools emit relative paths with a "./" prefix; without this the
+    # two-component check below would reject every such path
+    if key.startswith('./'):
+        key = key[2:]
+    # only drop the trailing extension; str.replace() would also remove any
+    # ".html" appearing in the middle of the path
+    if key.endswith('.html'):
+        key = key[:-len('.html')]
+    if len(key.split('/')) != 2:
+        print(key, file=sys.stderr)
+        return {}
+    meta = dict(dblp_prefix=key, issns=[])
+
+    try:
+        # explicit encoding instead of the locale default (the saved pages
+        # are assumed to be UTF-8)
+        with open(path, 'r', encoding='utf-8') as html_file:
+            doc = HTMLParser(html_file.read())
+    except FileNotFoundError:
+        return {}
+
+    elem = doc.css_first('header#headline h1')
+    # strip surrounding whitespace so the trailing-parenthesis check below is
+    # not defeated by a stray newline or space
+    title = (elem.text() or '').strip() if elem else ''
+    if title:
+        meta['title'] = title
+        # "Some Long Name (SLN)" => title "Some Long Name", acronym "SLN"
+        if title.endswith(')') and title.count('(') == 1:
+            meta['acronym'] = title.split('(')[-1][:-1].strip()
+            meta['title'] = title.split('(')[0].strip()
+
+    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
+    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
+    for elem in doc.css('header#headline a[itemprop="sameAs"]') or []:
+        url = elem.attributes.get('href')
+        if not url:
+            continue
+        if "://portal.issn.org/" in url:
+            issn = url.split('/')[-1].strip()
+            # "NNNN-NNNN" is 9 characters; anything else is logged and skipped
+            if len(issn) == 9:
+                meta['issns'].append(issn)
+            else:
+                print(issn, file=sys.stderr)
+        elif "://www.wikidata.org/entity/Q" in url:
+            qid = url.split('/')[-1].strip()
+            # validate explicitly rather than with assert: asserts vanish
+            # under -O, and one odd link should not abort the whole batch
+            if qid.startswith('Q'):
+                meta['wikidata_qid'] = qid
+            else:
+                print(qid, file=sys.stderr)
+
+    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url"><img alt="" src="https://dblp.org/img/home.dark.16x16.png" class="icon" />web page @ sagepub.com</a>
+    elem = doc.css_first('header#headline a[itemprop="url"]')
+    if elem and elem.attributes.get('href'):
+        meta['homepage_url'] = elem.attributes['href']
+
+    return meta
+
+def run() -> None:
+    """
+    Reads one file path per line from stdin, parses each with parse_html(),
+    and prints every non-empty result to stdout as a single-line JSON object
+    with sorted keys.
+
+    Blank lines are ignored, and ".html" is appended to any path that does
+    not already end with it.
+    """
+    for line in sys.stdin:
+        name = line.strip()
+        if name:
+            filename = name if name.endswith(".html") else name + ".html"
+            record = parse_html(filename)
+            if record:
+                print(json.dumps(record, sort_keys=True))
+
+# only process stdin when executed as a script, not when imported
+if __name__ == '__main__':
+    run()