From 3031aa414932b39f38a6456df2a6f55f0e72dfbe Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 17 Dec 2020 22:41:14 -0800 Subject: dblp: polish HTML scrape/extract pipeline --- extra/dblp/.gitignore | 3 +++ extra/dblp/Pipfile | 1 + extra/dblp/README.md | 15 ++++++++++++--- python/fatcat_tools/importers/dblp_release.py | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore index 8847a157..a04dd76e 100644 --- a/extra/dblp/.gitignore +++ b/extra/dblp/.gitignore @@ -1,3 +1,6 @@ conf/ journals/ series/ +Pipfile.lock +*.json +*.html diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile index b9ba84f6..a191e76f 100644 --- a/extra/dblp/Pipfile +++ b/extra/dblp/Pipfile @@ -4,6 +4,7 @@ verify_ssl = true name = "pypi" [packages] +selectolax = "*" [dev-packages] diff --git a/extra/dblp/README.md b/extra/dblp/README.md index d74f8bf9..f2fd02da 100644 --- a/extra/dblp/README.md +++ b/extra/dblp/README.md @@ -1,6 +1,12 @@ This file describes hacks used to import dblp container metadata. +As of December 2020 this is part of the dblp release metadata import pipeline: +we must have conference and other non-ISSN containers created before running +the release import. dblp does not publish container-level metadata in a +structured format (eg, in their dumps), so scraping the HTML is unfortunately +necessary. + ## Quick Bootstrap Commands @@ -12,9 +18,12 @@ and dump release entities in JSON; this takes some time: Next extract the unique set of dblp identifier prefixes, which will be used as container identifiers: - cat /data/dblp/dblp_releases.json | jq ._dblp_prefix | grep -v ^none | sort -u > /data/dblp/prefix_list.txt + cat /data/dblp/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > /data/dblp/prefix_list.txt -Then fetch HTML documents from dblp.org for each prefix: +Then fetch HTML documents from dblp.org for each prefix. 
Note that currently +only single-level containers will download successfully, and only journals, +conf, and series sections. Books, Tech Reports, etc. may be nice to include in +the future. mkdir -p journals mkdir -p conf @@ -27,7 +36,7 @@ Then fetch HTML documents from dblp.org for each prefix: Using the python script in this directory, extract metadata from these HTML documents: - fd .html | ./dblp_html_extract.py | pv -l > dblp_container_meta.json + fd html conf/ journals/ series/ | ./dblp_html_extract.py | pv -l > dblp_container_meta.json This can be imported into fatcat using the dblp-container importer: diff --git a/python/fatcat_tools/importers/dblp_release.py b/python/fatcat_tools/importers/dblp_release.py index 9cebcdc2..5cbc95d0 100644 --- a/python/fatcat_tools/importers/dblp_release.py +++ b/python/fatcat_tools/importers/dblp_release.py @@ -5,6 +5,20 @@ Importer for DBLP release-level (article/paper/etc) XML metadata. Works similarly to PubMed XML importer: expects to have a large XML file iterated over quickly, with individual elements re-parsed into smaller objects and passed to `parse_record()`. + +There are two valuable pieces of relationship metadata in dblp: + +- container linkages, especially to conferences which do not have ISSNs, and + thus no existing fatcat containers +- author disambiguation, which is a work in progress but anecdotally higher + quality than MAG, Semantic Scholar, etc. + +We are not going to do author (creator) ingest at this time. For containers, +import would be made much easier if we updated the database schema to include +dblp_prefix as a lookup key, but this is more difficult for containers than +for releases, so we are going to skip it for now. This leaves us with a +brittle/unreliable TSV lookup mechanism for prefix-to-container_id (as of +December 2020). """ import sys # noqa: F401 -- cgit v1.2.3