From 519733b77832ccbf97491a794e7f10884e39acdb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 19 Jul 2022 12:29:37 -0700 Subject: dblp: updated ingest pipeline --- extra/dblp/.gitignore | 6 +++ extra/dblp/Pipfile | 1 + extra/dblp/README.md | 51 ++++++++++++++++++--- extra/dblp/cleanup.sh | 17 +++++++ extra/dblp/dblp2ingestrequest.py | 97 ++++++++++++++++++++++++++++++++++++++++ extra/dblp/prep_metadata.sh | 48 ++++++++++++++++++++ 6 files changed, 213 insertions(+), 7 deletions(-) create mode 100755 extra/dblp/cleanup.sh create mode 100755 extra/dblp/dblp2ingestrequest.py create mode 100755 extra/dblp/prep_metadata.sh (limited to 'extra') diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore index a04dd76e..60774a12 100644 --- a/extra/dblp/.gitignore +++ b/extra/dblp/.gitignore @@ -4,3 +4,9 @@ series/ Pipfile.lock *.json *.html +*.txt +*.dtd +*.xml +*.xml.gz +*.tsv +*.json.gz diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile index dbf86ac0..69705a3a 100644 --- a/extra/dblp/Pipfile +++ b/extra/dblp/Pipfile @@ -5,6 +5,7 @@ name = "pypi" [packages] selectolax = "*" +urlcanon = "*" [dev-packages] diff --git a/extra/dblp/README.md b/extra/dblp/README.md index e6ccce4f..a95f7214 100644 --- a/extra/dblp/README.md +++ b/extra/dblp/README.md @@ -1,14 +1,51 @@ -This file describes hacks used to import dblp container metadata. +This file describes hacks used to import dblp container and release metadata. -As of December 2020 this is part of the dblp release metadata import pipeline: -we must have conference and other non-ISSN containers created before running -the release import. dblp does not publish container-level metadata in a -structured format (eg, in their dumps), so scraping the HTML is unfortunately -necessary. +The container metadata must be processed and imported first, to create +containers for non-ISSN venues. 
However, dblp only publishes structured +metadata for articles (releases), not venues (containers), so we need to +process the articles, then import the containers, then import the articles. +Venue metadata is instead scraped out of the dblp.org HTML pages. -## Quick Bootstrap Commands + +## New Process (2022) + +Usually all of this gets run on a production fatcat instance. It may be +possible to run parts elsewhere, but not confirmed, and would require copying +some set of files around. + + # remove any old/stale files + ./cleanup.sh + + ./prep_metadata.sh + +This will take a while to run, after which the container metadata can be +imported, like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + +Check that counts look sane: + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + +Then do release import like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...]
+    ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml
+
+Lastly, to generate sandcrawler ingest requests, from the JSON-dumped partial
+release objects:
+
+    cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+
+
+## [OLD] Manual Commands
 
 Set up a working directory somewhere:
 
diff --git a/extra/dblp/cleanup.sh b/extra/dblp/cleanup.sh
new file mode 100755
index 00000000..52e1a2ea
--- /dev/null
+++ b/extra/dblp/cleanup.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+
+rm -f dblp.dtd
+rm -f dblp.xml.gz
+rm -f dblp.xml
+rm -f dblp_releases_partial.json
+rm -f prefix_list.txt
+rm -f dblp_container_meta.json
+rm -f existing_dblp_containers.tsv
+rm -f all_dblp_containers.tsv
+
+rm -rf ./journals/
+rm -rf ./conf/
+rm -rf ./series/
+
diff --git a/extra/dblp/dblp2ingestrequest.py b/extra/dblp/dblp2ingestrequest.py
new file mode 100755
index 00000000..bdf5575d
--- /dev/null
+++ b/extra/dblp/dblp2ingestrequest.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+Transform a transformed, fatcat-like dblp object (JSON) into zero or more
+sandcrawler ingest requests.
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+    # we crawl some of these directly via extid; others are just catalogs
+    "://arxiv.org/",
+    "://europepmc.org/",
+    #"://hdl.handle.net/",
+    "ncbi.nlm.nih.gov/",
+    "://doi.org/",
+    "zenodo.org/",
+    "figshare.com/",
+    "://d-nb.info/",
+    "://www.base-search.net/",
+]
+
+
+def canon(s):
+    """Parse URL string `s` and return its `urlcanon.whatwg` canonical form."""
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj):
+    """
+    Transforms from a single object to zero or more ingest requests.
+    Returns a list of dicts.
+
+    Objects lacking a dblp identifier or any `_dblp_ee_urls` produce an empty
+    list; URLs containing a DOMAIN_BLOCKLIST pattern are skipped.
+    """
+
+    requests = []
+    dblp_id = (obj.get("ext_ids") or {}).get("dblp")
+    if not dblp_id:
+        return requests
+
+    for url in obj.get("_dblp_ee_urls") or []:
+        if any(domain in url for domain in DOMAIN_BLOCKLIST):
+            continue
+        try:
+            base_url = canon(url)
+        except UnicodeEncodeError:
+            # skip URLs that can not be canonicalized, instead of aborting
+            continue
+
+        requests.append(
+            {
+                "base_url": base_url,
+                "ingest_type": "pdf",
+                "link_source": "dblp",
+                "link_source_id": dblp_id,
+                "ingest_request_source": "dblp",
+                "release_stage": obj.get("release_stage") or None,
+                "ext_ids": {
+                    "dblp": dblp_id,
+                },
+                "edit_extra": {},
+            }
+        )
+
+    return requests
+
+
+def run(args):
+    """Read JSON lines from `args.json_file` and print one request per line."""
+    for line in args.json_file:
+        if not line.strip():
+            continue
+        for request in transform(json.loads(line)):
+            print(json.dumps(request, sort_keys=True))
+
+
+def main():
+    """Parse command-line arguments, then run the transform."""
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file", help="dblp transformed JSON file to use", type=argparse.FileType("r")
+    )
+    args = parser.parse_args()
+
+    run(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh
new file mode 100755
index 00000000..21a50ab0
--- /dev/null
+++ b/extra/dblp/prep_metadata.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"
+
+set -e -u -o pipefail
+
+# ensure deps
+#alias fd=fdfind
+fd -h > /dev/null
+fatcat-cli -h > /dev/null
+pipenv -h > /dev/null
+
+# ensure pipenv is ready
+pipenv install
+pipenv run true
+
+
+wget -c 'https://dblp.org/xml/dblp.dtd'
+wget -c 'https://dblp.org/xml/dblp.xml.gz'
+
+zcat dblp.xml.gz > dblp.xml
+
+cd ../../python
+pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json
+
+cd ../extra/dblp/
+
+cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt
+
+mkdir -p journals
+mkdir -p conf
+mkdir -p series
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html || true  # failed fetches must not trip 'set -e'; they are cleaned up and retried below
+
+# clean up any failed/empty files, then re-run the above parallel/wget command
+find . -empty -type f -delete
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html || true  # retry pass: wget jobs that exit non-zero are tolerated
+
+find . -empty -type f -delete
+
+fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+fatcat-cli search containers 'dblp_prefix:*' -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv
+
+cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
--
cgit v1.2.3