aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2022-07-19 12:29:37 -0700
committerBryan Newbold <bnewbold@robocracy.org>2022-07-19 12:29:37 -0700
commit519733b77832ccbf97491a794e7f10884e39acdb (patch)
tree9fef34e691b4370e372cdf9d01457017835defdf
parent94cce3ebe325f7601feee7dbe7ab8b24aa2492ee (diff)
downloadfatcat-519733b77832ccbf97491a794e7f10884e39acdb.tar.gz
fatcat-519733b77832ccbf97491a794e7f10884e39acdb.zip
dblp: updated ingest pipeline
-rw-r--r--extra/dblp/.gitignore6
-rw-r--r--extra/dblp/Pipfile1
-rw-r--r--extra/dblp/README.md51
-rwxr-xr-xextra/dblp/cleanup.sh17
-rwxr-xr-xextra/dblp/dblp2ingestrequest.py97
-rwxr-xr-xextra/dblp/prep_metadata.sh48
6 files changed, 213 insertions, 7 deletions
diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore
index a04dd76e..60774a12 100644
--- a/extra/dblp/.gitignore
+++ b/extra/dblp/.gitignore
@@ -4,3 +4,9 @@ series/
Pipfile.lock
*.json
*.html
+*.txt
+*.dtd
+*.xml
+*.xml.gz
+*.tsv
+*.json.gz
diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile
index dbf86ac0..69705a3a 100644
--- a/extra/dblp/Pipfile
+++ b/extra/dblp/Pipfile
@@ -5,6 +5,7 @@ name = "pypi"
[packages]
selectolax = "*"
+urlcanon = "*"
[dev-packages]
diff --git a/extra/dblp/README.md b/extra/dblp/README.md
index e6ccce4f..a95f7214 100644
--- a/extra/dblp/README.md
+++ b/extra/dblp/README.md
@@ -1,14 +1,51 @@
-This file describes hacks used to import dblp container metadata.
+This file describes hacks used to import dblp container and release metadata.
-As of December 2020 this is part of the dblp release metadata import pipeline:
-we must have conference and other non-ISSN containers created before running
-the release import. dblp does not publish container-level metadata in a
-structured format (eg, in their dumps), so scraping the HTML is unfortunately
-necessary.
+The container metadata must be processed and imported first, to create
+containers for non-ISSN venues. However, dblp only publishes structured
+metadata for articles (releases), not venues (containers), so we need to
+process the articles, then import the containers, then import the articles.
+There is a path that scrapes venue metadata out of dblp.org HTML.
-## Quick Bootstrap Commands
+
+## New Process (2022)
+
+Usually all of this gets run on a production fatcat instance. It may be
+possible to run parts elsewhere, but not confirmed, and would require copying
+some set of files around.
+
+ # remove any old/stale files
+ ./cleanup.sh
+
+ ./prep_metadata.sh
+
+This will take a while to run, after which the container metadata can be
+imported, like:
+
+ cd ../../python
+ pipenv shell
+ export FATCAT_AUTH_WORKER_DBLP=[...]
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json
+
+Check that counts look sane:
+
+ wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt
+
+Then do release import like:
+
+ cd ../../python
+ pipenv shell
+ export FATCAT_AUTH_WORKER_DBLP=[...]
+ ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml
+
+Lastly, generate sandcrawler ingest requests from the JSON-dumped partial
+release objects:
+
+ cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+
+
+## [OLD] Manual Commands
Set up a working directory somewhere:
diff --git a/extra/dblp/cleanup.sh b/extra/dblp/cleanup.sh
new file mode 100755
index 00000000..52e1a2ea
--- /dev/null
+++ b/extra/dblp/cleanup.sh
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# Deletes the files and directories left over from a previous dblp metadata
# run. All paths are relative to the current working directory.
#
# run this as 'fatcat' user on a production machine

# dblp dumps plus the intermediate/derived metadata files
stale_files=(
    dblp.dtd
    dblp.xml.gz
    dblp.xml
    dblp_releases_partial.json
    prefix_list.txt
    dblp_container_meta.json
    existing_dblp_containers.tsv
    all_dblp_containers.tsv
)
rm -f "${stale_files[@]}"

# working directories (removed recursively)
rm -rf ./journals/ ./conf/ ./series/
+
diff --git a/extra/dblp/dblp2ingestrequest.py b/extra/dblp/dblp2ingestrequest.py
new file mode 100755
index 00000000..bdf5575d
--- /dev/null
+++ b/extra/dblp/dblp2ingestrequest.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""
+Convert an already-transformed, fatcat-like dblp object (JSON) into zero or
+more sandcrawler ingest requests.
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
# URL fragments that exclude a dblp link from ingest. Entries are matched as
# plain substrings of the URL, so a leading "://" pins the match to the start
# of the hostname while entries without it also match subdomains.
DOMAIN_BLOCKLIST = [
    # we crawl some of these directly via extid; others are just catalogs
    "://arxiv.org/",
    "://europepmc.org/",
    # disabled entry (handle.net links are currently NOT blocked):
    # "://hdl.handle.net/",
    "ncbi.nlm.nih.gov/",
    "://doi.org/",
    "zenodo.org/",
    "figshare.com/",
    "://d-nb.info/",
    "://www.base-search.net/",
]
+
+
def canon(s):
    """
    Canonicalizes a URL string.

    Parses `s` with `urlcanon.parse_url()`, passes the parsed URL through
    `urlcanon.whatwg`, and returns the result converted to `str`.
    """
    return str(urlcanon.whatwg(urlcanon.parse_url(s)))
+
+
def transform(obj):
    """
    Transforms from a single object to zero or more ingest requests.
    Returns a list of dicts.

    `obj` is a dict. An empty list is returned when it has no dblp identifier
    under `ext_ids` (including when `ext_ids` itself is missing or null) or
    no `_dblp_ee_urls`.

    One request is emitted per URL in `_dblp_ee_urls`, except that:

    - URLs containing any `DOMAIN_BLOCKLIST` substring are skipped
    - URLs for which `canon()` raises `UnicodeEncodeError` are skipped
    """
    # tolerate a missing or null "ext_ids" instead of raising
    dblp_id = (obj.get("ext_ids") or {}).get("dblp")
    urls = obj.get("_dblp_ee_urls")
    if not dblp_id or not urls:
        return []

    requests = []
    for url in urls:
        # the blocklist is matched against the raw (un-canonicalized) URL
        if any(domain in url for domain in DOMAIN_BLOCKLIST):
            continue
        try:
            base_url = canon(url)
        except UnicodeEncodeError:
            continue

        requests.append(
            {
                "base_url": base_url,
                "ingest_type": "pdf",
                "link_source": "dblp",
                "link_source_id": dblp_id,
                "ingest_request_source": "dblp",
                # normalize empty/missing stage to an explicit null
                "release_stage": obj.get("release_stage") or None,
                "ext_ids": {
                    "dblp": dblp_id,
                },
                "edit_extra": {},
            }
        )

    return requests
+
+
def run(args):
    """
    Streams `args.json_file` (an iterable of lines, one JSON object per
    line) through `transform()` and prints every resulting ingest request
    to stdout as a single line of JSON with sorted keys.

    Lines that are empty or whitespace-only are skipped.
    """
    for line in args.json_file:
        if not line.strip():
            continue
        row = json.loads(line)

        for request in transform(row) or []:
            print(json.dumps(request, sort_keys=True))
+
+
def main():
    """
    Command-line entry point.

    Takes a single positional argument, `json_file`, which is opened for
    reading by argparse (`argparse.FileType("r")`), then hands the parsed
    arguments to `run()`.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file",
        help="dblp transformed JSON file to use",
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()

    run(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh
new file mode 100755
index 00000000..21a50ab0
--- /dev/null
+++ b/extra/dblp/prep_metadata.sh
@@ -0,0 +1,48 @@
#!/usr/bin/env bash

# Prepares the metadata files for a dblp import. Steps, in order:
#
#   1. download the dblp DTD and XML dump
#   2. dump partial release objects as JSON (dblp_releases_partial.json)
#   3. collect the journals/conf/series prefixes (prefix_list.txt)
#   4. fetch the dblp.org index page for every prefix
#   5. extract container metadata from the HTML (dblp_container_meta.json)
#   6. list containers that already have a dblp_prefix
#      (existing_dblp_containers.tsv)
#   7. generate sandcrawler ingest requests
#      (dblp_sandcrawler_ingest_requests.json.gz)
#
# Paths are relative: the script expects to be started from extra/dblp/.
#
# run this as 'fatcat' user on a production machine
#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"

set -e -u -o pipefail

# ensure deps
#alias fd=fdfind
fd -h > /dev/null
fatcat-cli -h > /dev/null
pipenv -h > /dev/null

# fail fast, before the long download/dump steps, if any other tool used
# further down is missing
for tool in wget zcat jq rg pv shuf parallel gzip; do
    command -v "$tool" > /dev/null || { echo "missing required tool: $tool" >&2; exit 1; }
done

# ensure pipenv is ready
pipenv install
pipenv run true


wget -c 'https://dblp.org/xml/dblp.dtd'
wget -c 'https://dblp.org/xml/dblp.xml.gz'

zcat dblp.xml.gz > dblp.xml

cd ../../python
pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json

cd ../extra/dblp/

cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v '^null' | rg '^(journals|conf|series)' | sort -u > prefix_list.txt

mkdir -p journals
mkdir -p conf
mkdir -p series

# parallel exits non-zero as soon as any single wget job does, which under
# 'set -e -o pipefail' would abort the script before the cleanup/retry below.
# Individual fetch failures are expected here, so they are tolerated.
shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html || true

# clean up any failed/empty files, then re-run the above parallel/wget command
find . -empty -type f -delete

# second pass: '-nc' leaves pages fetched by the first pass alone
shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html || true

find . -empty -type f -delete

fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json

# quote the query so the shell never glob-expands the '*'
fatcat-cli search containers 'dblp_prefix:*' -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv

cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz