dblp: updated ingest pipeline

author: Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 12:29:37 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2022-07-19 12:29:37 -0700
commit: 519733b77832ccbf97491a794e7f10884e39acdb (patch)
tree: 9fef34e691b4370e372cdf9d01457017835defdf
parent: 94cce3ebe325f7601feee7dbe7ab8b24aa2492ee (diff)
download: fatcat-519733b77832ccbf97491a794e7f10884e39acdb.tar.gz
fatcat-519733b77832ccbf97491a794e7f10884e39acdb.zip
6 files changed, 213 insertions, 7 deletions
diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore
index a04dd76e..60774a12 100644
--- a/extra/dblp/.gitignore
+++ b/extra/dblp/.gitignore
@@ -4,3 +4,9 @@ series/
 Pipfile.lock
 *.json
 *.html
+*.txt
+*.dtd
+*.xml
+*.xml.gz
+*.tsv
+*.json.gz
diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile
index dbf86ac0..69705a3a 100644
--- a/extra/dblp/Pipfile
+++ b/extra/dblp/Pipfile
@@ -5,6 +5,7 @@ name = "pypi"
 
 [packages]
 selectolax = "*"
+urlcanon = "*"
 
 [dev-packages]
 
diff --git a/extra/dblp/README.md b/extra/dblp/README.md
index e6ccce4f..a95f7214 100644
--- a/extra/dblp/README.md
+++ b/extra/dblp/README.md
@@ -1,14 +1,51 @@
 
-This file describes hacks used to import dblp container metadata.
+This file describes hacks used to import dblp container and release metadata.
 
-As of December 2020 this is part of the dblp release metadata import pipeline:
-we must have conference and other non-ISSN containers created before running
-the release import. dblp does not publish container-level metadata in a
-structured format (eg, in their dumps), so scraping the HTML is unfortunately
-necessary.
+The container metadata must be processed and imported first, to create
+containers for non-ISSN venues. However, dblp only publishes structured
+metadata for articles (releases), not venues (containers), so we need to
+process the articles, then import the containers, then import the articles.
 
+Venue (container) metadata is obtained by scraping dblp.org HTML pages.
 
-## Quick Bootstrap Commands
+
+## New Process (2022)
+
+Usually all of this gets run on a production fatcat instance. It may be
+possible to run parts elsewhere, but this is unconfirmed, and it would require
+copying some set of files around.
+
+    # remove any old/stale files
+    ./cleanup.sh
+
+    ./prep_metadata.sh
+
+This will take a while to run, after which the container metadata can be
+imported, like:
+
+    cd ../../python
+    pipenv shell
+    export FATCAT_AUTH_WORKER_DBLP=[...]
+    ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json
+
+Check that counts look sane (run from the `extra/dblp/` directory):
+
+    wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt
+
+Then do release import like:
+
+    cd ../../python
+    pipenv shell
+    export FATCAT_AUTH_WORKER_DBLP=[...]
+    ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml
+
+Lastly, generate sandcrawler ingest requests from the JSON-dumped partial
+release objects:
+
+    cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+
+
+## [OLD] Manual Commands
 
 Set up a working directory somewhere:
 
diff --git a/extra/dblp/cleanup.sh b/extra/dblp/cleanup.sh
new file mode 100755
index 00000000..52e1a2ea
--- /dev/null
+++ b/extra/dblp/cleanup.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+
+rm -f dblp.dtd
+rm -f dblp.xml.gz
+rm -f dblp.xml
+rm -f dblp_releases_partial.json
+rm -f prefix_list.txt
+rm -f dblp_container_meta.json
+rm -f existing_dblp_containers.tsv
+rm -f all_dblp_containers.tsv
+
+rm -rf ./journals/
+rm -rf ./conf/
+rm -rf ./series/
+
diff --git a/extra/dblp/dblp2ingestrequest.py b/extra/dblp/dblp2ingestrequest.py
new file mode 100755
index 00000000..bdf5575d
--- /dev/null
+++ b/extra/dblp/dblp2ingestrequest.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""
+Convert a fatcat-like dblp release object (JSON, as dumped by the dblp-release
+importer) into zero or more sandcrawler ingest requests.
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+    # we crawl some of these directly via extid; others are just catalogs
+    "://arxiv.org/",
+    "://europepmc.org/",
+    #"://hdl.handle.net/",
+    "ncbi.nlm.nih.gov/",
+    "://doi.org/",
+    "zenodo.org/",
+    "figshare.com/",
+    "://d-nb.info/",
+    "://www.base-search.net/",
+]
+
+
+def canon(s):
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj):
+    """
+    Transforms from a single object to zero or more ingest requests.
+    Returns a list of dicts.
+    """
+
+    requests = []
+    if not obj["ext_ids"].get("dblp"):
+        return requests
+    if not obj.get("_dblp_ee_urls"):
+        return requests
+
+    for url in obj["_dblp_ee_urls"]:
+        skip = False
+        for domain in DOMAIN_BLOCKLIST:
+            if domain in url:
+                skip = True
+        if skip:
+            continue
+        try:
+            base_url = canon(url)
+        except UnicodeEncodeError:
+            continue
+
+        request = {
+            "base_url": base_url,
+            "ingest_type": "pdf",
+            "link_source": "dblp",
+            "link_source_id": obj["ext_ids"]["dblp"],
+            "ingest_request_source": "dblp",
+            "release_stage": obj.get("release_stage") or None,
+            "ext_ids": {
+                "dblp": obj["ext_ids"]["dblp"],
+            },
+            "edit_extra": {},
+        }
+        requests.append(request)
+
+    return requests
+
+
+def run(args):
+    for l in args.json_file:
+        if not l.strip():
+            continue
+        row = json.loads(l)
+
+        requests = transform(row) or []
+        for r in requests:
+            print("{}".format(json.dumps(r, sort_keys=True)))
+
+
+def main():
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file", help="dblp transformed JSON file to use", type=argparse.FileType("r")
+    )
+    subparsers = parser.add_subparsers()
+
+    args = parser.parse_args()
+
+    run(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh
new file mode 100755
index 00000000..21a50ab0
--- /dev/null
+++ b/extra/dblp/prep_metadata.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"
+
+set -e -u -o pipefail
+
+# ensure deps
+#alias fd=fdfind
+fd -h > /dev/null
+fatcat-cli -h > /dev/null
+pipenv -h > /dev/null
+
+# ensure pipenv is ready
+pipenv install
+pipenv run true
+
+
+wget -c 'https://dblp.org/xml/dblp.dtd'
+wget -c 'https://dblp.org/xml/dblp.xml.gz'
+
+zcat dblp.xml.gz > dblp.xml
+
+cd ../../python
+pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l > ../extra/dblp/dblp_releases_partial.json
+
+cd ../extra/dblp/
+
+cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt
+
+mkdir -p journals
+mkdir -p conf
+mkdir -p series
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+# clean up any failed/empty files, then re-run the above parallel/wget command
+find . -empty -type f -delete
+
+shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+find . -empty -type f -delete
+
+fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json
+
+fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv
+
+cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
author	Bryan Newbold <bnewbold@robocracy.org>	2022-07-19 12:29:37 -0700
committer	Bryan Newbold <bnewbold@robocracy.org>	2022-07-19 12:29:37 -0700
commit	519733b77832ccbf97491a794e7f10884e39acdb (patch)
tree	9fef34e691b4370e372cdf9d01457017835defdf
parent	94cce3ebe325f7601feee7dbe7ab8b24aa2492ee (diff)
download	fatcat-519733b77832ccbf97491a794e7f10884e39acdb.tar.gz fatcat-519733b77832ccbf97491a794e7f10884e39acdb.zip