From 519733b77832ccbf97491a794e7f10884e39acdb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 19 Jul 2022 12:29:37 -0700 Subject: dblp: updated ingest pipeline --- extra/dblp/.gitignore | 6 +++ extra/dblp/Pipfile | 1 + extra/dblp/README.md | 51 ++++++++++++++++++--- extra/dblp/cleanup.sh | 17 +++++++ extra/dblp/dblp2ingestrequest.py | 97 ++++++++++++++++++++++++++++++++++++++++ extra/dblp/prep_metadata.sh | 48 ++++++++++++++++++++ 6 files changed, 213 insertions(+), 7 deletions(-) create mode 100755 extra/dblp/cleanup.sh create mode 100755 extra/dblp/dblp2ingestrequest.py create mode 100755 extra/dblp/prep_metadata.sh (limited to 'extra') diff --git a/extra/dblp/.gitignore b/extra/dblp/.gitignore index a04dd76e..60774a12 100644 --- a/extra/dblp/.gitignore +++ b/extra/dblp/.gitignore @@ -4,3 +4,9 @@ series/ Pipfile.lock *.json *.html +*.txt +*.dtd +*.xml +*.xml.gz +*.tsv +*.json.gz diff --git a/extra/dblp/Pipfile b/extra/dblp/Pipfile index dbf86ac0..69705a3a 100644 --- a/extra/dblp/Pipfile +++ b/extra/dblp/Pipfile @@ -5,6 +5,7 @@ name = "pypi" [packages] selectolax = "*" +urlcanon = "*" [dev-packages] diff --git a/extra/dblp/README.md b/extra/dblp/README.md index e6ccce4f..a95f7214 100644 --- a/extra/dblp/README.md +++ b/extra/dblp/README.md @@ -1,14 +1,51 @@ -This file describes hacks used to import dblp container metadata. +This file describes hacks used to import dblp container and release metadata. -As of December 2020 this is part of the dblp release metadata import pipeline: -we must have conference and other non-ISSN containers created before running -the release import. dblp does not publish container-level metadata in a -structured format (eg, in their dumps), so scraping the HTML is unfortunately -necessary. +The container metadata must be processed and imported first, to create +containers for non-ISSN venues. 
However, dblp only publishes structured +metadata for articles (releases), not venues (containers), so we need to +process the articles, then import the containers, then import the articles. +There is a path that scrapes venue metadata out of dblp.org HTML. -## Quick Bootstrap Commands + +## New Process (2022) + +Usually all of this gets run on a production fatcat instance. It may be +possible to run parts elsewhere, but not confirmed, and would require copying +some set of files around. + + # remove any old/stale files + ./cleanup.sh + + ./prep_metadata.sh + +This will take a while to run, after which the container metadata can be +imported, like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + +Check that counts look sane: + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + +Then do release import like: + + cd ../../python + pipenv shell + export FATCAT_AUTH_WORKER_DBLP=[...] 
+    ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml
+
+Lastly, to generate sandcrawler ingest requests, from the JSON-dumped partial
+release objects:
+
+    cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+
+
+## [OLD] Manual Commands
 
 Set up a working directory somewhere:
 
diff --git a/extra/dblp/cleanup.sh b/extra/dblp/cleanup.sh
new file mode 100755
index 00000000..52e1a2ea
--- /dev/null
+++ b/extra/dblp/cleanup.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+
+rm -f dblp.dtd
+rm -f dblp.xml.gz
+rm -f dblp.xml
+rm -f dblp_releases_partial.json
+rm -f prefix_list.txt
+rm -f dblp_container_meta.json
+rm -f existing_dblp_containers.tsv
+rm -f all_dblp_containers.tsv
+
+rm -rf ./journals/
+rm -rf ./conf/
+rm -rf ./series/
+
diff --git a/extra/dblp/dblp2ingestrequest.py b/extra/dblp/dblp2ingestrequest.py
new file mode 100755
index 00000000..bdf5575d
--- /dev/null
+++ b/extra/dblp/dblp2ingestrequest.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""
+Transform a transformed, fatcat-like dblp object (JSON) into zero or more
+sandcrawler ingest requests.
+"""
+
+import argparse
+import json
+import sys
+
+import urlcanon
+
+DOMAIN_BLOCKLIST = [
+    # we crawl some of these directly via extid; others are just catalogs
+    "://arxiv.org/",
+    "://europepmc.org/",
+    #"://hdl.handle.net/",
+    "ncbi.nlm.nih.gov/",
+    "://doi.org/",
+    "zenodo.org/",
+    "figshare.com/",
+    "://d-nb.info/",
+    "://www.base-search.net/",
+]
+
+
+def canon(s):
+    """Return the URL string `s` canonicalized with urlcanon's WHATWG rules."""
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
+
+
+def transform(obj):
+    """
+    Transforms from a single object to zero or more ingest requests.
+    Returns a list of dicts.
+
+    Objects with no dblp external identifier, or with no `_dblp_ee_urls`,
+    yield an empty list. URLs that contain any DOMAIN_BLOCKLIST substring,
+    or whose canonicalization raises UnicodeEncodeError, are skipped; every
+    remaining URL becomes one "pdf" ingest request.
+    """
+    # tolerate a missing or null "ext_ids" instead of raising KeyError
+    dblp_id = (obj.get("ext_ids") or {}).get("dblp")
+    if not dblp_id:
+        return []
+
+    requests = []
+    for url in obj.get("_dblp_ee_urls") or []:
+        if any(domain in url for domain in DOMAIN_BLOCKLIST):
+            continue
+        try:
+            base_url = canon(url)
+        except UnicodeEncodeError:
+            continue
+
+        requests.append(
+            {
+                "base_url": base_url,
+                "ingest_type": "pdf",
+                "link_source": "dblp",
+                "link_source_id": dblp_id,
+                "ingest_request_source": "dblp",
+                "release_stage": obj.get("release_stage") or None,
+                "ext_ids": {
+                    "dblp": dblp_id,
+                },
+                "edit_extra": {},
+            }
+        )
+
+    return requests
+
+
+def run(args):
+    """Read JSON lines from `args.json_file`; print each request as sorted-key JSON."""
+    for line in args.json_file:
+        if not line.strip():
+            continue
+        for request in transform(json.loads(line)):
+            print(json.dumps(request, sort_keys=True))
+
+
+def main():
+    """Parse the command line (`json_file`, where "-" means stdin) and call run()."""
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "json_file", help="dblp transformed JSON file to use", type=argparse.FileType("r")
+    )
+
+    args = parser.parse_args()
+
+    run(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/extra/dblp/prep_metadata.sh b/extra/dblp/prep_metadata.sh
new file mode 100755
index 00000000..21a50ab0
--- /dev/null
+++ b/extra/dblp/prep_metadata.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# run this as 'fatcat' user on a production machine
+#export FATCAT_API_HOST="https://api.fatcat.wiki/v0"
+
+set -e -u -o pipefail
+
+# ensure deps
+#alias fd=fdfind
+fd -h > /dev/null
+fatcat-cli -h > /dev/null
+pipenv -h > /dev/null
+
+# ensure pipenv is ready
+pipenv install
+pipenv run true
+
+
+wget -c 'https://dblp.org/xml/dblp.dtd'
+wget -c 'https://dblp.org/xml/dblp.xml.gz'
+
+zcat dblp.xml.gz > dblp.xml
+
+cd ../../python
+pipenv run ./fatcat_import.py dblp-release ../extra/dblp/dblp.xml --dump-json-mode | pv -l >
../extra/dblp/dblp_releases_partial.json + +cd ../extra/dblp/ + +cat dblp_releases_partial.json | jq ._dblp_prefix -r | grep -v ^null | rg '^(journals|conf|series)' | sort -u > prefix_list.txt + +mkdir -p journals +mkdir -p conf +mkdir -p series + +shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +# clean up any failed/empty files, then re-run the above parallel/wget command +find . -empty -type f -delete + +shuf prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html + +find . -empty -type f -delete + +fd -I html conf/ journals/ series/ | pipenv run ./dblp_html_extract.py | pv -l > dblp_container_meta.json + +fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv + +cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz -- cgit v1.2.3 From b12d4f0bde96bfe39df1cc94a993da4b25e53304 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 19 Jul 2022 12:48:15 -0700 Subject: dblp import/update notes --- extra/bulk_edits/2022-07-13_dblp.md | 114 ++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 extra/bulk_edits/2022-07-13_dblp.md (limited to 'extra') diff --git a/extra/bulk_edits/2022-07-13_dblp.md b/extra/bulk_edits/2022-07-13_dblp.md new file mode 100644 index 00000000..25405132 --- /dev/null +++ b/extra/bulk_edits/2022-07-13_dblp.md @@ -0,0 +1,114 @@ + +## Prep + + 2022-07-13 05:24:33 (177 KB/s) - ‘dblp.xml.gz’ saved [715701831/715701831] + + Counter({'total': 9186263, 'skip': 9186263, 'has-doi': 4960506, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0}) + 5.71M 3:37:38 [ 437 /s] + + 7.48k 0:38:18 [3.25 /s] + + +## Container Import + +Run 2022-07-15, after a database backup/snapshot. 
+ + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json + # Got 5310 existing dblp container mappings. + # Counter({'total': 7471, 'exists': 7130, 'insert': 341, 'skip': 0, 'update': 0}) + + wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt + 5310 existing_dblp_containers.tsv + 12782 all_dblp_containers.tsv + 7471 dblp_container_meta.json + 7476 prefix_list.txt + + +## Release Import + + export FATCAT_AUTH_WORKER_DBLP=[...] + ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml + # Got 7480 dblp container mappings. + + /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/gg/X90 ident=gfvkxubvsfdede7ps4af3oa34q + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/visalg/X88 ident=lvfyrd3lvva3hjuaaokzyoscmm + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/msr/PerumaANMO22 ident=2grlescl2bcpvd5yoc4npad3bm + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/dagstuhl/Brodlie97 ident=l6nh222fpjdzfotchu7vfjh6qu + warnings.warn(warn_str) + /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=series/gidiss/2018 ident=x6t7ze4z55enrlq2dnac4qqbve + + Counter({'total': 9186263, 'exists': 5356574, 
'has-doi': 4960506, 'skip': 3633039, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'exists-fuzzy': 192376, 'skip-dblp-container-missing': 156477, 'insert': 4216, 'skip-arxiv': 53, 'skip-dblp-id-mismatch': 5, 'skip-title': 1, 'update': 0}) + +NOTE: had to re-try in the middle, so these counts not accurate overall. + +Seems like a large number of `skip-dblp-container-missing`. Maybe should have +re-generated that file differently? + +After this import there are 2,217,670 releases with a dblp ID, and 478,983 with +a dblp ID and no DOI. + + +## Sandcrawler Seedlist Generation + +Almost none of the ~487k dblp releases with no DOI have an associated file. +This implies that no ingest has happened yet, even though the fatcat importer +does parse and filter the "fulltext" URLs out of dblp records. + + cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz + # 631k 0:02:39 [3.96k/s] + + zcat dblp_sandcrawler_ingest_requests.json.gz | jq -r .base_url | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n25 + 43851 ceur-ws.org + 33638 aclanthology.org + 32077 aisel.aisnet.org + 31017 ieeexplore.ieee.org + 26426 dl.acm.org + 23817 hdl.handle.net + 22400 www.isca-speech.org + 20072 tel.archives-ouvertes.fr + 18609 www.aaai.org + 18244 eprint.iacr.org + 15720 ethos.bl.uk + 14727 nbn-resolving.org + 14470 proceedings.mlr.press + 14095 dl.gi.de + 12159 proceedings.neurips.cc + 10890 knowledge.amia.org + 10049 www.usenix.org + 9675 papers.nips.cc + 7541 subs.emis.de + 7396 openaccess.thecvf.com + 7345 mindmodeling.org + 6574 ojs.aaai.org + 5814 www.lrec-conf.org + 5773 search.ndltd.org + 5311 ijcai.org + +This is the first ingest, so let's do some sampling in the 'daily' queue: + + zcat dblp_sandcrawler_ingest_requests.json.gz + + zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n100 | rg -v "\\\\" | jq . 
-c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Looks like we can probably get away with doing these in the daily ingest queue, +instead of bulk? Try a larger batch: + + zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n10000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1 + +Nope, these are going to need bulk ingest then follow-up crawling. Will +heritrix crawl along with JALC and DOAJ stuff. + + zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1 + # 631k 0:00:11 [54.0k/s] + + +TODO: +x python or jq transform of JSON objects +x filter out german book/library URLs +x ensure fatcat importer will actually import dblp matches +x test with a small batch in daily or priority queue +- enqueue all in bulk mode, even if processed before? many probably MAG or OAI-PMH previously -- cgit v1.2.3