author     Bryan Newbold <bnewbold@robocracy.org>  2022-07-19 12:48:15 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2022-07-19 12:48:15 -0700
commit     b12d4f0bde96bfe39df1cc94a993da4b25e53304 (patch)
tree       bbd6d5db7cfa61d76b4f8bfabe80102cf4d07db7 /extra/bulk_edits/2022-07-13_dblp.md
parent     519733b77832ccbf97491a794e7f10884e39acdb (diff)
dblp import/update notes
Diffstat (limited to 'extra/bulk_edits/2022-07-13_dblp.md')
-rw-r--r--  extra/bulk_edits/2022-07-13_dblp.md  114
1 files changed, 114 insertions, 0 deletions
diff --git a/extra/bulk_edits/2022-07-13_dblp.md b/extra/bulk_edits/2022-07-13_dblp.md
new file mode 100644
index 00000000..25405132
--- /dev/null
+++ b/extra/bulk_edits/2022-07-13_dblp.md
@@ -0,0 +1,114 @@
+
+## Prep
+
+ 2022-07-13 05:24:33 (177 KB/s) - ‘dblp.xml.gz’ saved [715701831/715701831]
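+
+(The dump comes from dblp's public XML export; it was presumably fetched with
+something like the following, though the exact URL used here is an
+assumption:)
+
+    wget https://dblp.org/xml/dblp.xml.gz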
+
+ Counter({'total': 9186263, 'skip': 9186263, 'has-doi': 4960506, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0})
+ 5.71M 3:37:38 [ 437 /s]
+
+ 7.48k 0:38:18 [3.25 /s]
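+
+(The 7.48k count matches the ~7.5k dblp key prefixes in `prefix_list.txt`
+below, so that slow ~3.25/s pass was presumably the per-prefix container
+metadata harvest that produced `dblp_container_meta.json`.)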
+
+
+## Container Import
+
+Run 2022-07-15, after a database backup/snapshot.
+
+ export FATCAT_AUTH_WORKER_DBLP=[...]
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file ../extra/dblp/existing_dblp_containers.tsv --dblp-container-map-output ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp_container_meta.json
+ # Got 5310 existing dblp container mappings.
+ # Counter({'total': 7471, 'exists': 7130, 'insert': 341, 'skip': 0, 'update': 0})
+
+ wc -l existing_dblp_containers.tsv all_dblp_containers.tsv dblp_container_meta.json prefix_list.txt
+ 5310 existing_dblp_containers.tsv
+ 12782 all_dblp_containers.tsv
+ 7471 dblp_container_meta.json
+ 7476 prefix_list.txt
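+
+The line counts roughly add up: the 12,782 lines in `all_dblp_containers.tsv`
+are the 5,310 existing mappings plus the 7,471 containers just processed (the
+one extra line is presumably a header). The map itself is presumably a simple
+two-column TSV from dblp key prefix to fatcat container ident, along the lines
+of (placeholder ident, not real data):
+
+    journals/cacm	<container_ident>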
+
+
+## Release Import
+
+ export FATCAT_AUTH_WORKER_DBLP=[...]
+ ./fatcat_import.py dblp-release --dblp-container-map-file ../extra/dblp/all_dblp_containers.tsv ../extra/dblp/dblp.xml
+ # Got 7480 dblp container mappings.
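+
+(The container map is how releases get attached to fatcat containers: the
+importer looks up each release's dblp key prefix in the map, and releases
+whose prefix has no mapping presumably end up counted as
+`skip-dblp-container-missing` below.)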
+
+ /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/gg/X90 ident=gfvkxubvsfdede7ps4af3oa34q
+ warnings.warn(warn_str)
+ /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/visalg/X88 ident=lvfyrd3lvva3hjuaaokzyoscmm
+ warnings.warn(warn_str)
+ /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/msr/PerumaANMO22 ident=2grlescl2bcpvd5yoc4npad3bm
+ warnings.warn(warn_str)
+ /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=conf/dagstuhl/Brodlie97 ident=l6nh222fpjdzfotchu7vfjh6qu
+ warnings.warn(warn_str)
+    /1/srv/fatcat/src/python/fatcat_tools/importers/dblp_release.py:358: UserWarning: unexpected dblp ext_id match after lookup failed dblp=series/gidiss/2018 ident=x6t7ze4z55enrlq2dnac4qqbve
+    warnings.warn(warn_str)
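+
+(These warnings appear to flag releases where the initial lookup by dblp ID
+failed, but the entity then found by other means already carries a dblp
+ext_id; they are warnings only, and the import continues.)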
+
+ Counter({'total': 9186263, 'exists': 5356574, 'has-doi': 4960506, 'skip': 3633039, 'skip-key-type': 3037457, 'skip-arxiv-corr': 439104, 'exists-fuzzy': 192376, 'skip-dblp-container-missing': 156477, 'insert': 4216, 'skip-arxiv': 53, 'skip-dblp-id-mismatch': 5, 'skip-title': 1, 'update': 0})
+
+NOTE: had to re-try in the middle, so these counts are not accurate overall.
+
+Seems like a large number of `skip-dblp-container-missing`; maybe the
+container map file should have been re-generated differently?
+
+After this import there are 2,217,670 releases with a dblp ID, and 478,983 with
+a dblp ID and no DOI.
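+
+Counts like these can be re-derived from a release export dump; a rough sketch
+(the dump filename is an assumption):
+
+    zcat release_export_expanded.json.gz \
+        | jq -r 'select(.ext_ids.dblp != null) | [.ext_ids.dblp, .ext_ids.doi] | @tsv' \
+        | awk -F'\t' '{n++; if ($2 == "") d++} END {print n " with dblp ID, " d " with no DOI"}'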
+
+
+## Sandcrawler Seedlist Generation
+
+Almost none of the ~479k dblp releases with no DOI have an associated file.
+This implies that no ingest has happened yet, even though the fatcat importer
+does parse and filter the "fulltext" URLs out of dblp records.
+
+ cat dblp_releases_partial.json | pipenv run ./dblp2ingestrequest.py - | pv -l | gzip > dblp_sandcrawler_ingest_requests.json.gz
+ # 631k 0:02:39 [3.96k/s]
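+
+Each output line is a sandcrawler ingest request, roughly of this shape (a
+sketch; values elided, and the exact field set may differ):
+
+    {
+      "ingest_type": "pdf",
+      "base_url": "https://...",
+      "link_source": "dblp",
+      "link_source_id": "conf/...",
+      "fatcat": {"release_ident": "..."},
+      "ext_ids": {"dblp": "conf/..."}
+    }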
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | jq -r .base_url | cut -f3 -d/ | sort | uniq -c | sort -nr | head -n25
+ 43851 ceur-ws.org
+ 33638 aclanthology.org
+ 32077 aisel.aisnet.org
+ 31017 ieeexplore.ieee.org
+ 26426 dl.acm.org
+ 23817 hdl.handle.net
+ 22400 www.isca-speech.org
+ 20072 tel.archives-ouvertes.fr
+ 18609 www.aaai.org
+ 18244 eprint.iacr.org
+ 15720 ethos.bl.uk
+ 14727 nbn-resolving.org
+ 14470 proceedings.mlr.press
+ 14095 dl.gi.de
+ 12159 proceedings.neurips.cc
+ 10890 knowledge.amia.org
+ 10049 www.usenix.org
+ 9675 papers.nips.cc
+ 7541 subs.emis.de
+ 7396 openaccess.thecvf.com
+ 7345 mindmodeling.org
+ 6574 ojs.aaai.org
+ 5814 www.lrec-conf.org
+ 5773 search.ndltd.org
+ 5311 ijcai.org
+
+This is the first ingest, so let's do some sampling in the 'daily' queue:
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n100 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
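+
+(The `rg -v "\\\\"` step drops any request lines containing a backslash,
+presumably to side-step JSON escaping problems further down the pipeline.)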
+
+Looks like we can probably get away with doing these in the daily ingest queue,
+instead of bulk? Try a larger batch:
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | shuf -n10000 | rg -v "\\\\" | jq . -c | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-daily -p -1
+
+Nope, these are going to need bulk ingest and then follow-up crawling. Will do
+a heritrix crawl along with the JALC and DOAJ stuff.
+
+ zcat dblp_sandcrawler_ingest_requests.json.gz | rg -v "\\\\" | jq . -c | pv -l | kafkacat -P -b wbgrp-svc350.us.archive.org -t sandcrawler-prod.ingest-file-requests-bulk -p -1
+ # 631k 0:00:11 [54.0k/s]
+
+
+TODO:
+x python or jq transform of JSON objects
+x filter out german book/library URLs
+x ensure fatcat importer will actually import dblp matches
+x test with a small batch in daily or priority queue
+- enqueue all in bulk mode, even if processed before? many were probably attempted via MAG or OAI-PMH previously