aboutsummaryrefslogtreecommitdiffstats
path: root/extra/bulk_edits/2021-05-28_dblp.md
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:34:02 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:34:02 -0800
commitc32154f2875a7fb9aac727013e1475cdd811e180 (patch)
treef0e061498a101fa824995fb6ec9f91e7e44257e1 /extra/bulk_edits/2021-05-28_dblp.md
parentc5ea2dba358624f4c14da0a1a988ae14d0edfd59 (diff)
downloadfatcat-c32154f2875a7fb9aac727013e1475cdd811e180.tar.gz
fatcat-c32154f2875a7fb9aac727013e1475cdd811e180.zip
move notes/bulk_edits/ to extra/bulk_edits/
Diffstat (limited to 'extra/bulk_edits/2021-05-28_dblp.md')
-rw-r--r--extra/bulk_edits/2021-05-28_dblp.md44
1 files changed, 44 insertions, 0 deletions
diff --git a/extra/bulk_edits/2021-05-28_dblp.md b/extra/bulk_edits/2021-05-28_dblp.md
new file mode 100644
index 00000000..061f4f45
--- /dev/null
+++ b/extra/bulk_edits/2021-05-28_dblp.md
@@ -0,0 +1,44 @@
+
+## Container Import
+
+Following dblp README directions:
+
+ export DBLP_DIR=/srv/fatcat/tasks/202105_dblp
+
+ ./fatcat_import.py dblp-release $DBLP_DIR/dblp.xml --dump-json-mode | pv -l > $DBLP_DIR/dblp_releases.json
+ => Counter({'total': 8328073, 'skip': 8328073, 'has-doi': 4478439, 'skip-key-type': 2764750, 'skip-arxiv-corr': 348766, 'skip-title': 1, 'insert': 0, 'update': 0, 'exists': 0})
+ => 5.21M 3:38:35 [ 397 /s]
+
+ cat $DBLP_DIR/dblp_releases.json | jq ._dblp_prefix -r | grep -v ^null | sort -u > $DBLP_DIR/prefix_list.txt
+
+ wc -l $DBLP_DIR/prefix_list.txt
+ => 7603 /srv/fatcat/tasks/202105_dblp/prefix_list.txt
+
+ mkdir -p journals
+ mkdir -p conf
+ mkdir -p series
+
+ shuf $DBLP_DIR/prefix_list.txt | pv -l | parallel -j1 wget -nc -q "https://dblp.org/db/{}/index.html" -O {}.html
+
+ # switch to temporary selectolax pipenv
+ fd html conf/ journals/ series/ | /srv/fatcat/src/extra/dblp/dblp_html_extract.py | pv -l > dblp_container_meta.json
+ => 7.08k 0:00:15 [ 449 /s]
+
+ fatcat-cli search containers dblp_prefix:* -n 0 --index-json | jq "[.dblp_prefix, .ident] | @tsv" -r | pv -l > existing_dblp_containers.tsv
+ => Got 5202 hits in 47ms
+ => 5.20k 0:00:13 [ 375 /s]
+
+ ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file $DBLP_DIR/existing_dblp_containers.tsv --dblp-container-map-output $DBLP_DIR/all_dblp_containers.tsv $DBLP_DIR/dblp_container_meta.json
+ => Counter({'total': 7083, 'exists': 7025, 'insert': 58, 'skip': 0, 'update': 0})
+ => actually 108 inserted
+
+Actually imported 50 more before this, then Ctrl-C to check. Then re-did
+fatcat-cli query, upload, and re-ran all. So 108 new containers inserted.
+
+## Release Import
+
+With same exports as above:
+
+ ./fatcat_import.py dblp-release --dblp-container-map-file $DBLP_DIR/all_dblp_containers.tsv $DBLP_DIR/dblp.xml
+ => Counter({'total': 8328073, 'exists': 4847353, 'has-doi': 4478439, 'skip': 3259925, 'skip-key-type': 2764750, 'skip-arxiv-corr': 348766, 'exists-fuzzy': 202880, 'skip-dblp-container-missing': 146408, 'insert': 17862, 'skip-arxiv': 53, 'skip-title': 1, 'update': 0})
+