From c32154f2875a7fb9aac727013e1475cdd811e180 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 29 Nov 2021 14:34:02 -0800 Subject: move notes/bulk_edits/ to extra/bulk_edits/ --- notes/bulk_edits/2020-12-23_dblp.md | 55 ------------------------------------- 1 file changed, 55 deletions(-) delete mode 100644 notes/bulk_edits/2020-12-23_dblp.md (limited to 'notes/bulk_edits/2020-12-23_dblp.md') diff --git a/notes/bulk_edits/2020-12-23_dblp.md b/notes/bulk_edits/2020-12-23_dblp.md deleted file mode 100644 index a33411cb..00000000 --- a/notes/bulk_edits/2020-12-23_dblp.md +++ /dev/null @@ -1,55 +0,0 @@ - -## Prod Container Import - -Using 2020-11-30 XML dump, then scrape and transform tooling from -`extra/dblp/`. - - wget https://archive.org/download/dblp-xml-2020-11-30/dblp_container_meta.json - - # updated ISSN-to-ISSN-L.txt symlink to 20201207.ISSN-to-ISSN-L.txt - - touch /srv/fatcat/datasets/blank_dblp_containers.tsv - -Create new `dblp-bot` user: - - ./target/release/fatcat-auth create-editor --admin --bot dblp-bot - => gwbheb5jfngrxkcad5qgth5cra - - ./target/release/fatcat-auth create-token gwbheb5jfngrxkcad5qgth5cra - -Run import: - - # git commit: ec6b366af8df1956e1287cba2e0818b80ce1c518 - - export FATCAT_AUTH_WORKER_DBLP=... - - ./fatcat_import.py dblp-container --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt --dblp-container-map-file /srv/fatcat/datasets/blank_dblp_containers.tsv --dblp-container-map-output /srv/fatcat/datasets/all_dblp_containers.tsv /srv/fatcat/datasets/dblp_container_meta.json - => Got 0 existing dblp container mappings. - => Counter({'total': 6954, 'insert': 5202, 'exists': 1752, 'skip': 0, 'update': 0}) - - wc -l /srv/fatcat/datasets/all_dblp_containers.tsv - 6955 /srv/fatcat/datasets/all_dblp_containers.tsv - -## Prod Release Import - -Using same 2020-11-30 XML dump. Download to /srv/fatcat/datasets: - - wget https://archive.org/download/dblp-xml-2020-11-30/dblp.dtd - wget https://archive.org/download/dblp-xml-2020-11-30/dblp.xml - -Run import: - - export FATCAT_AUTH_WORKER_DBLP=... - - ./fatcat_import.py dblp-release --dblp-container-map-file /srv/fatcat/datasets/all_dblp_containers.tsv /srv/fatcat/datasets/dblp.xml --do-updates - - # started 2020-12-23 11:51 (Pacific) - - # restarted/tweaked at least twice - - # finally ended around 2020-12-27 after about... 48 hours? - - => Counter({'total': 7953365, 'has-doi': 4277307, 'skip': 3097418, 'skip-key-type': 2640968, 'skip-update': 2480449, 'exists': 943800, 'update': 889700, 'insert': 338842, 'skip-arxiv-corr': 312872, 'exists-fuzzy': 203103, 'skip-dblp-container-missing': 143578, 'skip-arxiv': 53, 'skip-title': 1}) - -Starting database size (roughly): Size: 684.08G -Ending database size: Size: 690.22G -- cgit v1.2.3