From c32154f2875a7fb9aac727013e1475cdd811e180 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 29 Nov 2021 14:34:02 -0800
Subject: move notes/bulk_edits/ to extra/bulk_edits/

---
 extra/bulk_edits/2020-12-14_doaj.md | 139 ++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)
 create mode 100644 extra/bulk_edits/2020-12-14_doaj.md

diff --git a/extra/bulk_edits/2020-12-14_doaj.md b/extra/bulk_edits/2020-12-14_doaj.md
new file mode 100644
index 00000000..5e897183
--- /dev/null
+++ b/extra/bulk_edits/2020-12-14_doaj.md
@@ -0,0 +1,139 @@

## Earlier QA Testing (November 2020)

    export FATCAT_API_AUTH_TOKEN=... (FATCAT_AUTH_WORKER_DOAJ)

    # small test:
    zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | head | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -

    # full run
    zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -

    before: 519.17G
    after:  542.08G

    5.45M 6:29:17 [ 233 /s]

    12x of:
    Counter({'total': 455504, 'insert': 394437, 'exists': 60615, 'skip': 452, 'skip-title': 452, 'update': 0})

    total:  ~5,466,048
    insert: ~4,733,244
    exists:   ~727,380

Initial imports (before a crash) looked like:

    Counter({'total': 9339, 'insert': 9330, 'skip': 9, 'skip-title': 9, 'update': 0, 'exists': 0})

Seems like there is a bug: existing releases are not being found by DOI ('exists': 0 above)?

## Prod Container Metadata Update (chocula)

Generic update of container metadata using the chocula pipeline. This needs to run before the DOAJ import, to ensure all the containers are already updated. Also updating the ISSN-L index at the same time.

Using a 2020-11-19 metadata snapshot, which was generated on 2020-12-07; more recent snapshots had small upstream changes to some formats, so it wasn't trivial to run with a newer snapshot.

    # git rev: 9f67c82ce8952bbe9a7a07b732830363c7865485

    # from laptop, then unzip on prod machine
    scp chocula_fatcat_export.2020-11-19.json.gz fatcat-prod1-vm:/srv/fatcat/datasets/

    # check ISSN-L symlink
    # ISSN-to-ISSN-L.txt -> 20201119.ISSN-to-ISSN-L.txt

    export FATCAT_AUTH_WORKER_JOURNAL_METADATA=...
    head -n200 /srv/fatcat/datasets/chocula_fatcat_export.2020-11-19.json | ./fatcat_import.py chocula -
    Counter({'total': 200, 'exists': 200, 'exists-by-issnl': 6, 'skip': 0, 'insert': 0, 'update': 0})

    head -n200 /srv/fatcat/datasets/chocula_fatcat_export.2020-11-19.json | ./fatcat_import.py chocula - --do-updates
    Counter({'total': 200, 'exists': 157, 'exists-skip-update': 151, 'update': 43, 'exists-by-issnl': 6, 'skip': 0, 'insert': 0})

Some of these are very minor updates, so going to start with creation only (no `--do-updates`).

    time ./fatcat_import.py chocula /srv/fatcat/datasets/chocula_fatcat_export.2020-11-19.json
    Counter({'total': 168165, 'exists': 167497, 'exists-by-issnl': 2371, 'insert': 668, 'skip': 0, 'update': 0})

    real    5m37.081s
    user    3m1.648s
    sys     0m9.488s

TODO: tweak the chocula import script so that a change in `extra.state` metadata alone does not trigger an update.

## Release Metadata Bulk Import

This is the first production bulk import of DOAJ metadata!

    # git rev: 9f67c82ce8952bbe9a7a07b732830363c7865485
    # DB before: Size: 678.15G

    # ensure fatcatd is updated to have support for the DOAJ identifier

    # create new bot user
    ./target/release/fatcat-auth create-editor --admin --bot doaj-bot
    => mir5imb3v5ctxcaqnbstvmri2a

    ./target/release/fatcat-auth create-token mir5imb3v5ctxcaqnbstvmri2a
    => ...

    # download dataset
    wget https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2020-11-13.sample_10k.json.gz
    wget https://archive.org/download/doaj_data_2020-11-13/doaj_article_data_2020-11-13_all.json.gz

    export FATCAT_AUTH_WORKER_DOAJ=...

    # start small
    zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13.sample_10k.json.gz | head -n100 | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
    => Counter({'total': 100, 'exists': 70, 'insert': 30, 'skip': 0, 'update': 0})

That insert fraction is about what was expected, given the fraction of records without a DOI. However, 6 out of 10 randomly checked inserted releases seem to be dupes, which feels too high. So pausing this import until basic fuzzy matching is ready from Martin's fuzzycat work, and will check against elasticsearch before inserting. The plan is to shuffle the entire file, import in a single thread, and just skip importing if there is any fuzzy match (not try to merge/update); see the sketch below. Expecting about 500k new releases after such filtering.
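Below is a minimal sketch of that skip-on-fuzzy-match idea, for illustration only: the real filtering is fuzzycat inside `fatcat_import.py`, and the elasticsearch endpoint, the `fatcat_release` index name, and the crude title-match heuristic here are all assumptions.

    # sketch_fuzzy_filter.py: hypothetical pre-insert duplicate filter.
    # NOT the actual fuzzycat implementation; endpoint, index name, and
    # the title-match heuristic are assumptions for illustration.
    import json
    import sys

    import requests

    ES_URL = "https://search.fatcat.wiki/fatcat_release/_search"  # assumed

    def has_probable_match(record: dict) -> bool:
        """Treat any indexed release sharing all title words as a match."""
        title = record.get("bibjson", {}).get("title")
        if not title:
            return False
        query = {
            "size": 1,
            "query": {"match": {"title": {"query": title, "operator": "and"}}},
        }
        resp = requests.get(ES_URL, json=query, timeout=30)
        resp.raise_for_status()
        total = resp.json()["hits"]["total"]
        # elasticsearch 7.x nests the hit count as {"value": N}; older
        # versions return a bare integer
        return (total["value"] if isinstance(total, dict) else total) > 0

    if __name__ == "__main__":
        # pass through only DOAJ article JSON lines with no probable match
        for line in sys.stdin:
            if not has_probable_match(json.loads(line)):
                sys.stdout.write(line)

Skipping on any match, rather than trying to merge or update, keeps the importer simple; a false positive just means one DOAJ record doesn't get imported, which seems acceptable at this scale.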
Ok, on 2020-12-17, back with patches that use fuzzycat for filtering. Trying another batch:

    # git rev: 60e022609cd3fbbf9634577149018592e680858d
    # DB before: Size: 678.47G

    export FATCAT_AUTH_WORKER_DOAJ=...

    zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13.sample_10k.json.gz | head -n1000 | tail -n100 | ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
    => Counter({'total': 100, 'exists': 71, 'insert': 19, 'exists-fuzzy': 10, 'skip': 0, 'update': 0})

    # https://fatcat.wiki/changelog/5033496

Sampled 10 of these inserts and they look much better: no obvious duplication. Going ahead with the full import; note that other ingest is happening in parallel (many crossref, datacite, and pubmed imports, which had backed up).

    # full run
    # note the added shuf command, an attempt to reduce duplicates within this corpus
    zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -

    # started 2020-12-17 22:01 (Pacific)

    => 5.45M 52:38:45 [28.8 /s]
    => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0})

As total estimates, extrapolated from the per-worker counter above (see the counter-summing sketch at the end of these notes):

- total: 5,465,832
- exists: 4,081,180
- exists-fuzzy: 577,336
- insert: 800,996

Ending database size: 684.08G

(note that regular imports were running during the same period)
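As an aside, the totals above are extrapolated rather than summed. A small hypothetical helper along these lines could add up the `Counter({...})` lines that the parallel workers actually print (captured in a log file) to get exact totals; the script name and log-capture workflow are assumptions:

    # sum_counters.py: hypothetical helper to sum the Counter({...}) lines
    # printed by parallel fatcat_import.py workers, giving exact totals
    # instead of extrapolated estimates.
    import ast
    import re
    import sys
    from collections import Counter

    COUNTER_RE = re.compile(r"Counter\((\{.*?\})\)")

    def sum_counters(lines):
        totals = Counter()
        for line in lines:
            m = COUNTER_RE.search(line)
            if m:
                # the payload inside Counter(...) is a plain dict literal,
                # safe to parse with ast.literal_eval
                totals.update(ast.literal_eval(m.group(1)))
        return totals

    if __name__ == "__main__":
        # usage: python sum_counters.py < import.log
        print(sum_counters(sys.stdin))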