From ca3aedcefd5c3835b6fa9b685200ef12d635d22c Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 9 Nov 2021 21:12:54 -0800
Subject: update cleanups notes

---
 notes/cleanups/file_release_ingest_bug.md | 48 +++++++++++++++++++++++++++++++
 notes/cleanups/wayback_timestamps.md      | 24 ++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'notes/cleanups')

diff --git a/notes/cleanups/file_release_ingest_bug.md b/notes/cleanups/file_release_ingest_bug.md
index 8690157a..d818905b 100644
--- a/notes/cleanups/file_release_ingest_bug.md
+++ b/notes/cleanups/file_release_ingest_bug.md
@@ -142,3 +142,51 @@ And create a sample file:
 
     shuf -n10000 /srv/fatcat/snapshots/file_release_bugfix_20211105.json > /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json
 
+
+## Testing in QA
+
+    export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+    head -n10 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | python -m fatcat_tools.cleanups.file_release_bugfix -
+    # Counter({'total': 10, 'update': 10, 'skip': 0, 'insert': 0, 'exists': 0})
+
+    file_wmjzuybuorfrjdgnr2d32vg5va: was wrong, now correct, no other files
+    file_wgszcwehlffnzmoypr4l2yhvza: was correct, now correct, no other files. multiple articles in single PDF
+    file_p4r5sbminzgrhn4yaiqyr7ahwi: was correct, now wrong (!!!)
+        doi:10.1055/s-0036-1579844
+        PDF says: 10.4103/0028-3886.158210
+        unpaywall still has this incorrect linkage
+    file_n5jtvrnodbfdbccl5d6hshhvw4: now stub, was wrong
+        doi:10.19080/bboaj.2018.04.555640 release_suvtcm7hdbbr3fczcxxt4thgoi seems correct?
+        doi:10.19080/bboaj.4.4 is the bad DOI, not in fatcat. is registered, as an entire issue
+    file_kt5fv4d5lrbk7j3dxxgcm7hph4: was wrong, now correct
+    file_jq4juugnynhsdkh3whkcjod46q: was wrong, now correct
+    file_ef7w6y7k4jhjhgwmfai37yjjmm: was wrong, now correct
+    file_ca2svhd6knfnff4dktuonp45du: was correct, now correct. complicated, multiple DOIs/copies/files of same work
+    file_borst2aewvcwzhucnyth2vf3lm: was correct, now correct. complicated, multiple DOIs, single file of same work
+
+Overall, seems like this might not be as much of an obvious improvement as
+hoped! But still progress and more correct.
+
+    head -n1000 /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | python -m fatcat_tools.cleanups.file_release_bugfix -
+    # Counter({'total': 1000, 'update': 929, 'skip-existing-history-updated': 58, 'skip-existing-fixed': 10, 'skip': 3, 'skip-link-source': 3, 'insert': 0, 'exists': 0})
+
+Looking at `skip-link-source`: these are cases where `link_source` is 'doi', not
+'fatcat-changelog'. Will update filter behavior; 'fatcat-changelog' is an
+`ingest_request_source`.
+
+Checking another 10 examples. They all seem to end up as correct matches.
+
+Did a small update, and am now running the whole batch:
+
+    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | python -m fatcat_tools.cleanups.file_release_bugfix -
+    # Counter({'total': 10000, 'update': 8499, 'skip-existing-fixed': 939, 'skip-existing-history-updated': 560, 'skip': 2, 'skip-wrong-release-is-ok': 2, 'insert': 0, 'exists': 0})
+
+I think this is ready to go! Example with parallel:
+
+    cat /srv/fatcat/snapshots/file_release_bugfix_20211105.10k_sample.json \
+        | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.file_release_bugfix -
+
diff --git a/notes/cleanups/wayback_timestamps.md b/notes/cleanups/wayback_timestamps.md
index 85e5f94f..38bc3f7d 100644
--- a/notes/cleanups/wayback_timestamps.md
+++ b/notes/cleanups/wayback_timestamps.md
@@ -265,3 +265,27 @@ Running in bulk again:
         | gzip \
         > files_20211105_moreshortts.fetched.json.gz
 
+Ran into one: `requests.exceptions.HTTPError: 503 Server Error: Service
+Temporarily Unavailable for url: [...]`. Will try again; if there are more
+failures, may need to split up into smaller chunks.
+
+Unexpected:
+
+    Traceback (most recent call last):
+      File "./fetch_full_cdx_ts.py", line 200, in <module>
+        main()
+      File "./fetch_full_cdx_ts.py", line 197, in main
+        print(json.dumps(process_file(fe, session=session)))
+      File "./fetch_full_cdx_ts.py", line 118, in process_file
+        assert seg[4].isdigit()
+    AssertionError
+    3.96M 3:04:46 [ 357 /s]
+
+Ugh.
+
+    zcat files_20211105_moreshortts.json.gz \
+        | tac \
+        | parallel -j8 --linebuffer --round-robin --pipe ./fetch_full_cdx_ts.py \
+        | pv -l \
+        | gzip \
+        > files_20211105_moreshortts.fetched.json.gz
-- 
cgit v1.2.3