summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-09 16:56:28 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-09 16:56:28 -0800
commitd9284d421618742f5ecd76ba2c6d92dcefa5e5db (patch)
tree934fb0c4df6cf1e32df09840a60c08199658e183
parent78eb2cc0f3f81c35474eb68231fe5f60d47bbcde (diff)
downloadfatcat-d9284d421618742f5ecd76ba2c6d92dcefa5e5db.tar.gz
fatcat-d9284d421618742f5ecd76ba2c6d92dcefa5e5db.zip
updates to lowercase DOI cleanup
-rw-r--r--notes/cleanups/case_sensitive_dois.md71
-rw-r--r--python/fatcat_tools/cleanups/release_lowercase_doi.py22
2 files changed, 86 insertions, 7 deletions
diff --git a/notes/cleanups/case_sensitive_dois.md b/notes/cleanups/case_sensitive_dois.md
new file mode 100644
index 00000000..1bf1901e
--- /dev/null
+++ b/notes/cleanups/case_sensitive_dois.md
@@ -0,0 +1,71 @@
+
+Relevant github issue: https://github.com/internetarchive/fatcat/issues/83
+
+How many existing fatcat releases have a non-lowercase DOI? As of June 2021:
+
+ zcat release_extid.tsv.gz | cut -f3 | rg '[A-Z]' | pv -l | wc -l
+ 139964
+
+## Prep
+
+ wget https://archive.org/download/fatcat_bulk_exports_2021-11-05/release_extid.tsv.gz
+
+ # scratch:bin/fcid.py is roughly the same as `fatcat_util.py uuid2fcid`
+
+ zcat release_extid.tsv.gz \
+ | cut -f1,3 \
+ | rg '[A-Z]' \
+ | /fast/scratch/bin/fcid.py \
+ | pv -l \
+ > nonlowercase_doi_releases.tsv
+ # 140k 0:03:54 [ 599 /s]
+
+ wc -l nonlowercase_doi_releases.tsv
+ 140530 nonlowercase_doi_releases.tsv
+
+Uh-oh, there are ~500 more than in the June 2021 count above. Presumably those were created after the fix?
+
+Create a sample for testing:
+
+ shuf -n10000 nonlowercase_doi_releases.tsv \
+ > nonlowercase_doi_releases.10k_sample.tsv
+
+## Test in QA
+
+In pipenv:
+
+ export FATCAT_AUTH_WORKER_CLEANUP=[...]
+
+ head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+ | python -m fatcat_tools.cleanups.release_lowercase_doi -
+ # Counter({'total': 100, 'update': 100, 'skip': 0, 'insert': 0, 'exists': 0})
+
+ head -n100 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+ | python -m fatcat_tools.cleanups.release_lowercase_doi -
+ # Counter({'total': 100, 'skip-existing-doi-fine': 100, 'skip': 0, 'insert': 0, 'update': 0, 'exists': 0})
+
+ head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+ | python -m fatcat_tools.cleanups.release_lowercase_doi -
+ # no such release_ident found: dcjsybvqanffhmu4dhzdnptave
+
+This is presumably because the cleanup is being run in QA, and the snapshot contains some newer prod releases.
+
+Did a quick update, and then:
+
+ head -n2000 /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+ | python -m fatcat_tools.cleanups.release_lowercase_doi -
+ # Counter({'total': 2000, 'skip-existing-doi-fine': 1100, 'update': 898, 'skip-existing-not-found': 2, 'skip': 0, 'insert': 0, 'exists': 0})
+
+Did some spot checking in QA. Out of 20 DOIs checked, 15 were valid and 5 were
+not valid (doi.org returned 404). It seems like roughly 1/3 have a duplicate DOI
+(the lower-case DOI also exists); exact numbers were not counted.
+
+This cleanup is simple and looks good to go. Batch size of 50 is good for full
+releases.
+
+Example of parallelization:
+
+ cat /srv/fatcat/datasets/nonlowercase_doi_releases.10k_sample.tsv \
+ | parallel -j8 --linebuffer --round-robin --pipe python -m fatcat_tools.cleanups.release_lowercase_doi -
+
+Ready to go!
diff --git a/python/fatcat_tools/cleanups/release_lowercase_doi.py b/python/fatcat_tools/cleanups/release_lowercase_doi.py
index 4e5a6f43..5e3275db 100644
--- a/python/fatcat_tools/cleanups/release_lowercase_doi.py
+++ b/python/fatcat_tools/cleanups/release_lowercase_doi.py
@@ -2,7 +2,7 @@ import argparse
import os
import sys
-from fatcat_openapi_client import ApiClient, ReleaseEntity, ReleaseExtIds
+from fatcat_openapi_client import ApiClient, ApiException, ReleaseEntity, ReleaseExtIds
from fatcat_tools import authenticated_api, public_api
from fatcat_tools.importers.common import EntityImporter, LinePusher
@@ -45,7 +45,10 @@ class ReleaseLowercaseDoiCleanup(EntityImporter):
self.testing_mode = False
def want(self, row: str) -> bool:
- row = row.strip().split()[0]
+ row = row.strip()
+ if not row:
+ return False
+ row = row.split()[0]
if len(row) == 26:
return True
else:
@@ -66,15 +69,20 @@ class ReleaseLowercaseDoiCleanup(EntityImporter):
def try_update(self, re: ReleaseEntity) -> bool:
- # should always be existing
- existing = self.api.get_release(re.ident)
+ # should always be existing, but sometimes not because of prod/QA flip
+ existing = None
+ try:
+ existing = self.api.get_release(re.ident)
+ except ApiException as err:
+ if err.status != 404:
+ raise err
if not existing:
self.counts["skip-existing-not-found"] += 1
return False
- if existing.status != "active":
- self.counts["skip-existing-entity-status"] += 1
+ if existing.state != "active":
+ self.counts["skip-existing-entity-state"] += 1
return False
if not existing.ext_ids.doi:
@@ -91,7 +99,7 @@ class ReleaseLowercaseDoiCleanup(EntityImporter):
# these corrections (entity dump) contains no dupes
if not self.testing_mode:
- self.api.update_release(self.get_editgroup_id(), re.ident, re)
+ self.api.update_release(self.get_editgroup_id(), existing.ident, existing)
self.counts["update"] += 1
return False