From 75bde4ad3970e8e63b04009cfd16ed4b9a924ce7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 24 Nov 2021 17:05:08 -0800 Subject: notes from prod run of file de-dupe --- notes/bulk_edits/2021-11-24_file_sha1_dedupe.md | 35 +++++++++++++++++++++++++ notes/bulk_edits/CHANGELOG.md | 1 + 2 files changed, 36 insertions(+) create mode 100644 notes/bulk_edits/2021-11-24_file_sha1_dedupe.md diff --git a/notes/bulk_edits/2021-11-24_file_sha1_dedupe.md b/notes/bulk_edits/2021-11-24_file_sha1_dedupe.md new file mode 100644 index 00000000..012bcf62 --- /dev/null +++ b/notes/bulk_edits/2021-11-24_file_sha1_dedupe.md @@ -0,0 +1,35 @@ + +See notes and scripts about `file_sha1_dedupe` cleanup for prep details. + +## Prod Run + +Run as `cleanup-bot`: + + export FATCAT_AUTH_API_TOKEN=[...] + + git log | head -n1 + # commit 5bc5eeed5e3ba54c2129c4233b881291c5fa7449 + +First do a sample in dry-run mode: + + head -n25 /srv/fatcat/datasets/file_sha1_dupes.json \ + | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" --dry-run merge-files - + # Counter({'updated-entities': 59, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0}) + +Gah, the dry-run mode still creates (empty) editgroups: + + https://fatcat.wiki/editgroup/iqzjg3vxu5elvotknmmjln3gv4 + https://fatcat.wiki/editgroup/2mxsl7lxo5dezem42whnr7zxxe + +Actually run (merge) the sample: + + head -n25 /srv/fatcat/datasets/file_sha1_dupes.json \ + | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" merge-files - + # Counter({'updated-entities': 59, 'lines': 25, 'merged': 25, 'skip': 0, 'updated-total': 0}) + + +Run the full batch: + + cat /srv/fatcat/datasets/file_sha1_dupes.json \ + | python -m fatcat_tools.mergers.files --editgroup-description-override "Automated merging of file entities with duplicate SHA-1 hashes" merge-files - + # 
Counter({'updated-entities': 6197, 'lines': 2039, 'merged': 2014, 'skip': 25, 'skip-not-active-entity': 25, 'updated-total': 0}) diff --git a/notes/bulk_edits/CHANGELOG.md b/notes/bulk_edits/CHANGELOG.md index d82e126e..cf5d9829 100644 --- a/notes/bulk_edits/CHANGELOG.md +++ b/notes/bulk_edits/CHANGELOG.md @@ -25,6 +25,7 @@ and specific final commands in this directory. Quick summary: import-time code bug, were fixed. a couple hundred questionable cases remain, but are all mismatched due to DOI slash/double-slash issues and will not be fixed in an automated way. +- de-duplicated a few thousand file entities, on the basis of SHA-1 hash ## 2021-06 -- cgit v1.2.3