diff options
Diffstat (limited to 'notes/cleanups/scripts/file_dupe_to_json.py')
-rwxr-xr-x | notes/cleanups/scripts/file_dupe_to_json.py | 72 |
1 file changed, 0 insertions, 72 deletions
#!/usr/bin/env python3

"""
Transform duplicate file entity hash export rows into JSON objects which can
be passed to the file entity merger.

The input is expected to be a TSV with two columns: a hash value in the first
column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
encoded) in the second column. The rows are assumed to be sorted by hash value
(the first column), and duplicate values (same hash, differing UUID) are
contiguous.

File hashes aren't really "external identifiers" (ext_id), but we treat them
as such here.

Script is pretty simple, should be possible to copy and reuse for release,
container, creator entity duplicates.
"""

import base64
import json
import sys
import uuid
from typing import Iterable, List, Optional

# Hash type of the first TSV column; only "sha1" is implemented so far.
EXTID_TYPE = "sha1"


def uuid2fcid(s: str) -> str:
    """
    Convert a UUID string to a fatcat identifier (base32 encoded string).
    """
    raw = uuid.UUID(s).bytes
    # 16 bytes encode to 26 base32 chars; the rest is '=' padding, safe to drop
    return base64.b32encode(raw)[:26].lower().decode("utf-8")


def print_group(extid: str, dupe_ids: List[str]) -> None:
    """
    Print one merge-request JSON object for a group of duplicate idents.

    Groups with fewer than two entities are not duplicates and are silently
    skipped.
    """
    if len(dupe_ids) < 2:
        return
    group = dict(
        entity_type="file",
        primary_id=None,
        duplicate_ids=dupe_ids,
        evidence=dict(
            extid=extid,
            extid_type=EXTID_TYPE,
        ),
    )
    print(json.dumps(group, sort_keys=True))


def run(lines: Optional[Iterable[str]] = None) -> None:
    """
    Read sorted (hash, uuid) TSV rows and emit one JSON group per duplicate
    hash value.

    Args:
        lines: iterable of input rows; defaults to sys.stdin. Kept as a
            parameter (backwards-compatible) so the grouping logic can be
            exercised without a real stdin.

    Raises:
        ValueError: if a hash value has the wrong length for EXTID_TYPE, or
            EXTID_TYPE is not supported.
    """
    if lines is None:
        lines = sys.stdin
    last_extid: Optional[str] = None
    dupe_ids: List[str] = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        (row_extid, row_uuid) = line.split("\t")[0:2]
        # Validate with an explicit raise; a bare `assert` would be silently
        # stripped when running under `python -O`.
        if EXTID_TYPE == "sha1":
            if len(row_extid) != 40:
                raise ValueError(f"not a 40-char sha1 hex digest: {row_extid}")
        else:
            raise ValueError(f"extid type not supported yet: {EXTID_TYPE}")
        row_id = uuid2fcid(row_uuid)
        if row_extid == last_extid:
            dupe_ids.append(row_id)
            continue
        elif dupe_ids:
            # Hash value changed: flush the previous group before starting anew.
            print_group(last_extid, dupe_ids)
        last_extid = row_extid
        dupe_ids = [row_id]
    # Flush the final group; there is no following hash change to trigger it.
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)


if __name__ == "__main__":
    run()