author    Bryan Newbold <bnewbold@robocracy.org>  2021-11-23 18:58:37 -0800
committer Bryan Newbold <bnewbold@robocracy.org>  2021-11-23 18:58:37 -0800
commit    8080eef139b5dcf6201e4f27076a879d0df20096 (patch)
tree      4a19cb90122865bc55f172b5115e9ac052cfdf30 /notes/cleanups/scripts/file_dupe_to_json.py
parent    f6c4bd65c104e3a728e94561be80242cf35cbea3 (diff)
file de-dupe: notes on prep and QA testing
Diffstat (limited to 'notes/cleanups/scripts/file_dupe_to_json.py')
-rwxr-xr-x  notes/cleanups/scripts/file_dupe_to_json.py  |  72
1 file changed, 72 insertions(+), 0 deletions(-)
diff --git a/notes/cleanups/scripts/file_dupe_to_json.py b/notes/cleanups/scripts/file_dupe_to_json.py
new file mode 100755
index 00000000..2064dc1c
--- /dev/null
+++ b/notes/cleanups/scripts/file_dupe_to_json.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+
+"""
+This script can be used to transform duplicate file entity hash export rows
+into JSON objects which can be passed to the file entity merger.
+
+The input is expected to be a TSV with two columns: a hash value in the first
+column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
+encoded) in the second column. The rows are assumed to be sorted by hash value
+(the first column), so that duplicate values (same hash, differing UUID) are
+contiguous.
+
+File hashes aren't really "external identifiers" (ext_id), but we treat them as
+such here.
+
+This script is pretty simple; it should be possible to copy and reuse it for
+release, container, or creator entity duplicates.
+"""
+
+import base64
+import json
+import sys
+import uuid
+from typing import List
+
+# which hash the first input column contains; only "sha1" is implemented below
+EXTID_TYPE = "sha1"
+
+def uuid2fcid(s: str) -> str:
+ """
+ Converts a uuid.UUID object to a fatcat identifier (base32 encoded string)
+ """
+ raw = uuid.UUID(s).bytes
+ return base64.b32encode(raw)[:26].lower().decode("utf-8")
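+
+# For example (output computed by hand from the encoding above, so treat it
+# as illustrative):
+#   >>> uuid2fcid("00000000-0000-0000-3333-000000000001")
+#   'aaaaaaaaaaaaamztaaaaaaaaae'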
+
+def print_group(extid: str, dupe_ids: List[str]) -> None:
+    # singleton "groups" are not duplicates; only emit groups of two or more
+    if len(dupe_ids) < 2:
+        return
+ group = dict(
+ entity_type="file",
+ primary_id=None,
+ duplicate_ids=dupe_ids,
+ evidence=dict(
+ extid=extid,
+ extid_type=EXTID_TYPE,
+ ),
+ )
+ print(json.dumps(group, sort_keys=True))
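+
+# For the illustrative input above, the single emitted line would look like
+# this (idents shortened to placeholders):
+#
+#   {"duplicate_ids": ["<fcid-1>", "<fcid-2>"], "entity_type": "file",
+#    "evidence": {"extid": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
+#    "extid_type": "sha1"}, "primary_id": null}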
+
+def run() -> None:
+    last_extid = None
+    dupe_ids: List[str] = []
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        (row_extid, row_uuid) = line.split("\t")[0:2]
+        # sanity-check the hash column before converting the ident
+        if EXTID_TYPE == "sha1":
+            assert len(row_extid) == 40
+        else:
+            raise NotImplementedError(f"extid type not supported yet: {EXTID_TYPE}")
+        row_id = uuid2fcid(row_uuid)
+        if row_extid == last_extid:
+            # same hash as the previous row: extend the current group
+            dupe_ids.append(row_id)
+            continue
+        elif dupe_ids:
+            # hash changed: flush the completed group (no-op for singletons)
+            print_group(last_extid, dupe_ids)
+        last_extid = row_extid
+        dupe_ids = [row_id]
+    # flush the final group after end-of-input
+    if last_extid and dupe_ids:
+        print_group(last_extid, dupe_ids)
+
+
+if __name__ == "__main__":
+ run()
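
A quick smoke test from a Python shell, as a sketch: this assumes the script
is importable as file_dupe_to_json (e.g. when run from
notes/cleanups/scripts/), and reuses the illustrative rows from the comments
above.

    import io
    import sys

    import file_dupe_to_json

    rows = [
        "da39a3ee5e6b4b0d3255bfef95601890afd80709\t00000000-0000-0000-3333-000000000001",
        "da39a3ee5e6b4b0d3255bfef95601890afd80709\t00000000-0000-0000-3333-000000000002",
        "0000000000000000000000000000000000000000\t00000000-0000-0000-3333-000000000003",
    ]
    sys.stdin = io.StringIO("\n".join(rows) + "\n")
    file_dupe_to_json.run()
    # expect exactly one JSON group, for the shared hash in the first two rows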