diff options
Diffstat (limited to 'notes/cleanups/scripts/file_dupe_to_json.py')
-rwxr-xr-x | notes/cleanups/scripts/file_dupe_to_json.py | 72 |
1 file changed, 0 insertions, 72 deletions
#!/usr/bin/env python3

"""
Transform duplicate file entity hash export rows into JSON objects which can
be passed to the file entity merger.

The input is expected to be a TSV with two columns: a hash value in the first
column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
encoded) in the second column. The rows are assumed to be sorted by hash value
(the first column), and duplicate values (same hash, differing UUID) are
contiguous.

File hashes aren't really "external identifiers" (ext_id), but we treat them
as such here.

Script is pretty simple, should be possible to copy and reuse for release,
container, creator entity duplicates.
"""

import base64
import json
import sys
import uuid
from typing import Iterable, List, Optional

# Hash type of the first TSV column; only "sha1" is implemented so far.
EXTID_TYPE = "sha1"


def uuid2fcid(s: str) -> str:
    """
    Convert a UUID string to a fatcat identifier (base32 encoded string).
    """
    raw = uuid.UUID(s).bytes
    # 16 bytes encode to 26 base32 chars; the rest is '=' padding, safe to drop
    return base64.b32encode(raw)[:26].lower().decode("utf-8")


def print_group(extid: str, dupe_ids: List[str]) -> None:
    """
    Print one merge-request JSON object for a group of duplicate idents.

    Groups with fewer than two entities are not duplicates and are silently
    skipped.
    """
    if len(dupe_ids) < 2:
        return
    group = dict(
        entity_type="file",
        primary_id=None,
        duplicate_ids=dupe_ids,
        evidence=dict(
            extid=extid,
            extid_type=EXTID_TYPE,
        ),
    )
    print(json.dumps(group, sort_keys=True))


def run(lines: Optional[Iterable[str]] = None) -> None:
    """
    Read sorted (hash, uuid) TSV rows and emit one JSON group per duplicate
    hash value.

    Args:
        lines: iterable of input rows; defaults to sys.stdin. Kept as a
            parameter (backwards-compatible) so the grouping logic can be
            exercised without a real stdin.

    Raises:
        ValueError: if a hash value has the wrong length for EXTID_TYPE, or
            EXTID_TYPE is not supported.
    """
    if lines is None:
        lines = sys.stdin
    last_extid: Optional[str] = None
    dupe_ids: List[str] = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        (row_extid, row_uuid) = line.split("\t")[0:2]
        # Validate with an explicit raise; a bare `assert` would be silently
        # stripped when running under `python -O`.
        if EXTID_TYPE == "sha1":
            if len(row_extid) != 40:
                raise ValueError(f"not a 40-char sha1 hex digest: {row_extid}")
        else:
            raise ValueError(f"extid type not supported yet: {EXTID_TYPE}")
        row_id = uuid2fcid(row_uuid)
        if row_extid == last_extid:
            dupe_ids.append(row_id)
            continue
        elif dupe_ids:
            # Hash value changed: flush the previous group before starting anew.
            print_group(last_extid, dupe_ids)
        last_extid = row_extid
        dupe_ids = [row_id]
    # Flush the final group; there is no following hash change to trigger it.
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)


if __name__ == "__main__":
    run()