diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-23 18:58:37 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-23 18:58:37 -0800 |
commit | 8080eef139b5dcf6201e4f27076a879d0df20096 (patch) | |
tree | 4a19cb90122865bc55f172b5115e9ac052cfdf30 /notes/cleanups/scripts/file_dupe_to_json.py | |
parent | f6c4bd65c104e3a728e94561be80242cf35cbea3 (diff) | |
download | fatcat-8080eef139b5dcf6201e4f27076a879d0df20096.tar.gz fatcat-8080eef139b5dcf6201e4f27076a879d0df20096.zip |
file de-dupe: notes on prep and QA testing
Diffstat (limited to 'notes/cleanups/scripts/file_dupe_to_json.py')
-rwxr-xr-x | notes/cleanups/scripts/file_dupe_to_json.py | 72 |
1 files changed, 72 insertions, 0 deletions
#!/usr/bin/env python3

"""
This script can be used to transform duplicate file entity hash export rows
into JSON objects which can be passed to the file entity merger.

The input is expected to be a TSV with two columns: a hash value in the first
column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
encoded) in the second column. The rows are assumed to be sorted by hash value
(the first column), and duplicate values (same hash, differing UUID) are
contiguous.

File hashes aren't really "external identifiers" (ext_id), but we treat them as
such here.

Script is pretty simple, should be possible to copy and reuse for release,
container, creator entity duplicates.
"""

import base64
import itertools
import json
import sys
import uuid
from operator import itemgetter
from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple

EXTID_TYPE = "sha1"

# Expected string length of the first TSV column, per supported extid type.
EXTID_LENGTHS = {"sha1": 40}


def uuid2fcid(s: str) -> str:
    """
    Converts a UUID string to a fatcat identifier (base32 encoded string).

    Raises ValueError (from uuid.UUID) if `s` is not a valid UUID string.
    """
    raw = uuid.UUID(s).bytes
    # 16 bytes encode to 26 base32 characters plus "=" padding; drop the padding
    return base64.b32encode(raw)[:26].lower().decode("utf-8")


def make_group(extid: str, dupe_ids: List[str]) -> Dict[str, Any]:
    """
    Builds the JSON-serializable object describing one group of duplicates.
    """
    return dict(
        entity_type="file",
        primary_id=None,
        duplicate_ids=dupe_ids,
        evidence=dict(
            extid=extid,
            extid_type=EXTID_TYPE,
        ),
    )


def print_group(extid, dupe_ids):
    """
    Prints one group as a single line of JSON on stdout.

    Groups with fewer than two idents are not duplicates and are skipped.
    """
    if len(dupe_ids) < 2:
        return
    print(json.dumps(make_group(extid, dupe_ids), sort_keys=True))


def parse_rows(lines: Iterable[str]) -> Iterator[Tuple[str, str]]:
    """
    Parses TSV lines into (extid, fatcat ident) tuples, skipping blank lines.

    Raises NotImplementedError if EXTID_TYPE is not supported, and ValueError
    if a row has fewer than two columns, an extid of unexpected length, or an
    invalid UUID.
    """
    # the configured type can't change per-row, so check it once up front
    if EXTID_TYPE not in EXTID_LENGTHS:
        raise NotImplementedError(f"extid type not supported yet: {EXTID_TYPE}")
    expected_len = EXTID_LENGTHS[EXTID_TYPE]

    for line_num, line in enumerate(lines, start=1):
        line = line.strip()
        if not line:
            continue
        columns = line.split("\t")
        if len(columns) < 2:
            raise ValueError(
                f"line {line_num}: expected at least 2 tab-separated columns, "
                f"got {len(columns)}"
            )
        row_extid, row_uuid = columns[:2]
        # explicit raise instead of assert: asserts are stripped under `-O`
        if len(row_extid) != expected_len:
            raise ValueError(
                f"line {line_num}: expected {EXTID_TYPE} of length "
                f"{expected_len}, got length {len(row_extid)}"
            )
        yield row_extid, uuid2fcid(row_uuid)


def iter_groups(lines: Iterable[str]) -> Iterator[Tuple[str, List[str]]]:
    """
    Yields (extid, [fatcat idents]) for each contiguous run of rows sharing
    the same extid. Relies on the input being sorted by extid.
    """
    rows = parse_rows(lines)
    for extid, group in itertools.groupby(rows, key=itemgetter(0)):
        yield extid, [fcid for _, fcid in group]


def run(lines: Optional[Iterable[str]] = None) -> None:
    """
    Reads TSV rows (from stdin unless `lines` is given) and prints one JSON
    object per group of duplicates.
    """
    if lines is None:
        lines = sys.stdin
    for extid, dupe_ids in iter_groups(lines):
        print_group(extid, dupe_ids)


if __name__ == "__main__":
    run()