extra/cleanups/scripts/file_dupe_to_json.py
#!/usr/bin/env python3

"""
This script can be used to transform duplicate file entity hash export rows
into JSON objects which can be passed to the file entity merger.

The input is expected to be a TSV with two columns: a hash value in the first
column, and a fatcat file entity ident (in UUID format, not "fatcat ident"
encoded) in the second column. The rows are assumed to be sorted by hash value
(the first column), so that duplicate values (same hash, differing UUID) appear
on contiguous rows.

File hashes aren't really "external identifiers" (ext_id), but we treat them as
such here.

The script is intentionally simple; it should be possible to copy and reuse it
for release, container, or creator entity duplicates.
"""

import base64
import json
import sys
import uuid

EXTID_TYPE = "sha1"  # currently the only hash type run() knows how to validate

def uuid2fcid(s: str) -> str:
    """
    Converts a UUID string to a fatcat identifier (base32-encoded string)
    """
    raw = uuid.UUID(s).bytes
    return base64.b32encode(raw)[:26].lower().decode("utf-8")
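
# Quick sanity check for uuid2fcid: the nil UUID is sixteen zero bytes, and
# base32-encoding zero bytes yields only "a" characters, so in a REPL:
#
#   >>> uuid2fcid("00000000-0000-0000-0000-000000000000")
#   'aaaaaaaaaaaaaaaaaaaaaaaaaa'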

def print_group(extid, dupe_ids):
    # only emit a group if more than one entity shares this hash
    if len(dupe_ids) < 2:
        return
    group = dict(
        entity_type="file",
        primary_id=None,
        duplicate_ids=dupe_ids,
        evidence=dict(
            extid=extid,
            extid_type=EXTID_TYPE,
        ),
    )
    print(json.dumps(group, sort_keys=True))
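
# Each printed group is a single JSON line, shaped roughly like this
# (identifiers shortened and made up):
#
#   {"duplicate_ids": ["aaaa...", "bbbb..."], "entity_type": "file",
#    "evidence": {"extid": "da39a3ee...", "extid_type": "sha1"}, "primary_id": null}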

def run():
    last_extid = None
    dupe_ids = []
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        (row_extid, row_uuid) = line.split("\t")[0:2]
        if EXTID_TYPE == "sha1":
            # cheap sanity check: SHA-1 hex digests are 40 characters
            assert len(row_extid) == 40
        else:
            raise NotImplementedError(f"extid type not supported yet: {EXTID_TYPE}")
        row_id = uuid2fcid(row_uuid)
        if row_extid == last_extid:
            # same hash as the previous row: accumulate this duplicate
            dupe_ids.append(row_id)
            continue
        elif dupe_ids:
            # hash changed: flush the completed group before starting a new one
            print_group(last_extid, dupe_ids)
        last_extid = row_extid
        dupe_ids = [row_id]
    # flush the final group at end of input
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)


if __name__=="__main__":
    run()
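
# A typical invocation might look like the following (the export file name is
# hypothetical; any two-column TSV sorted on the first column works, and plain
# `sort` is sufficient to make duplicate hashes contiguous):
#
#   sort file_sha1_ident_export.tsv | python3 file_dupe_to_json.py > file_dupe_groups.json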