aboutsummaryrefslogtreecommitdiffstats
path: root/notes/cleanups/scripts/container_dupe_to_json.py
blob: 2e841c69d0f907a5023feb965db1573e02956f4c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3

"""
This script can be used to transform duplicate container entity rows into JSON
objects which can be passed to the container entity merger.

It is initially used to de-dupe ISSN-Ls. The script is based on
`file_dupe_to_json.py`.
"""

import json, sys
from typing import Optional

# External identifier type shared by every input row; it is recorded as
# `extid_type` in the evidence of each emitted group. run() currently only
# accepts "issnl" and raises for any other value.
EXTID_TYPE = "issnl"


def print_group(extid, dupe_ids):
    """
    Print one JSON object describing a group of duplicate container ids.

    `extid` is the external identifier the ids have in common and
    `dupe_ids` is the list of entity ids sharing it. A group with fewer
    than two ids is not a duplicate set, so nothing is printed for it.
    Otherwise a single line of JSON with sorted keys is written to stdout;
    `primary_id` is always emitted as null.
    """
    if not len(dupe_ids) >= 2:
        return
    # Record which identifier (and of which type) justified the grouping.
    evidence = {"extid": extid, "extid_type": EXTID_TYPE}
    merge_request = {
        "entity_type": "container",
        "primary_id": None,
        "duplicate_ids": dupe_ids,
        "evidence": evidence,
    }
    print(json.dumps(merge_request, sort_keys=True))

def run():
    """
    Read tab-separated rows from stdin and print one JSON group per run of
    adjacent rows that share the same external identifier.

    Every non-blank line must contain at least two tab-separated columns:
    the external identifier first, then the entity id. Additional columns
    are ignored. Only adjacent rows are compared, so rows with the same
    identifier that are not contiguous in the input end up in separate
    groups (and groups of a single id are not printed by print_group).

    Raises:
        ValueError: if a row has fewer than two columns, or if its
            identifier is not 9 characters long when EXTID_TYPE is "issnl".
        Exception: if EXTID_TYPE is not a supported identifier type.
    """
    last_extid = None
    dupe_ids = []
    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.strip()
        if not line:
            continue
        columns = line.split("\t")
        if len(columns) < 2:
            # Fail with context instead of an opaque tuple-unpacking error.
            raise ValueError(
                f"line {line_num}: expected at least two tab-separated "
                f"columns, got: {line!r}"
            )
        row_extid, row_id = columns[:2]
        if EXTID_TYPE == "issnl":
            # Explicit raise rather than `assert`: assertions are removed
            # when Python runs with -O, which would let bad rows through.
            if len(row_extid) != 9:
                raise ValueError(
                    f"line {line_num}: expected a 9-character issnl, "
                    f"got: {row_extid!r}"
                )
        else:
            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
        if row_extid == last_extid:
            # Same identifier as the previous row: extend the open group.
            dupe_ids.append(row_id)
            continue
        if dupe_ids:
            # Identifier changed: flush the group collected so far.
            print_group(last_extid, dupe_ids)
        last_extid = row_extid
        dupe_ids = [row_id]
    # Flush the final group, which has no following row to trigger it.
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)


# Script entry point: process stdin only when executed directly, not when
# this module is imported.
if __name__ == "__main__":
    run()