blob: 2e841c69d0f907a5023feb965db1573e02956f4c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
#!/usr/bin/env python3
"""
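This script can be used to transform duplicate container entity rows into JSON
objects which can be passed to the container entity merger.

Input is read from stdin as tab-separated rows whose first two columns are the
external identifier and the entity id. Rows must already be sorted by external
identifier, because only consecutive rows with the same identifier are grouped.

It is initially used to de-dupe ISSN-Ls (the only identifier type currently
supported). The script is based on `file_dupe_to_json.py`.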
"""
import json, sys
from typing import Optional
# Type of external identifier the input rows are keyed on. It is recorded in
# the "evidence" of every emitted group; run() only knows how to validate
# "issnl" and raises for any other value.
EXTID_TYPE = "issnl"
def print_group(extid, dupe_ids):
    """
    Print one JSON object describing a group of duplicate container ids.

    Nothing is printed when `dupe_ids` holds fewer than two ids, since a
    single id has nothing to be merged with.

    The object is written to stdout as a single line with sorted keys.
    `primary_id` is always emitted as null; `evidence` records the shared
    external identifier and the module-level `EXTID_TYPE`.
    """
    if len(dupe_ids) >= 2:
        evidence = {
            "extid": extid,
            "extid_type": EXTID_TYPE,
        }
        merge_request = {
            "entity_type": "container",
            "primary_id": None,
            "duplicate_ids": dupe_ids,
            "evidence": evidence,
        }
        print(json.dumps(merge_request, sort_keys=True))
def run():
    """
    Read duplicate rows from stdin and print one JSON group per duplicated extid.

    Each non-blank input line is split on tabs; the first column is the
    external identifier and the second is the entity id. Any further columns
    are ignored. Consecutive rows sharing the same identifier are collected
    and handed to `print_group()`, so the input must already be sorted by
    identifier for all duplicates of an identifier to end up in one group.

    Raises:
        ValueError: a non-blank line has fewer than two tab-separated columns.
        AssertionError: an identifier is not 9 characters long (ISSN-L check).
        Exception: `EXTID_TYPE` is something other than "issnl".
    """
    last_extid = None
    dupe_ids = []
    for lineno, line in enumerate(sys.stdin, start=1):
        line = line.strip()
        if not line:
            continue

        columns = line.split("\t")
        if len(columns) < 2:
            # Same exception type the bare tuple-unpacking used to raise, but
            # with enough context to locate the offending row.
            raise ValueError(
                f"line {lineno}: expected at least 2 tab-separated columns, got {line!r}"
            )
        row_extid, row_id = columns[:2]

        if EXTID_TYPE != "issnl":
            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
        if len(row_extid) != 9:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped when Python runs with -O; the exception type is kept
            # unchanged for compatibility.
            raise AssertionError(
                f"line {lineno}: expected a 9-character ISSN-L, got {row_extid!r}"
            )

        if row_extid == last_extid:
            dupe_ids.append(row_id)
        else:
            # Identifier changed: flush the finished group, then start a new one.
            if dupe_ids:
                print_group(last_extid, dupe_ids)
            last_extid = row_extid
            dupe_ids = [row_id]

    # Flush the final group; the loop only flushes when the identifier changes.
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)
# Only consume stdin when executed as a script, not when imported as a module.
if __name__ == "__main__":
    run()
|