Diffstat (limited to 'notes/cleanups/scripts/container_dupe_to_json.py')
-rwxr-xr-x  notes/cleanups/scripts/container_dupe_to_json.py | 55
1 file changed, 0 insertions, 55 deletions
diff --git a/notes/cleanups/scripts/container_dupe_to_json.py b/notes/cleanups/scripts/container_dupe_to_json.py
deleted file mode 100755
index 2e841c69..00000000
--- a/notes/cleanups/scripts/container_dupe_to_json.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-This script can be used to transform duplicate container entity rows into JSON
-objects which can be passed to the container entity merger.
-
-It is initially used to de-dupe ISSN-Ls. The script is based on
-`file_dupe_to_json.py`.
-"""
-
-import json, sys
-from typing import Optional
-
-EXTID_TYPE = "issnl"
-
-
-def print_group(extid, dupe_ids):
-    if len(dupe_ids) < 2:
-        return
-    group = dict(
-        entity_type="container",
-        primary_id=None,
-        duplicate_ids=dupe_ids,
-        evidence=dict(
-            extid=extid,
-            extid_type=EXTID_TYPE,
-        ),
-    )
-    print(json.dumps(group, sort_keys=True))
-
-def run():
-    last_extid = None
-    dupe_ids = []
-    for l in sys.stdin:
-        l = l.strip()
-        if not l:
-            continue
-        (row_extid, row_id) = l.split("\t")[0:2]
-        if EXTID_TYPE == "issnl":
-            assert len(row_extid) == 9
-        else:
-            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
-        if row_extid == last_extid:
-            dupe_ids.append(row_id)
-            continue
-        elif dupe_ids:
-            print_group(last_extid, dupe_ids)
-        last_extid = row_extid
-        dupe_ids = [row_id]
-    if last_extid and dupe_ids:
-        print_group(last_extid, dupe_ids)
-
-
-if __name__=="__main__":
-    run()
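
For context, a minimal sketch (not part of the repository) of how the deleted script could be exercised, assuming a sorted two-column TSV of ISSN-L and container ident on stdin; the idents and the module import below are illustrative only:

#!/usr/bin/env python3
# Sketch only: feed two hypothetical duplicate rows through run() from the
# deleted script and show the JSON group it would print. Assumes the module
# is importable as container_dupe_to_json; the idents are invented.
import io
import sys

from container_dupe_to_json import run

# Hypothetical sorted TSV input: ISSN-L <TAB> container ident
sample = (
    "1234-5678\taaaaaaaaaaaaaaaaaaaaaaaaaa\n"
    "1234-5678\tbbbbbbbbbbbbbbbbbbbbbbbbbb\n"
)

sys.stdin = io.StringIO(sample)
run()
# Expected output, one JSON object per duplicate group on a single line:
# {"duplicate_ids": ["aaaaaaaaaaaaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbbbbbbbbbbb"], "entity_type": "container", "evidence": {"extid": "1234-5678", "extid_type": "issnl"}, "primary_id": null}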