#!/usr/bin/env python3

# Recovered from git commit c5ea2dba ("move 'cleanups' directory from notes to
# extra/"), which removed notes/cleanups/scripts/container_dupe_to_json.py.

"""
This script can be used to transform duplicate container entity rows into JSON
objects which can be passed to the container entity merger.

It is initially used to de-dupe ISSN-Ls. The script is based on
`file_dupe_to_json.py`.

Input is read from stdin as tab-separated rows whose first two columns are the
external identifier and the entity id. Rows sharing an identifier must be
adjacent (only consecutive rows are grouped). One JSON object is printed to
stdout for every identifier that occurs on two or more consecutive rows.
"""

import json
import sys
from itertools import groupby
from operator import itemgetter
from typing import Iterable, Iterator, List, Optional, Tuple

EXTID_TYPE = "issnl"


def print_group(extid: Optional[str], dupe_ids: List[str]) -> None:
    """Print one merge-request JSON object for a group of duplicate ids.

    Nothing is printed when the group has fewer than two ids, since a single
    entity is not a duplicate of anything.

    Args:
        extid: the external identifier shared by every entity in the group.
        dupe_ids: entity ids that share `extid`, in input order.
    """
    if len(dupe_ids) < 2:
        return
    group = {
        "entity_type": "container",
        # Left unset here; this script only reports which ids collide.
        "primary_id": None,
        "duplicate_ids": dupe_ids,
        "evidence": {
            "extid": extid,
            "extid_type": EXTID_TYPE,
        },
    }
    print(json.dumps(group, sort_keys=True))


def _parse_rows(lines: Iterable[str]) -> Iterator[Tuple[str, str]]:
    """Yield validated `(extid, entity_id)` pairs, skipping blank lines.

    Raises:
        ValueError: a row has fewer than two tab-separated columns, or its
            identifier does not have the length expected for `EXTID_TYPE`.
        Exception: `EXTID_TYPE` is not a supported identifier type.
    """
    for line_no, raw in enumerate(lines, start=1):
        line = raw.strip()
        if not line:
            continue
        fields = line.split("\t")
        if len(fields) < 2:
            raise ValueError(
                f"line {line_no}: expected at least 2 tab-separated columns: {line!r}"
            )
        row_extid, row_id = fields[:2]
        if EXTID_TYPE == "issnl":
            # Explicit raise instead of `assert`, so that the check still runs
            # under `python -O`. An ISSN-L is 9 characters ("NNNN-NNNC").
            if len(row_extid) != 9:
                raise ValueError(
                    f"line {line_no}: ISSN-L must be 9 characters: {row_extid!r}"
                )
        else:
            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
        yield row_extid, row_id


def run() -> None:
    """Group consecutive stdin rows by identifier and print duplicate groups.

    Rows are consumed lazily, so each group is printed as soon as the first
    row with a different identifier has been read and validated.
    """
    rows = _parse_rows(sys.stdin)
    for extid, members in groupby(rows, key=itemgetter(0)):
        print_group(extid, [row_id for _, row_id in members])


if __name__ == "__main__":
    run()