diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 18:22:06 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-24 18:22:06 -0800 |
commit | 487923dc81d877207556f8a90a3ce048fe6bafb5 (patch) | |
tree | 2b1fe9915e0447b20f20df570d150d7be622a89a /notes/cleanups/scripts | |
parent | d1c8a582d31dc6f3254e477774aea0fa75fc8b23 (diff) | |
download | fatcat-487923dc81d877207556f8a90a3ce048fe6bafb5.tar.gz fatcat-487923dc81d877207556f8a90a3ce048fe6bafb5.zip |
notes on container ISSN-L merging, tested in QA
Diffstat (limited to 'notes/cleanups/scripts')
-rwxr-xr-x | notes/cleanups/scripts/container_dupe_to_json.py | 55 |
1 files changed, 55 insertions, 0 deletions
#!/usr/bin/env python3

"""
This script can be used to transform duplicate container entity rows into JSON
objects which can be passed to the container entity merger.

It is initially used to de-dupe ISSN-Ls. The script is based on
`file_dupe_to_json.py`.

Input is read from stdin, one row per line, as tab-separated columns: the
external identifier first, then the id that ends up in `duplicate_ids` (any
further columns are ignored). Rows are grouped by comparing each identifier
with the one on the previous row only, so rows sharing an identifier must be
adjacent (eg, input sorted by identifier). One JSON object is printed to
stdout for every run of two or more rows with the same identifier.
"""

import json
import sys
from typing import List, Optional

EXTID_TYPE = "issnl"


def print_group(extid: str, dupe_ids: List[str]) -> None:
    """Print one merge-request JSON object (a single line) to stdout.

    Nothing is printed when `dupe_ids` holds fewer than two ids, since a lone
    row is not a duplicate of anything.

    Args:
        extid: external identifier shared by every id in `dupe_ids`.
        dupe_ids: ids of the rows which carry `extid`.
    """
    if len(dupe_ids) < 2:
        return
    group = dict(
        entity_type="container",
        # left unset here; this script does not pick which id survives
        primary_id=None,
        duplicate_ids=dupe_ids,
        evidence=dict(
            extid=extid,
            extid_type=EXTID_TYPE,
        ),
    )
    print(json.dumps(group, sort_keys=True))


def run() -> None:
    """Read tab-separated rows from stdin and print duplicate groups.

    Blank lines are skipped. Consecutive rows with the same external
    identifier are collected and handed to `print_group()`.

    Raises:
        ValueError: if a row has fewer than two columns, or if its identifier
            does not have the length expected for `EXTID_TYPE`.
        Exception: if `EXTID_TYPE` is not a supported identifier type.
    """
    last_extid: Optional[str] = None
    dupe_ids: List[str] = []
    for line_num, line in enumerate(sys.stdin, start=1):
        line = line.strip()
        if not line:
            continue
        columns = line.split("\t")
        if len(columns) < 2:
            raise ValueError(
                f"line {line_num}: expected at least two tab-separated columns: {line!r}"
            )
        row_extid, row_id = columns[:2]
        if EXTID_TYPE == "issnl":
            # explicit check instead of `assert`, which is skipped under `python -O`
            if len(row_extid) != 9:
                raise ValueError(
                    f"line {line_num}: expected a 9-character ISSN-L, got: {row_extid!r}"
                )
        else:
            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
        if row_extid == last_extid:
            dupe_ids.append(row_id)
            continue
        # identifier changed: flush the finished group, then start a new one
        if dupe_ids:
            print_group(last_extid, dupe_ids)
        last_extid = row_extid
        dupe_ids = [row_id]
    # the final group is never followed by a differing row, so flush it here
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)


if __name__ == "__main__":
    run()