aboutsummaryrefslogtreecommitdiffstats
path: root/notes/cleanups/scripts
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:31:26 -0800
committerBryan Newbold <bnewbold@robocracy.org>2021-11-29 14:31:26 -0800
commitec2809ef2ac51c992463839c1e3451927f5e1661 (patch)
treed95c1b17e3bd8fc93179551ee130004c73513c16 /notes/cleanups/scripts
parenteb60449cdc9614ec7eda79b8481d1d8487b9a5f6 (diff)
parent487923dc81d877207556f8a90a3ce048fe6bafb5 (diff)
downloadfatcat-ec2809ef2ac51c992463839c1e3451927f5e1661.tar.gz
fatcat-ec2809ef2ac51c992463839c1e3451927f5e1661.zip
Merge branch 'bnewbold-container-merger'
Diffstat (limited to 'notes/cleanups/scripts')
-rwxr-xr-xnotes/cleanups/scripts/container_dupe_to_json.py55
1 files changed, 55 insertions, 0 deletions
diff --git a/notes/cleanups/scripts/container_dupe_to_json.py b/notes/cleanups/scripts/container_dupe_to_json.py
new file mode 100755
index 00000000..2e841c69
--- /dev/null
+++ b/notes/cleanups/scripts/container_dupe_to_json.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+
+"""
+This script can be used to transform duplicate container entity rows into JSON
+objects which can be passed to the container entity merger.
+
+It is initially used to de-dupe ISSN-Ls. The script is based on
+`file_dupe_to_json.py`.
+"""
+
+import json, sys
+from typing import Optional
+
+EXTID_TYPE = "issnl"
+
+
def print_group(extid, dupe_ids):
    """Emit one merge-group JSON line for a set of duplicate container ids.

    The output object matches the schema the container entity merger reads:
    no pre-chosen primary, the duplicate entity ids, and the shared external
    identifier recorded as evidence. Groups with fewer than two members are
    not duplicates, so nothing is printed for them.
    """
    # A single entity is not a duplicate group; skip it silently.
    if len(dupe_ids) < 2:
        return
    group = {
        "entity_type": "container",
        "primary_id": None,
        "duplicate_ids": dupe_ids,
        "evidence": {
            "extid": extid,
            "extid_type": EXTID_TYPE,
        },
    }
    # sort_keys gives deterministic output, handy for diffing/testing.
    print(json.dumps(group, sort_keys=True))
+
def run():
    """Read (extid, ident) TSV rows from stdin and emit duplicate groups.

    Input rows must be sorted by external identifier so that duplicate rows
    are adjacent; each run of rows sharing an extid is flushed as one JSON
    group via print_group() (which itself skips groups of fewer than two).
    Extra TSV columns beyond the first two are ignored.

    Raises:
        ValueError: if a row's identifier is not a well-formed ISSN-L.
        Exception: if EXTID_TYPE is set to an unsupported identifier type.
    """
    last_extid = None
    dupe_ids = []
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        (row_extid, row_id) = line.split("\t")[0:2]
        if EXTID_TYPE == "issnl":
            # An ISSN-L is always 9 characters ("NNNN-NNNC"). Raise rather
            # than assert so the check is not stripped under `python -O`.
            if len(row_extid) != 9:
                raise ValueError(f"malformed ISSN-L: {row_extid!r}")
        else:
            raise Exception(f"extid type not supported yet: {EXTID_TYPE}")
        if row_extid == last_extid:
            # Same identifier as the previous row: extend the current group.
            dupe_ids.append(row_id)
            continue
        elif dupe_ids:
            # Identifier changed: flush the completed group.
            print_group(last_extid, dupe_ids)
        last_extid = row_extid
        dupe_ids = [row_id]
    # Flush the final group, which no identifier change follows.
    if last_extid and dupe_ids:
        print_group(last_extid, dupe_ids)
+
+
if __name__ == "__main__":
    run()