import sys
import json
import datetime


def dedupe_file(json_input, json_output):
    """
    Takes JSON file of "fatcat enriched" content, and de-dupes based on the
    fatcat identifier.

    Input is read line by line, one JSON object per line. Each object is
    keyed by its 'release_id' field:

    - blank / whitespace-only lines are skipped
    - objects with a missing or empty 'release_id' are dropped
    - the first object seen for a given 'release_id' is kept as the base row
    - later objects with the same 'release_id' only contribute their
      'cord19_paper' and 'fatcat_hit' fields (when present), overwriting
      those fields on the base row; their other fields are ignored

    json_input: iterable of JSON-encoded lines (eg, an open file)
    json_output: writable file-like object; receives one JSON object per
        line, serialized with sorted keys, one per distinct 'release_id'

    Raises whatever json.loads() raises if a non-blank line is not valid
    JSON.
    """
    rows = dict()
    for line in json_input:
        # json.loads() raises on empty input; tolerate blank separator or
        # trailing lines instead of aborting the whole run
        if not line.strip():
            continue
        row = json.loads(line)
        key = row.get('release_id')
        if not key:
            # no fatcat identifier to de-dupe on
            continue
        if key not in rows:
            rows[key] = row
            continue
        # duplicate identifier: merge only the enrichment fields into the
        # row that was seen first
        for other_info in ('cord19_paper', 'fatcat_hit'):
            if other_info in row:
                rows[key][other_info] = row[other_info]

    for row in rows.values():
        print(json.dumps(row, sort_keys=True), file=json_output)