aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_covid19/dedupe.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-04-09 17:49:08 -0700
committerBryan Newbold <bnewbold@archive.org>2020-04-09 17:49:08 -0700
commit8b3b3e5892a10bf6748c7824549641d20e2447d7 (patch)
tree493938a53995cf29f5e2f435271c309bd4ce4aa6 /fatcat_covid19/dedupe.py
parent042bd36c25206ff45e305d094028b6482a4c4074 (diff)
downloadfatcat-covid19-8b3b3e5892a10bf6748c7824549641d20e2447d7.tar.gz
fatcat-covid19-8b3b3e5892a10bf6748c7824549641d20e2447d7.zip
add dedupe and query-fatcat commands
Diffstat (limited to 'fatcat_covid19/dedupe.py')
-rw-r--r--fatcat_covid19/dedupe.py27
1 files changed, 27 insertions, 0 deletions
diff --git a/fatcat_covid19/dedupe.py b/fatcat_covid19/dedupe.py
new file mode 100644
index 0000000..86cf8fa
--- /dev/null
+++ b/fatcat_covid19/dedupe.py
@@ -0,0 +1,27 @@
+
+import sys
+import json
+import datetime
+
+
def dedupe_file(json_input, json_output):
    """
    Takes JSON file of "fatcat enriched" content, and de-dupes based on the
    fatcat identifier.

    json_input: iterable of lines, each holding one JSON object (eg, an open
        JSON-lines file). Blank/whitespace-only lines are ignored.
    json_output: writable file-like object; one JSON object is printed per
        line, with keys sorted.

    Rows with a missing or empty 'release_id' are dropped. The first row seen
    for a given 'release_id' is kept as the base record; for any later row
    with the same identifier, only the 'cord19_paper' and 'fatcat_hit' fields
    (when present) are copied over the base record. Nothing is written until
    all input has been consumed, and all unique rows are held in memory.

    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    # fields which a later duplicate row is allowed to overwrite
    merge_fields = ('cord19_paper', 'fatcat_hit')

    rows = {}
    for line in json_input:
        # tolerate empty lines (eg, a stray trailing newline) instead of
        # crashing inside json.loads()
        if not line.strip():
            continue
        row = json.loads(line)
        key = row.get('release_id')
        if not key:
            continue
        if key not in rows:
            rows[key] = row
            continue
        for other_info in merge_fields:
            if other_info in row:
                rows[key][other_info] = row[other_info]

    # dicts preserve insertion order (Python 3.7+), so output follows the
    # order in which each release_id was first seen
    for row in rows.values():
        print(json.dumps(row, sort_keys=True), file=json_output)
+