blob: 86cf8fa2e32ddaa2b563ff307b7bf91e9cd4702f (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
import sys
import json
import datetime
def dedupe_file(json_input, json_output):
    """
    Takes JSON file of "fatcat enriched" content, and de-dupes based on the
    fatcat identifier.

    json_input is iterated line by line; each non-blank line is parsed as one
    JSON object. json_output is a writable text stream that receives one JSON
    object per line, serialized with sorted keys.

    Merge rules:

    - Rows whose 'release_id' is missing or falsy are dropped.
    - The first row seen for a given 'release_id' is kept as the base record.
    - For any later row with the same 'release_id', only its 'cord19_paper'
      and 'fatcat_hit' values (when present) are copied onto the base record,
      overwriting earlier values; all of its other fields are ignored.

    Every row is held in memory until the input is exhausted; output is then
    written in first-seen order of 'release_id'.

    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    rows = dict()
    for line in json_input:
        # Blank/whitespace-only lines would make json.loads() raise and abort
        # the whole run; they carry no record, so skip them.
        if not line.strip():
            continue
        row = json.loads(line)
        key = row.get('release_id')
        if not key:
            continue
        if key not in rows:
            rows[key] = row
            continue
        # Duplicate identifier: only fold in the enrichment sub-documents,
        # keeping every other field from the first occurrence.
        for other_info in ('cord19_paper', 'fatcat_hit'):
            if other_info in row:
                rows[key][other_info] = row[other_info]

    for row in rows.values():
        print(json.dumps(row, sort_keys=True), file=json_output)
|