aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_covid19/dedupe.py
blob: 86cf8fa2e32ddaa2b563ff307b7bf91e9cd4702f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

import sys
import json
import datetime


def dedupe_file(json_input, json_output):
    """
    Takes JSON file of "fatcat enriched" content, and de-dupes based on the
    fatcat identifier.

    json_input: iterable of lines, each holding one JSON object (eg, an open
        file handle). Blank / whitespace-only lines are skipped.
    json_output: writable file-like object; receives one JSON object per
        line, serialized with sorted keys.

    Rows lacking a truthy 'release_id' are dropped. The first row seen for a
    given 'release_id' is kept as the base record; for any later row with the
    same identifier, only its 'cord19_paper' and 'fatcat_hit' fields (when
    present) are copied over the base record, and all its other fields are
    ignored.

    All surviving rows are held in memory until the input is exhausted, then
    written out in the dict's iteration order (first-seen order on Python
    versions with insertion-ordered dicts).
    """
    rows = dict()
    for line in json_input:
        # Tolerate empty lines (eg, a trailing newline at the end of a dump)
        # instead of letting json.loads() abort the whole run.
        if not line.strip():
            continue
        row = json.loads(line)
        key = row.get('release_id')
        if not key:
            continue
        if key not in rows:
            rows[key] = row
            continue
        # Duplicate identifier: merge only the enrichment fields; a later
        # value overwrites whatever the base record already had.
        for other_info in ('cord19_paper', 'fatcat_hit'):
            if other_info in row:
                rows[key][other_info] = row[other_info]

    for row in rows.values():
        print(json.dumps(row, sort_keys=True), file=json_output)