diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-09 17:49:08 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-09 17:49:08 -0700 |
commit | 8b3b3e5892a10bf6748c7824549641d20e2447d7 (patch) | |
tree | 493938a53995cf29f5e2f435271c309bd4ce4aa6 /fatcat_covid19/dedupe.py | |
parent | 042bd36c25206ff45e305d094028b6482a4c4074 (diff) | |
download | fatcat-covid19-8b3b3e5892a10bf6748c7824549641d20e2447d7.tar.gz fatcat-covid19-8b3b3e5892a10bf6748c7824549641d20e2447d7.zip |
add dedupe and query-fatcat commands
Diffstat (limited to 'fatcat_covid19/dedupe.py')
-rw-r--r-- | fatcat_covid19/dedupe.py | 27 |
1 files changed, 27 insertions, 0 deletions
import sys
import json
import datetime


def dedupe_file(json_input, json_output):
    """
    Takes JSON file of "fatcat enriched" content, and de-dupes based on the
    fatcat identifier.

    json_input: iterable of lines (e.g. an open file), each holding one JSON
        object. Blank or whitespace-only lines are ignored.
    json_output: writable file-like object; receives one JSON object per
        line, serialized with sorted keys.

    Rows without a truthy 'release_id' are dropped. The first row seen for a
    given 'release_id' is kept as the base record; later rows with the same
    'release_id' only contribute their 'cord19_paper' and 'fatcat_hit'
    values, which overwrite whatever the base record held for those keys.
    Records are written in the order their 'release_id' was first seen.

    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
    """
    # The only fields carried over from a duplicate row into the kept row.
    merged_fields = ('cord19_paper', 'fatcat_hit')

    rows = dict()
    for line in json_input:
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record; skip them instead of letting json.loads() abort the run.
        if not line.strip():
            continue
        record = json.loads(line)
        key = record.get('release_id')
        if not key:
            continue
        if key not in rows:
            rows[key] = record
            continue
        for field in merged_fields:
            if field in record:
                rows[key][field] = record[field]

    for row in rows.values():
        print(json.dumps(row, sort_keys=True), file=json_output)