diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-05 23:24:10 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-05 23:24:10 -0800 |
commit | a38da6b326cdcb5ed950ca6dfe1cdbd6d72d7d7d (patch) | |
tree | 207cfe78c9bd50bd7124f4e95383ed46cc37c094 | |
parent | d4233fe651043b6bb8175bb0d22fbab95b11fb70 (diff) | |
download | sandcrawler-a38da6b326cdcb5ed950ca6dfe1cdbd6d72d7d7d.tar.gz sandcrawler-a38da6b326cdcb5ed950ca6dfe1cdbd6d72d7d7d.zip |
add ingestrequest_row2json.py
-rwxr-xr-x | python/scripts/ingestrequest_row2json.py | 48 |
1 files changed, 48 insertions, 0 deletions
#!/usr/bin/python3
# File: python/scripts/ingestrequest_row2json.py (new file, mode 100755)

"""
This script is used to turn ingest request postgres rows (in JSON export
format) back into regular ingest request JSON.

The only difference is the name and location of some optional keys.
"""

import sys
import json
import argparse


def transform(row):
    """
    Convert one exported row (dict) into an ingest request (dict).

    - the 'created' key is dropped
    - the nested 'request' value (may be missing or null) is removed; its
      'ext_ids' and 'edit_extra' entries, when present, move to the top level
    - a 'release_ident' entry in 'request' becomes
      {'fatcat': {'release_ident': ...}}

    Every other key is passed through unchanged. The result is a new dict;
    the caller's `row` is not modified.
    """
    # Shallow copy so popping keys below does not alter the caller's dict.
    req = dict(row)
    req.pop('created', None)
    # 'request' may be absent or an explicit null; treat both as "no extras".
    extra = req.pop('request', None) or {}
    for key in ('ext_ids', 'edit_extra'):
        if key in extra:
            req[key] = extra[key]
    if 'release_ident' in extra:
        req['fatcat'] = dict(release_ident=extra['release_ident'])
    return req


def run(args):
    """
    Read one JSON row per line from args.json_file (blank lines are skipped)
    and print each transformed request as a single JSON line with sorted keys.
    """
    for line in args.json_file:
        if not line.strip():
            continue
        req = transform(json.loads(line))
        print(json.dumps(req, sort_keys=True))


def main():
    """
    Parse command-line arguments and convert the given file to stdout.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="ingest request rows, postgres JSON export (one object per line)",
        type=argparse.FileType('r'))
    # NOTE: no sub-commands are defined, so no subparsers group is registered
    # (an empty one only adds a confusing "{} ..." to the usage line).

    args = parser.parse_args()

    run(args)


if __name__ == '__main__':
    main()