aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-05 23:24:10 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-05 23:24:10 -0800
commit a38da6b326cdcb5ed950ca6dfe1cdbd6d72d7d7d (patch)
tree 207cfe78c9bd50bd7124f4e95383ed46cc37c094 /python
parent d4233fe651043b6bb8175bb0d22fbab95b11fb70 (diff)
downloadsandcrawler-a38da6b326cdcb5ed950ca6dfe1cdbd6d72d7d7d.tar.gz
sandcrawler-a38da6b326cdcb5ed950ca6dfe1cdbd6d72d7d7d.zip
add ingestrequest_row2json.py
Diffstat (limited to 'python')
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py48
1 files changed, 48 insertions, 0 deletions
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
new file mode 100755
index 0000000..065dd3b
--- /dev/null
+++ b/python/scripts/ingestrequest_row2json.py
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+
+"""
+This script is used to turn ingest request postgres rows (in JSON export
+format) back into regular ingest request JSON.
+
+The only difference is the name and location of some optional keys.
+"""
+
+import sys
+import json
+import argparse
+
+
def transform(row):
    """
    Convert one ingest request row (as a dict) into an ingest request dict.

    The 'created' key is dropped. The nested 'request' object is removed,
    and some of its optional keys are relocated:

    - 'ext_ids' and 'edit_extra' are copied to the top level
    - 'release_ident' is wrapped as {'fatcat': {'release_ident': ...}}

    Any other keys under 'request' are discarded. A missing or null
    'request' is treated as empty.

    The input dict is not modified; a new dict (shallow copy) is returned.
    """
    # Work on a copy so the caller's row keeps 'created' and 'request'.
    req = dict(row)
    req.pop('created', None)
    # 'request' may be absent or null in the export; treat both as empty.
    extra = req.pop('request', None) or {}
    for k in ('ext_ids', 'edit_extra'):
        if k in extra:
            req[k] = extra[k]
    if 'release_ident' in extra:
        req['fatcat'] = dict(release_ident=extra['release_ident'])
    return req
+
def run(args):
    """
    Convert every non-blank line of args.json_file.

    Each line is parsed as JSON, passed through transform(), and printed
    to stdout as a single JSON line with sorted keys.
    """
    for raw_line in args.json_file:
        if raw_line.strip():
            request = transform(json.loads(raw_line))
            print(json.dumps(request, sort_keys=True))
+
def main():
    """
    Command-line entry point: parse arguments, then convert the input file.

    Takes one positional argument, 'json_file', which argparse opens for
    reading; the parsed arguments are handed to run().
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('json_file',
        help="ingest request rows file to use (postgres JSON export, one row per line)",
        type=argparse.FileType('r'))

    args = parser.parse_args()

    run(args)


if __name__ == '__main__':
    main()