aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/ingestrequest_row2json.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/scripts/ingestrequest_row2json.py')
-rwxr-xr-xpython/scripts/ingestrequest_row2json.py59
1 files changed, 59 insertions, 0 deletions
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
new file mode 100755
index 0000000..8a353ca
--- /dev/null
+++ b/python/scripts/ingestrequest_row2json.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+This script is used to turn ingest request postgres rows (in JSON export
+format) back into regular ingest request JSON.
+
+The only difference is the name and location of some optional keys.
+"""
+
+import argparse
+import json
+import sys
+
+
def transform(row):
    """
    Convert one exported ingest-request row into a plain ingest request.

    The dict is modified in place and the same object is returned. The
    "created" key is discarded. The nested "request" dict is removed, and
    its "ext_ids" and "edit_extra" entries (when present) are copied to the
    top level. A "release_ident" entry is re-homed as
    {"release_ident": ...} under the "fatcat" key.
    """
    row.pop("created", None)
    # A missing or null "request" value is treated like an empty dict.
    nested = row.pop("request", None)
    if not nested:
        nested = {}
    row.update({key: nested[key] for key in ("ext_ids", "edit_extra") if key in nested})
    if "release_ident" in nested:
        row["fatcat"] = {"release_ident": nested["release_ident"]}
    return row
+
+
def run(args):
    """
    Convert each JSON row read from ``args.json_file`` and print the result.

    Blank lines are skipped. A line that cannot be parsed as JSON, or whose
    parsed value cannot be transformed, is echoed to stderr and skipped.
    Every converted request is written to stdout as one JSON object with
    sorted keys. When ``args.force_recrawl`` is true, the request also gets
    a ``"force_recrawl": true`` entry.

    ``args`` only needs the attributes ``json_file`` (an iterable of text
    lines) and ``force_recrawl`` (a boolean).
    """
    for line in args.json_file:
        if not line.strip():
            continue
        try:
            row = json.loads(line)
            req = transform(row)
        except (ValueError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError; TypeError and
            # AttributeError cover values that parse as JSON but are not
            # shaped like a row object (e.g. a list or a bare string).
            print(line, file=sys.stderr)
            # Skip the line: falling through would hit an undefined `req`
            # or re-emit the request from the previous iteration.
            continue
        if args.force_recrawl:
            req["force_recrawl"] = True
        print(json.dumps(req, sort_keys=True))
+
+
def main():
    """
    Command-line entry point: parse arguments and convert the input file.

    Takes one positional argument, ``json_file``, and an optional
    ``--force-recrawl`` flag, then hands the parsed arguments to ``run``.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
    )
    parser.add_argument(
        "--force-recrawl",
        action="store_true",
        help="whether to add recrawl (SPNv2) flag to request",
    )

    args = parser.parse_args()

    # argparse.FileType opens the file during parsing; close it once all
    # lines have been processed instead of leaving it to interpreter exit.
    with args.json_file:
        run(args)
+
+
+if __name__ == "__main__":
+ main()