aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/ingestrequest_row2json.py
blob: 8a353ca5dcef3af0e6ddc9f52b45ecb8fb934d0d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
"""
This script is used to turn ingest request postgres rows (in JSON export
format) back into regular ingest request JSON.

The only difference is the name and location of some optional keys.
"""

import argparse
import json
import sys


def transform(row):
    """
    Reshape one exported row (dict) into an ingest request (dict).

    The passed dict is modified in place and is also the return value:

    - the "created" key is dropped
    - the "request" key is removed; its "ext_ids" and "edit_extra" entries
      (when present) are copied to the top level
    - a "release_ident" entry inside "request" becomes
      {"fatcat": {"release_ident": ...}}
    """
    row.pop("created", None)
    nested = row.pop("request", None)
    if not nested:
        # missing, null, or empty "request": nothing to hoist
        nested = {}
    row.update({key: nested[key] for key in ("ext_ids", "edit_extra") if key in nested})
    if "release_ident" in nested:
        row["fatcat"] = {"release_ident": nested["release_ident"]}
    return row


def run(args):
    """
    Convert each JSON line of args.json_file and print the result to stdout.

    Blank lines are skipped. A line that can not be parsed or transformed is
    echoed to stderr and skipped, so that bad input never produces output.
    When args.force_recrawl is set, "force_recrawl": true is added to every
    request. Output is one JSON object per line, with sorted keys.
    """
    for line in args.json_file:
        if not line.strip():
            continue
        try:
            req = transform(json.loads(line))
        except (ValueError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError; TypeError/AttributeError
            # cover valid JSON that is not an object (list, number, null, ...).
            print(line, file=sys.stderr)
            # Skip the bad line: without this, `req` would be unbound (first
            # line) or would still hold the previous request, which would
            # then be printed a second time.
            continue
        if args.force_recrawl:
            req["force_recrawl"] = True
        print(json.dumps(req, sort_keys=True))


def main():
    """
    Command-line entry point: parse arguments and process the input file.

    Positional argument "json_file" is opened for reading by argparse
    ("-" means stdin); the optional "--force-recrawl" flag is passed through
    to run().
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "json_file", help="SQL output JSON file to process", type=argparse.FileType("r")
    )
    parser.add_argument(
        "--force-recrawl",
        action="store_true",
        help="whether to add recrawl (SPNv2) flag to request",
    )

    args = parser.parse_args()

    try:
        run(args)
    finally:
        # argparse.FileType opens the file but never closes it; leave stdin
        # alone since this script does not own it.
        if args.json_file is not sys.stdin:
            args.json_file.close()


# Run the command-line interface only when executed as a script, not when
# this module is imported.
if __name__ == "__main__":
    main()