aboutsummaryrefslogtreecommitdiffstats
path: root/python/scripts/arabesque2ingestrequest.py
blob: 45615415b7446b3d6a111a856163615387e8fbc4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
"""
This script is intended to be used for backfill ingest of old crawls. It can
also be used as a fast path for getting freshly crawled content into fatcat if
the crawl was a hit and the arabesque JSON was exported conservatively.

Run like:

    ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json

Can then run through requests using that tool, or dump into kafka queue.
"""

import argparse
import json
import sys


def run(args):
    """Convert arabesque crawl rows into ingest requests, one JSON object per line.

    Iterates over ``args.json_file``, parses every non-blank line as JSON,
    skips rows whose ``hit`` field is falsy, and prints one ingest request
    (serialized with sorted keys) to stdout for each remaining row.

    Args:
        args: Namespace-like object providing ``json_file`` (an iterable of
            JSON lines), ``ingest_type``, ``link_source``, ``extid_type``,
            ``ingest_request_source`` and ``release_stage``.

    Raises:
        ValueError: If ``args.release_stage`` is set to an unsupported value.
    """
    valid_release_stages = (
        "published",
        "submitted",
        "accepted",
        "draft",
        "update",
    )
    release_stage = args.release_stage
    # Validate once, before touching the input, and with a real exception:
    # an ``assert`` disappears under ``python -O`` and, placed inside the loop,
    # would only fire after the first hit row had been read.
    if release_stage and release_stage not in valid_release_stages:
        raise ValueError(
            "unsupported release_stage {!r}; expected one of: {}".format(
                release_stage, ", ".join(valid_release_stages)
            )
        )

    for line in args.json_file:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        if not line.strip():
            continue
        row = json.loads(line)
        # Only rows flagged as hits are turned into requests.
        if not row["hit"]:
            continue

        request = {
            "base_url": row["final_url"],
            "ingest_type": args.ingest_type,
            "link_source": args.link_source,
            "link_source_id": row["identifier"],
            "ingest_request_source": args.ingest_request_source,
            "ext_ids": {
                args.extid_type: row["identifier"],
            },
        }
        # release_stage is optional; the key is omitted entirely when unset.
        if release_stage:
            request["release_stage"] = release_stage

        print(json.dumps(request, sort_keys=True))


def main():
    """Parse command-line arguments and write ingest requests to stdout.

    Builds the argument parser (link source, external-id type, ingest type,
    request source, optional release stage and the input file), parses
    ``sys.argv`` and hands the resulting namespace to ``run``.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--link-source", required=True, help="link_source to include in request"
    )
    parser.add_argument("--extid-type", required=True, help="extid to encode identifier as")
    parser.add_argument(
        "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)"
    )
    parser.add_argument(
        "--ingest-request-source", default="arabesque", help="to include in request"
    )
    parser.add_argument("--release-stage", default=None, help="to include in request")
    parser.add_argument(
        "json_file", help="arabesque output file to use", type=argparse.FileType("r")
    )
    # No sub-commands are defined, so no subparsers group is registered: an
    # empty one only adds a spurious "{}" positional to the usage/help text.

    args = parser.parse_args()

    # argparse.FileType opens the input eagerly; close it once processing ends.
    with args.json_file:
        run(args)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()