diff options
Diffstat (limited to 'python/scripts/arabesque2ingestrequest.py')
-rwxr-xr-x | python/scripts/arabesque2ingestrequest.py | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py new file mode 100755 index 0000000..4561541 --- /dev/null +++ b/python/scripts/arabesque2ingestrequest.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +""" +This script is intended to be used for backfill ingest of old crawls. It can +also be used as a fast path for getting freshly crawled content into fatcat if +the crawl was a hit and the arabesque JSON was exported conservatively. + +Run like: + + ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json + +Can then run through requests using that tool, or dump into kafka queue. +""" + +import argparse +import json +import sys + + +def run(args): + for l in args.json_file: + if not l.strip(): + continue + row = json.loads(l) + if not row["hit"]: + continue + + request = { + "base_url": row["final_url"], + "ingest_type": args.ingest_type, + "link_source": args.link_source, + "link_source_id": row["identifier"], + "ingest_request_source": args.ingest_request_source, + "ext_ids": { + args.extid_type: row["identifier"], + }, + } + if args.release_stage: + assert args.release_stage in ( + "published", + "submitted", + "accepted", + "draft", + "update", + ) + request["release_stage"] = args.release_stage + + print("{}".format(json.dumps(request, sort_keys=True))) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "--link-source", required=True, help="link_source to include in request" + ) + parser.add_argument("--extid-type", required=True, help="extid to encode identifier as") + parser.add_argument( + "--ingest-type", default="pdf", help="ingest type (pdf, html, xml, etc)" + ) + parser.add_argument( + "--ingest-request-source", default="arabesque", help="to include in request" + ) + parser.add_argument("--release-stage", default=None, help="to include in request") + parser.add_argument( + "json_file", help="arabesque output file to use", type=argparse.FileType("r") + ) + subparsers = parser.add_subparsers() + + args = parser.parse_args() + + run(args) + + +if __name__ == "__main__": + main() |