author     Bryan Newbold <bnewbold@archive.org>  2019-12-24 16:32:52 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2019-12-24 16:33:18 -0800
commit     1b2913491beabfc29f8ad802b9c3d23c9d44d8fa (patch)
tree       eeff55eb4f8d24fa3dd8b1e2c808411cc10ed602 /python
parent     1c538a10ac1f6e59ac56a6fbb9d583957e05f9f1 (diff)
download   sandcrawler-1b2913491beabfc29f8ad802b9c3d23c9d44d8fa.tar.gz
           sandcrawler-1b2913491beabfc29f8ad802b9c3d23c9d44d8fa.zip
basic arabesque2ingestrequest script
Diffstat (limited to 'python')
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py  69
1 file changed, 69 insertions, 0 deletions
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
new file mode 100755
index 0000000..5cafdcf
--- /dev/null
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python3
+
+"""
+This script is intended to be used for backfill ingest of old crawls. It can
+also be used as a fast path for getting freshly crawled content into fatcat if
+the crawl was a hit and the arabesque JSON was exported conservatively.
+
+Run like:
+
+ ./arabesque2ingestrequest.py example_arabesque.json --link-source pmc --extid-type pmcid > ingest_requests.json
+
+The resulting requests can then be run through the ingest tool, or dumped into a Kafka queue.
+"""
+
+import sys
+import json
+import argparse
+
+
+def run(args):
+    for line in args.json_file:
+        if not line.strip():
+            continue
+        row = json.loads(line)
+ if not row['hit']:
+ continue
+
+ request = {
+ 'base_url': row['final_url'],
+ 'ingest_type': 'pdf',
+ 'link_source': args.link_source,
+ 'link_source_id': row['identifier'],
+ 'ingest_request_source': args.ingest_request_source,
+ 'ext_ids': {
+ args.extid_type: row['identifier'],
+ },
+ }
+ if args.release_stage:
+ assert args.release_stage in ('published', 'submitted', 'accepted', 'draft', 'update')
+ request['release_stage'] = args.release_stage
+
+        print(json.dumps(request, sort_keys=True))
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser.add_argument('--link-source',
+ required=True,
+ help="link_source to include in request")
+ parser.add_argument('--extid-type',
+ required=True,
+ help="extid to encode identifier as")
+ parser.add_argument('--ingest-request-source',
+ default="arabesque",
+        help="ingest_request_source to include in request")
+ parser.add_argument('--release-stage',
+ default=None,
+        help="release_stage to include in request")
+ parser.add_argument('json_file',
+ help="arabesque output file to use",
+ type=argparse.FileType('r'))
+ subparsers = parser.add_subparsers()
+
+ args = parser.parse_args()
+
+ run(args)
+
+if __name__ == '__main__':
+ main()
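
The docstring above notes that the generated requests can be dumped into a Kafka queue. A minimal sketch of that path, assuming the script's output was saved to ingest_requests.json, that confluent_kafka is installed, and using placeholder broker and topic names (the real sandcrawler Kafka configuration is not part of this commit):

    from confluent_kafka import Producer

    # Broker address and topic name below are placeholders, not taken from this
    # commit; substitute the actual sandcrawler ingest-request topic in use.
    producer = Producer({"bootstrap.servers": "localhost:9092"})

    with open("ingest_requests.json") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Each line is already a complete JSON ingest request; forward it as-is.
            producer.produce("sandcrawler-ENV.ingest-file-requests", line.encode("utf-8"))

    # Block until all queued messages have been delivered.
    producer.flush()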