From 7430ddbcdec76091220de474060b968f0ef1bb70 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Sep 2021 15:08:47 -0700 Subject: rename some python files for clarity --- python/ingest_file.py | 100 -------------------------------------------------- 1 file changed, 100 deletions(-) delete mode 100755 python/ingest_file.py (limited to 'python/ingest_file.py') diff --git a/python/ingest_file.py b/python/ingest_file.py deleted file mode 100755 index 20b6d67..0000000 --- a/python/ingest_file.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json -import argparse - -from http.server import HTTPServer -from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker - - -def run_single_ingest(args): - request = dict( - ingest_type=args.ingest_type, - base_url=args.url, - ext_ids=dict(doi=args.doi), - fatcat=dict(release_ident=args.release_id), - ) - if args.force_recrawl: - request['force_recrawl'] = True - ingester = IngestFileWorker( - try_spn2=not args.no_spn2, - html_quick_mode=args.html_quick_mode, - ) - result = ingester.process(request) - print(json.dumps(result, sort_keys=True)) - return result - -def run_requests(args): - # TODO: switch to using JsonLinePusher - ingester = IngestFileWorker( - try_spn2=not args.no_spn2, - html_quick_mode=args.html_quick_mode, - ) - for l in args.json_file: - request = json.loads(l.strip()) - result = ingester.process(request) - print(json.dumps(result, sort_keys=True)) - -def run_api(args): - port = 8083 - print("Listening on localhost:{}".format(port)) - server = HTTPServer(('', port), IngestFileRequestHandler) - server.serve_forever() - -def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - subparsers = parser.add_subparsers() - - sub_single= subparsers.add_parser('single', - help="ingests a single file URL") - sub_single.set_defaults(func=run_single_ingest) - sub_single.add_argument('--release-id', - help="(optional) existing release ident to match to") - sub_single.add_argument('--doi', - help="(optional) existing release DOI to match to") - sub_single.add_argument('--force-recrawl', - action='store_true', - help="ignore GWB history and use SPNv2 to re-crawl") - sub_single.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") - sub_single.add_argument('--ingest-type', - default="pdf", - help="type of ingest (pdf, html, etc)") - sub_single.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") - sub_single.add_argument('url', - help="URL of paper to fetch") - - sub_requests = subparsers.add_parser('requests', - help="takes a series of ingest requests (JSON, per line) and runs each") - sub_requests.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") - sub_requests.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") - sub_requests.set_defaults(func=run_requests) - sub_requests.add_argument('json_file', - help="JSON file (request per line) to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) - - sub_api = subparsers.add_parser('api', - help="starts a simple HTTP server that processes ingest requests") - sub_api.set_defaults(func=run_api) - sub_api.add_argument('--port', - help="HTTP port to listen on", - default=8033, type=int) - - args = parser.parse_args() - if not args.__dict__.get("func"): - parser.print_help(file=sys.stderr) - sys.exit(-1) - - args.func(args) - -if __name__ == '__main__': - main() -- cgit v1.2.3