diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/ingest_tool.py | |
parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip |
make fmt
Diffstat (limited to 'python/ingest_tool.py')
-rwxr-xr-x | python/ingest_tool.py | 71 |
1 file changed, 35 insertions(+), 36 deletions(-)
diff --git a/python/ingest_tool.py b/python/ingest_tool.py index c0ef5aa..305c3a8 100755 --- a/python/ingest_tool.py +++ b/python/ingest_tool.py @@ -18,7 +18,9 @@ def run_single_ingest(args): ) if args.force_recrawl: request['force_recrawl'] = True - if request['ingest_type'] in ['dataset',]: + if request['ingest_type'] in [ + 'dataset', + ]: ingester = IngestFilesetWorker( try_spn2=not args.no_spn2, ingest_file_result_stdout=True, @@ -32,75 +34,71 @@ def run_single_ingest(args): print(json.dumps(result, sort_keys=True)) return result + def run_requests(args): # TODO: switch to using JsonLinePusher file_worker = IngestFileWorker( try_spn2=not args.no_spn2, html_quick_mode=args.html_quick_mode, ) - fileset_worker = IngestFilesetWorker( - try_spn2=not args.no_spn2, - ) + fileset_worker = IngestFilesetWorker(try_spn2=not args.no_spn2, ) for l in args.json_file: request = json.loads(l.strip()) - if request['ingest_type'] in ['dataset',]: + if request['ingest_type'] in [ + 'dataset', + ]: result = fileset_worker.process(request) else: result = file_worker.process(request) print(json.dumps(result, sort_keys=True)) + def run_api(args): port = 8083 print("Listening on localhost:{}".format(port)) server = HTTPServer(('', port), IngestFileRequestHandler) server.serve_forever() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) subparsers = parser.add_subparsers() - sub_single= subparsers.add_parser('single', - help="ingests a single base URL") + sub_single = subparsers.add_parser('single', help="ingests a single base URL") sub_single.set_defaults(func=run_single_ingest) sub_single.add_argument('ingest_type', - default="pdf", - help="type of ingest (pdf, html, etc)") + default="pdf", + help="type of ingest (pdf, html, etc)") sub_single.add_argument('--release-id', - help="(optional) existing release ident to match to") - 
sub_single.add_argument('--doi', - help="(optional) existing release DOI to match to") + help="(optional) existing release ident to match to") + sub_single.add_argument('--doi', help="(optional) existing release DOI to match to") sub_single.add_argument('--force-recrawl', - action='store_true', - help="ignore GWB history and use SPNv2 to re-crawl") - sub_single.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") + action='store_true', + help="ignore GWB history and use SPNv2 to re-crawl") + sub_single.add_argument('--no-spn2', action='store_true', help="don't use live web (SPNv2)") sub_single.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") - sub_single.add_argument('url', - help="URL of paper to fetch") + action='store_true', + help="don't fetch individual sub-resources, just use CDX") + sub_single.add_argument('url', help="URL of paper to fetch") - sub_requests = subparsers.add_parser('requests', - help="takes a series of ingest requests (JSON, per line) and runs each") + sub_requests = subparsers.add_parser( + 'requests', help="takes a series of ingest requests (JSON, per line) and runs each") sub_requests.add_argument('--no-spn2', - action='store_true', - help="don't use live web (SPNv2)") + action='store_true', + help="don't use live web (SPNv2)") sub_requests.add_argument('--html-quick-mode', - action='store_true', - help="don't fetch individual sub-resources, just use CDX") + action='store_true', + help="don't fetch individual sub-resources, just use CDX") sub_requests.set_defaults(func=run_requests) sub_requests.add_argument('json_file', - help="JSON file (request per line) to import from (or stdin)", - default=sys.stdin, type=argparse.FileType('r')) + help="JSON file (request per line) to import from (or stdin)", + default=sys.stdin, + type=argparse.FileType('r')) - sub_api = subparsers.add_parser('api', - help="starts a simple HTTP server that processes 
ingest requests") + sub_api = subparsers.add_parser( + 'api', help="starts a simple HTTP server that processes ingest requests") sub_api.set_defaults(func=run_api) - sub_api.add_argument('--port', - help="HTTP port to listen on", - default=8033, type=int) + sub_api.add_argument('--port', help="HTTP port to listen on", default=8033, type=int) args = parser.parse_args() if not args.__dict__.get("func"): @@ -109,5 +107,6 @@ def main(): args.func(args) + if __name__ == '__main__': main() |