path: root/python/ingest_tool.py
author     Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:54:37 -0700
commit     05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
tree       abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/ingest_tool.py
parent     f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
make fmt
Diffstat (limited to 'python/ingest_tool.py')
-rwxr-xr-x  python/ingest_tool.py  71
1 file changed, 35 insertions(+), 36 deletions(-)
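
The commit message "make fmt" indicates the repository's formatting target was run over this file. The aligned continuation lines and split single-element lists in the diff below are characteristic of yapf; here is a minimal sketch of an equivalent invocation (the formatter choice and target path are assumptions inferred from the diff's style, not confirmed by this page):

    # Hedged sketch: re-run the formatter the way a "make fmt" target plausibly would.
    # yapf and the "python/" path are assumptions, not confirmed by this commit page.
    import subprocess

    subprocess.run(["yapf", "--in-place", "--recursive", "python/"], check=True)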
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index c0ef5aa..305c3a8 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -18,7 +18,9 @@ def run_single_ingest(args):
)
if args.force_recrawl:
request['force_recrawl'] = True
- if request['ingest_type'] in ['dataset',]:
+ if request['ingest_type'] in [
+ 'dataset',
+ ]:
ingester = IngestFilesetWorker(
try_spn2=not args.no_spn2,
ingest_file_result_stdout=True,
@@ -32,75 +34,71 @@ def run_single_ingest(args):
print(json.dumps(result, sort_keys=True))
return result
+
def run_requests(args):
# TODO: switch to using JsonLinePusher
file_worker = IngestFileWorker(
try_spn2=not args.no_spn2,
html_quick_mode=args.html_quick_mode,
)
- fileset_worker = IngestFilesetWorker(
- try_spn2=not args.no_spn2,
- )
+ fileset_worker = IngestFilesetWorker(try_spn2=not args.no_spn2, )
for l in args.json_file:
request = json.loads(l.strip())
- if request['ingest_type'] in ['dataset',]:
+ if request['ingest_type'] in [
+ 'dataset',
+ ]:
result = fileset_worker.process(request)
else:
result = file_worker.process(request)
print(json.dumps(result, sort_keys=True))
+
def run_api(args):
port = 8083
print("Listening on localhost:{}".format(port))
server = HTTPServer(('', port), IngestFileRequestHandler)
server.serve_forever()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
subparsers = parser.add_subparsers()
- sub_single= subparsers.add_parser('single',
- help="ingests a single base URL")
+ sub_single = subparsers.add_parser('single', help="ingests a single base URL")
sub_single.set_defaults(func=run_single_ingest)
sub_single.add_argument('ingest_type',
- default="pdf",
- help="type of ingest (pdf, html, etc)")
+ default="pdf",
+ help="type of ingest (pdf, html, etc)")
sub_single.add_argument('--release-id',
- help="(optional) existing release ident to match to")
- sub_single.add_argument('--doi',
- help="(optional) existing release DOI to match to")
+ help="(optional) existing release ident to match to")
+ sub_single.add_argument('--doi', help="(optional) existing release DOI to match to")
sub_single.add_argument('--force-recrawl',
- action='store_true',
- help="ignore GWB history and use SPNv2 to re-crawl")
- sub_single.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
+ action='store_true',
+ help="ignore GWB history and use SPNv2 to re-crawl")
+ sub_single.add_argument('--no-spn2', action='store_true', help="don't use live web (SPNv2)")
sub_single.add_argument('--html-quick-mode',
- action='store_true',
- help="don't fetch individual sub-resources, just use CDX")
- sub_single.add_argument('url',
- help="URL of paper to fetch")
+ action='store_true',
+ help="don't fetch individual sub-resources, just use CDX")
+ sub_single.add_argument('url', help="URL of paper to fetch")
- sub_requests = subparsers.add_parser('requests',
- help="takes a series of ingest requests (JSON, per line) and runs each")
+ sub_requests = subparsers.add_parser(
+ 'requests', help="takes a series of ingest requests (JSON, per line) and runs each")
sub_requests.add_argument('--no-spn2',
- action='store_true',
- help="don't use live web (SPNv2)")
+ action='store_true',
+ help="don't use live web (SPNv2)")
sub_requests.add_argument('--html-quick-mode',
- action='store_true',
- help="don't fetch individual sub-resources, just use CDX")
+ action='store_true',
+ help="don't fetch individual sub-resources, just use CDX")
sub_requests.set_defaults(func=run_requests)
sub_requests.add_argument('json_file',
- help="JSON file (request per line) to import from (or stdin)",
- default=sys.stdin, type=argparse.FileType('r'))
+ help="JSON file (request per line) to import from (or stdin)",
+ default=sys.stdin,
+ type=argparse.FileType('r'))
- sub_api = subparsers.add_parser('api',
- help="starts a simple HTTP server that processes ingest requests")
+ sub_api = subparsers.add_parser(
+ 'api', help="starts a simple HTTP server that processes ingest requests")
sub_api.set_defaults(func=run_api)
- sub_api.add_argument('--port',
- help="HTTP port to listen on",
- default=8033, type=int)
+ sub_api.add_argument('--port', help="HTTP port to listen on", default=8033, type=int)
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -109,5 +107,6 @@ def main():
args.func(args)
+
if __name__ == '__main__':
main()
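
For reference, the `requests` subcommand above reads one JSON request per line, routing an `ingest_type` of `dataset` to IngestFilesetWorker and everything else to IngestFileWorker. A minimal sketch of an input file for it follows; only the `ingest_type` field is confirmed by this diff, and `base_url` is an assumed field name used here for illustration:

    # Hypothetical JSON-lines input for `ingest_tool.py requests`.
    # Only 'ingest_type' appears in the diff above; 'base_url' is an assumption.
    import json

    rows = [
        {"ingest_type": "pdf", "base_url": "https://example.com/paper.pdf"},
        {"ingest_type": "dataset", "base_url": "https://example.com/dataset/"},
    ]
    with open("requests.json", "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")

The resulting file could then be passed as the `json_file` positional argument, or piped in on stdin (the argparse default shown in the diff).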