aboutsummaryrefslogtreecommitdiffstats
path: root/python/ingest_file.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/ingest_file.py')
-rwxr-xr-xpython/ingest_file.py100
1 file changed, 0 insertions, 100 deletions
diff --git a/python/ingest_file.py b/python/ingest_file.py
deleted file mode 100755
index 20b6d67..0000000
--- a/python/ingest_file.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-import argparse
-
-from http.server import HTTPServer
-from sandcrawler.ingest import IngestFileRequestHandler, IngestFileWorker
-
-
def run_single_ingest(args):
    """Build one ingest request from CLI args, run it, and print the JSON result.

    Returns the result dict so callers/tests can inspect it.
    """
    request = {
        'ingest_type': args.ingest_type,
        'base_url': args.url,
        'ext_ids': {'doi': args.doi},
        'fatcat': {'release_ident': args.release_id},
    }
    if args.force_recrawl:
        request['force_recrawl'] = True
    worker = IngestFileWorker(
        try_spn2=not args.no_spn2,
        html_quick_mode=args.html_quick_mode,
    )
    result = worker.process(request)
    print(json.dumps(result, sort_keys=True))
    return result
-
def run_requests(args):
    """Process a stream of ingest requests, one JSON object per input line.

    Each result is printed to stdout as a sorted-key JSON line.
    """
    # TODO: switch to using JsonLinePusher
    worker = IngestFileWorker(
        try_spn2=not args.no_spn2,
        html_quick_mode=args.html_quick_mode,
    )
    for line in args.json_file:
        result = worker.process(json.loads(line.strip()))
        print(json.dumps(result, sort_keys=True))
-
def run_api(args):
    """Start a blocking HTTP server that handles ingest requests.

    Fix: honor the ``--port`` CLI argument instead of a hard-coded value.
    The original always listened on 8083, silently ignoring the ``--port``
    option defined by the argument parser. Falls back to 8083 (the old
    behavior) if ``args`` carries no ``port`` attribute.
    """
    port = getattr(args, 'port', 8083)
    print("Listening on localhost:{}".format(port))
    server = HTTPServer(('', port), IngestFileRequestHandler)
    # serve_forever() blocks until the process is killed
    server.serve_forever()
-
def main():
    """Parse CLI arguments and dispatch to the selected subcommand.

    Subcommands:
      single   - ingest a single URL
      requests - process a JSON-lines stream of ingest requests
      api      - run a simple HTTP ingest server
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    sub_single = subparsers.add_parser('single',
        help="ingests a single file URL")
    sub_single.set_defaults(func=run_single_ingest)
    sub_single.add_argument('--release-id',
        help="(optional) existing release ident to match to")
    sub_single.add_argument('--doi',
        help="(optional) existing release DOI to match to")
    sub_single.add_argument('--force-recrawl',
        action='store_true',
        help="ignore GWB history and use SPNv2 to re-crawl")
    sub_single.add_argument('--no-spn2',
        action='store_true',
        help="don't use live web (SPNv2)")
    sub_single.add_argument('--ingest-type',
        default="pdf",
        help="type of ingest (pdf, html, etc)")
    sub_single.add_argument('--html-quick-mode',
        action='store_true',
        help="don't fetch individual sub-resources, just use CDX")
    sub_single.add_argument('url',
        help="URL of paper to fetch")

    sub_requests = subparsers.add_parser('requests',
        help="takes a series of ingest requests (JSON, per line) and runs each")
    sub_requests.add_argument('--no-spn2',
        action='store_true',
        help="don't use live web (SPNv2)")
    sub_requests.add_argument('--html-quick-mode',
        action='store_true',
        help="don't fetch individual sub-resources, just use CDX")
    sub_requests.set_defaults(func=run_requests)
    sub_requests.add_argument('json_file',
        help="JSON file (request per line) to import from (or stdin)",
        default=sys.stdin, type=argparse.FileType('r'))

    sub_api = subparsers.add_parser('api',
        help="starts a simple HTTP server that processes ingest requests")
    sub_api.set_defaults(func=run_api)
    sub_api.add_argument('--port',
        help="HTTP port to listen on",
        # fix: was 8033, which run_api never used; 8083 matches the port
        # the server actually listened on
        default=8083, type=int)

    args = parser.parse_args()
    # no subcommand selected: show usage and exit with an error status
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    args.func(args)


if __name__ == '__main__':
    main()