Diffstat (limited to 'python')
-rwxr-xr-x  python/grobid2json.py         |  1
-rwxr-xr-x  python/grobid_tool.py         | 12
-rwxr-xr-x  python/ingest_file.py         | 12
-rwxr-xr-x  python/kafka_grobid.py        |  3
-rwxr-xr-x  python/sandcrawler_worker.py  | 12
5 files changed, 27 insertions, 13 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index 75fdcba..977c772 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -163,6 +163,7 @@ def teixml2json(content, encumbered=True):
def main(): # pragma no cover
parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="GROBID TEI XML to JSON",
usage="%(prog)s [options] <teifile>...")
parser.add_argument("--no-encumbered",
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index f21d088..a8d5120 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -50,7 +50,8 @@ def run_extract_zipfile(args):
pusher.run()
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--kafka-mode',
action='store_true',
help="send output to Kafka (not stdout)")
@@ -68,19 +69,22 @@ def main():
help="GROBID API host/port")
subparsers = parser.add_subparsers()
- sub_extract_json = subparsers.add_parser('extract-json')
+ sub_extract_json = subparsers.add_parser('extract-json',
+ help="for each JSON line with CDX info, fetches PDF and does GROBID extraction")
sub_extract_json.set_defaults(func=run_extract_json)
sub_extract_json.add_argument('json_file',
help="JSON file to import from (or '-' for stdin)",
type=argparse.FileType('r'))
- sub_extract_cdx = subparsers.add_parser('extract-cdx')
+ sub_extract_cdx = subparsers.add_parser('extract-cdx',
+ help="for each CDX line, fetches PDF and does GROBID extraction")
sub_extract_cdx.set_defaults(func=run_extract_cdx)
sub_extract_cdx.add_argument('cdx_file',
help="CDX file to import from (or '-' for stdin)",
type=argparse.FileType('r'))
- sub_extract_zipfile = subparsers.add_parser('extract-zipfile')
+ sub_extract_zipfile = subparsers.add_parser('extract-zipfile',
+ help="opens zipfile, iterates over PDF files inside and does GROBID extract for each")
sub_extract_zipfile.set_defaults(func=run_extract_zipfile)
sub_extract_zipfile.add_argument('zip_file',
help="zipfile with PDFs to extract",
diff --git a/python/ingest_file.py b/python/ingest_file.py
index fcd2e94..1980e3d 100755
--- a/python/ingest_file.py
+++ b/python/ingest_file.py
@@ -34,13 +34,15 @@ def run_api(args):
server.serve_forever()
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--api-host-url',
default="http://localhost:9411/v0",
help="fatcat API host/port to use")
subparsers = parser.add_subparsers()
- sub_single= subparsers.add_parser('single')
+ sub_single = subparsers.add_parser('single',
+ help="ingests a single file URL")
sub_single.set_defaults(func=run_single_ingest)
sub_single.add_argument('--release-id',
help="(optional) existing release ident to match to")
@@ -49,13 +51,15 @@ def main():
sub_single.add_argument('url',
help="URL of paper to fetch")
- sub_requests = subparsers.add_parser('requests')
+ sub_requests = subparsers.add_parser('requests',
+ help="takes a series of ingest requests (JSON, per line) and runs each")
sub_requests.set_defaults(func=run_requests)
sub_requests.add_argument('json_file',
help="JSON file (request per line) to import from (or stdin)",
default=sys.stdin, type=argparse.FileType('r'))
- sub_api = subparsers.add_parser('api')
+ sub_api = subparsers.add_parser('api',
+ help="starts a simple HTTP server that processes ingest requests")
sub_api.set_defaults(func=run_api)
sub_api.add_argument('--port',
help="HTTP port to listen on",
diff --git a/python/kafka_grobid.py b/python/kafka_grobid.py
index 8d1bd2e..dd6ab63 100755
--- a/python/kafka_grobid.py
+++ b/python/kafka_grobid.py
@@ -295,7 +295,8 @@ class KafkaGrobidWorker:
@sentry_client.capture_exceptions
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--kafka-hosts',
default="localhost:9092",
help="list of Kafka brokers (host/port) to use")
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 81aef5b..f314218 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -47,7 +47,8 @@ def run_ingest_file(args):
pusher.run()
def main():
- parser = argparse.ArgumentParser()
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--kafka-hosts',
default="localhost:9092",
help="list of Kafka brokers (host/port) to use")
@@ -59,13 +60,16 @@ def main():
help="GROBID API host/port")
subparsers = parser.add_subparsers()
- sub_grobid_extract = subparsers.add_parser('grobid-extract')
+ sub_grobid_extract = subparsers.add_parser('grobid-extract',
+ help="daemon that consumes CDX JSON objects from Kafka, extracts, pushes to Kafka")
sub_grobid_extract.set_defaults(func=run_grobid_extract)
- sub_grobid_persist = subparsers.add_parser('grobid-persist')
+ sub_grobid_persist = subparsers.add_parser('grobid-persist',
+ help="daemon that consumes GROBID output from Kafka and pushes to minio and postgres")
sub_grobid_persist.set_defaults(func=run_grobid_persist)
- sub_ingest_file = subparsers.add_parser('ingest-file')
+ sub_ingest_file = subparsers.add_parser('ingest-file',
+ help="daemon that consumes requests from Kafka, ingests, pushes results to Kafka")
sub_ingest_file.set_defaults(func=run_ingest_file)
args = parser.parse_args()
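Taken together, the two changes make ./sandcrawler_worker.py --help self-describing. Roughly what the output becomes, reconstructed from the strings in this diff (usage line abridged, other options omitted):

usage: sandcrawler_worker.py [-h] [--kafka-hosts KAFKA_HOSTS] ...
                             {grobid-extract,grobid-persist,ingest-file} ...

positional arguments:
  {grobid-extract,grobid-persist,ingest-file}
    grobid-extract      daemon that consumes CDX JSON objects from Kafka,
                        extracts, pushes to Kafka
    grobid-persist      daemon that consumes GROBID output from Kafka and
                        pushes to minio and postgres
    ingest-file         daemon that consumes requests from Kafka, ingests,
                        pushes results to Kafka

optional arguments:
  -h, --help            show this help message and exit
  --kafka-hosts KAFKA_HOSTS
                        list of Kafka brokers (host/port) to use
                        (default: localhost:9092)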