aboutsummaryrefslogtreecommitdiffstats
path: root/python/persist_tool.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/persist_tool.py')
-rwxr-xr-xpython/persist_tool.py125
1 files changed, 62 insertions, 63 deletions
diff --git a/python/persist_tool.py b/python/persist_tool.py
index d52f7c1..9160db6 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).
@@ -16,9 +15,7 @@ from sandcrawler.persist import *
def run_cdx(args):
- worker = PersistCdxWorker(
- db_url=args.db_url,
- )
+ worker = PersistCdxWorker(db_url=args.db_url, )
filter_mimetypes = ['application/pdf']
if args.no_mimetype_filter:
filter_mimetypes = None
@@ -32,6 +29,7 @@ def run_cdx(args):
)
pusher.run()
+
def run_grobid(args):
worker = PersistGrobidWorker(
db_url=args.db_url,
@@ -49,24 +47,22 @@ def run_grobid(args):
)
pusher.run()
+
def run_grobid_disk(args):
"""
Writes XML to individual files on disk, and also prints non-XML metadata to
stdout as JSON, which can be redirected to a separate file.
"""
- worker = PersistGrobidDiskWorker(
- output_dir=args.output_dir,
- )
+ worker = PersistGrobidDiskWorker(output_dir=args.output_dir, )
pusher = JsonLinePusher(
worker,
args.json_file,
)
pusher.run()
+
def run_pdftrio(args):
- worker = PersistPdfTrioWorker(
- db_url=args.db_url,
- )
+ worker = PersistPdfTrioWorker(db_url=args.db_url, )
pusher = JsonLinePusher(
worker,
args.json_file,
@@ -74,6 +70,7 @@ def run_pdftrio(args):
)
pusher.run()
+
def run_pdftext(args):
worker = PersistPdfTextWorker(
db_url=args.db_url,
@@ -91,10 +88,9 @@ def run_pdftext(args):
)
pusher.run()
+
def run_ingest_file_result(args):
- worker = PersistIngestFileResultWorker(
- db_url=args.db_url,
- )
+ worker = PersistIngestFileResultWorker(db_url=args.db_url, )
pusher = JsonLinePusher(
worker,
args.json_file,
@@ -102,10 +98,9 @@ def run_ingest_file_result(args):
)
pusher.run()
+
def run_ingest_request(args):
- worker = PersistIngestRequestWorker(
- db_url=args.db_url,
- )
+ worker = PersistIngestRequestWorker(db_url=args.db_url, )
pusher = JsonLinePusher(
worker,
args.json_file,
@@ -113,92 +108,95 @@ def run_ingest_request(args):
)
pusher.run()
+
def main():
- parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+ parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--db-url',
- help="postgresql database connection string",
- default="postgres:///sandcrawler")
- parser.add_argument('--s3-url',
- help="S3 (seaweedfs) backend URL",
- default="localhost:9000")
+ help="postgresql database connection string",
+ default="postgres:///sandcrawler")
+ parser.add_argument('--s3-url', help="S3 (seaweedfs) backend URL", default="localhost:9000")
parser.add_argument('--s3-access-key',
- help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY')
+ or os.environ.get('MINIO_ACCESS_KEY'))
parser.add_argument('--s3-secret-key',
- help="S3 (seaweedfs) credential",
- default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_SECRET_KEY'))
+ help="S3 (seaweedfs) credential",
+ default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY')
+ or os.environ.get('MINIO_SECRET_KEY'))
parser.add_argument('--s3-bucket',
- help="S3 (seaweedfs) bucket to persist into",
- default="sandcrawler-dev")
+ help="S3 (seaweedfs) bucket to persist into",
+ default="sandcrawler-dev")
subparsers = parser.add_subparsers()
- sub_cdx = subparsers.add_parser('cdx',
- help="backfill a CDX file into postgresql cdx table")
+ sub_cdx = subparsers.add_parser('cdx', help="backfill a CDX file into postgresql cdx table")
sub_cdx.set_defaults(func=run_cdx)
sub_cdx.add_argument('cdx_file',
- help="CDX file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_cdx.add_argument('--no-mimetype-filter',
+ help="CDX file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+ sub_cdx.add_argument(
+ '--no-mimetype-filter',
action='store_true',
help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")
- sub_grobid = subparsers.add_parser('grobid',
- help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
+ sub_grobid = subparsers.add_parser(
+ 'grobid', help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_grobid.set_defaults(func=run_grobid)
sub_grobid.add_argument('json_file',
- help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ help="grobid file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
sub_grobid.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_grobid.add_argument('--db-only',
+ action='store_true',
+ help="only upload TEI-XML to S3 (don't write to database)")
+ sub_grobid.add_argument(
+ '--db-only',
action='store_true',
help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
- sub_pdftext = subparsers.add_parser('pdftext',
+ sub_pdftext = subparsers.add_parser(
+ 'pdftext',
help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_pdftext.set_defaults(func=run_pdftext)
sub_pdftext.add_argument('json_file',
- help="pdftext file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ help="pdftext file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
sub_pdftext.add_argument('--s3-only',
- action='store_true',
- help="only upload TEI-XML to S3 (don't write to database)")
- sub_pdftext.add_argument('--db-only',
+ action='store_true',
+ help="only upload TEI-XML to S3 (don't write to database)")
+ sub_pdftext.add_argument(
+ '--db-only',
action='store_true',
help="only write status to sandcrawler-db (don't save TEI-XML to S3)")
sub_grobid_disk = subparsers.add_parser('grobid-disk',
- help="dump GRBOID output to (local) files on disk")
+ help="dump GRBOID output to (local) files on disk")
sub_grobid_disk.set_defaults(func=run_grobid_disk)
sub_grobid_disk.add_argument('json_file',
- help="grobid file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
- sub_grobid_disk.add_argument('output_dir',
- help="base directory to output into",
- type=str)
+ help="grobid file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
+ sub_grobid_disk.add_argument('output_dir', help="base directory to output into", type=str)
- sub_pdftrio = subparsers.add_parser('pdftrio',
+ sub_pdftrio = subparsers.add_parser(
+ 'pdftrio',
help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")
sub_pdftrio.set_defaults(func=run_pdftrio)
sub_pdftrio.add_argument('json_file',
- help="pdftrio file to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ help="pdftrio file to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
- sub_ingest_file_result = subparsers.add_parser('ingest-file-result',
- help="backfill a ingest_file_result JSON dump into postgresql")
+ sub_ingest_file_result = subparsers.add_parser(
+ 'ingest-file-result', help="backfill a ingest_file_result JSON dump into postgresql")
sub_ingest_file_result.set_defaults(func=run_ingest_file_result)
- sub_ingest_file_result.add_argument('json_file',
+ sub_ingest_file_result.add_argument(
+ 'json_file',
help="ingest_file_result file to import from (or '-' for stdin)",
type=argparse.FileType('r'))
- sub_ingest_request = subparsers.add_parser('ingest-request',
- help="backfill a ingest_request JSON dump into postgresql")
+ sub_ingest_request = subparsers.add_parser(
+ 'ingest-request', help="backfill a ingest_request JSON dump into postgresql")
sub_ingest_request.set_defaults(func=run_ingest_request)
sub_ingest_request.add_argument('json_file',
- help="ingest_request to import from (or '-' for stdin)",
- type=argparse.FileType('r'))
+ help="ingest_request to import from (or '-' for stdin)",
+ type=argparse.FileType('r'))
args = parser.parse_args()
if not args.__dict__.get("func"):
@@ -207,5 +205,6 @@ def main():
args.func(args)
+
if __name__ == '__main__':
main()