From 05bd7cbcc62588e431c5efd533189e246b2a997e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 12:54:37 -0700 Subject: make fmt --- python/persist_tool.py | 125 ++++++++++++++++++++++++------------------------- 1 file changed, 62 insertions(+), 63 deletions(-) (limited to 'python/persist_tool.py') diff --git a/python/persist_tool.py b/python/persist_tool.py index d52f7c1..9160db6 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 - """ Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs). @@ -16,9 +15,7 @@ from sandcrawler.persist import * def run_cdx(args): - worker = PersistCdxWorker( - db_url=args.db_url, - ) + worker = PersistCdxWorker(db_url=args.db_url, ) filter_mimetypes = ['application/pdf'] if args.no_mimetype_filter: filter_mimetypes = None @@ -32,6 +29,7 @@ def run_cdx(args): ) pusher.run() + def run_grobid(args): worker = PersistGrobidWorker( db_url=args.db_url, @@ -49,24 +47,22 @@ def run_grobid(args): ) pusher.run() + def run_grobid_disk(args): """ Writes XML to individual files on disk, and also prints non-XML metadata to stdout as JSON, which can be redirected to a separate file. """ - worker = PersistGrobidDiskWorker( - output_dir=args.output_dir, - ) + worker = PersistGrobidDiskWorker(output_dir=args.output_dir, ) pusher = JsonLinePusher( worker, args.json_file, ) pusher.run() + def run_pdftrio(args): - worker = PersistPdfTrioWorker( - db_url=args.db_url, - ) + worker = PersistPdfTrioWorker(db_url=args.db_url, ) pusher = JsonLinePusher( worker, args.json_file, @@ -74,6 +70,7 @@ def run_pdftrio(args): ) pusher.run() + def run_pdftext(args): worker = PersistPdfTextWorker( db_url=args.db_url, @@ -91,10 +88,9 @@ def run_pdftext(args): ) pusher.run() + def run_ingest_file_result(args): - worker = PersistIngestFileResultWorker( - db_url=args.db_url, - ) + worker = PersistIngestFileResultWorker(db_url=args.db_url, ) pusher = JsonLinePusher( worker, args.json_file, @@ -102,10 +98,9 @@ def run_ingest_file_result(args): ) pusher.run() + def run_ingest_request(args): - worker = PersistIngestRequestWorker( - db_url=args.db_url, - ) + worker = PersistIngestRequestWorker(db_url=args.db_url, ) pusher = JsonLinePusher( worker, args.json_file, @@ -113,92 +108,95 @@ def run_ingest_request(args): ) pusher.run() + def main(): - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--db-url', - help="postgresql database connection string", - default="postgres:///sandcrawler") - parser.add_argument('--s3-url', - help="S3 (seaweedfs) backend URL", - default="localhost:9000") + help="postgresql database connection string", + default="postgres:///sandcrawler") + parser.add_argument('--s3-url', help="S3 (seaweedfs) backend URL", default="localhost:9000") parser.add_argument('--s3-access-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY')) + help="S3 (seaweedfs) credential", + default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') + or os.environ.get('MINIO_ACCESS_KEY')) parser.add_argument('--s3-secret-key', - help="S3 (seaweedfs) credential", - default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_SECRET_KEY')) + help="S3 (seaweedfs) credential", + default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') + or os.environ.get('MINIO_SECRET_KEY')) parser.add_argument('--s3-bucket', - help="S3 (seaweedfs) bucket to persist into", - default="sandcrawler-dev") + help="S3 (seaweedfs) bucket to persist into", + default="sandcrawler-dev") subparsers = parser.add_subparsers() - sub_cdx = subparsers.add_parser('cdx', - help="backfill a CDX file into postgresql cdx table") + sub_cdx = subparsers.add_parser('cdx', help="backfill a CDX file into postgresql cdx table") sub_cdx.set_defaults(func=run_cdx) sub_cdx.add_argument('cdx_file', - help="CDX file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_cdx.add_argument('--no-mimetype-filter', + help="CDX file to import from (or '-' for stdin)", + type=argparse.FileType('r')) + sub_cdx.add_argument( + '--no-mimetype-filter', action='store_true', help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)") - sub_grobid = subparsers.add_parser('grobid', - help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)") + sub_grobid = subparsers.add_parser( + 'grobid', help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_grobid.set_defaults(func=run_grobid) sub_grobid.add_argument('json_file', - help="grobid file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="grobid file to import from (or '-' for stdin)", + type=argparse.FileType('r')) sub_grobid.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") - sub_grobid.add_argument('--db-only', + action='store_true', + help="only upload TEI-XML to S3 (don't write to database)") + sub_grobid.add_argument( + '--db-only', action='store_true', help="only write status to sandcrawler-db (don't save TEI-XML to S3)") - sub_pdftext = subparsers.add_parser('pdftext', + sub_pdftext = subparsers.add_parser( + 'pdftext', help="backfill a pdftext JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_pdftext.set_defaults(func=run_pdftext) sub_pdftext.add_argument('json_file', - help="pdftext file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="pdftext file to import from (or '-' for stdin)", + type=argparse.FileType('r')) sub_pdftext.add_argument('--s3-only', - action='store_true', - help="only upload TEI-XML to S3 (don't write to database)") - sub_pdftext.add_argument('--db-only', + action='store_true', + help="only upload TEI-XML to S3 (don't write to database)") + sub_pdftext.add_argument( + '--db-only', action='store_true', help="only write status to sandcrawler-db (don't save TEI-XML to S3)") sub_grobid_disk = subparsers.add_parser('grobid-disk', - help="dump GRBOID output to (local) files on disk") + help="dump GRBOID output to (local) files on disk") sub_grobid_disk.set_defaults(func=run_grobid_disk) sub_grobid_disk.add_argument('json_file', - help="grobid file to import from (or '-' for stdin)", - type=argparse.FileType('r')) - sub_grobid_disk.add_argument('output_dir', - help="base directory to output into", - type=str) + help="grobid file to import from (or '-' for stdin)", + type=argparse.FileType('r')) + sub_grobid_disk.add_argument('output_dir', help="base directory to output into", type=str) - sub_pdftrio = subparsers.add_parser('pdftrio', + sub_pdftrio = subparsers.add_parser( + 'pdftrio', help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)") sub_pdftrio.set_defaults(func=run_pdftrio) sub_pdftrio.add_argument('json_file', - help="pdftrio file to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="pdftrio file to import from (or '-' for stdin)", + type=argparse.FileType('r')) - sub_ingest_file_result = subparsers.add_parser('ingest-file-result', - help="backfill a ingest_file_result JSON dump into postgresql") + sub_ingest_file_result = subparsers.add_parser( + 'ingest-file-result', help="backfill a ingest_file_result JSON dump into postgresql") sub_ingest_file_result.set_defaults(func=run_ingest_file_result) - sub_ingest_file_result.add_argument('json_file', + sub_ingest_file_result.add_argument( + 'json_file', help="ingest_file_result file to import from (or '-' for stdin)", type=argparse.FileType('r')) - sub_ingest_request = subparsers.add_parser('ingest-request', - help="backfill a ingest_request JSON dump into postgresql") + sub_ingest_request = subparsers.add_parser( + 'ingest-request', help="backfill a ingest_request JSON dump into postgresql") sub_ingest_request.set_defaults(func=run_ingest_request) sub_ingest_request.add_argument('json_file', - help="ingest_request to import from (or '-' for stdin)", - type=argparse.FileType('r')) + help="ingest_request to import from (or '-' for stdin)", + type=argparse.FileType('r')) args = parser.parse_args() if not args.__dict__.get("func"): @@ -207,5 +205,6 @@ def main(): args.func(args) + if __name__ == '__main__': main() -- cgit v1.2.3