diff options
| -rw-r--r-- | python/example.env | 4 | ||||
| -rwxr-xr-x | python/persist_tool.py | 18 | ||||
| -rw-r--r-- | python/sandcrawler/minio.py | 4 | ||||
| -rw-r--r-- | python/sandcrawler/persist.py | 6 | ||||
| -rwxr-xr-x | python/sandcrawler_worker.py | 20 | 
5 files changed, 27 insertions, 25 deletions
diff --git a/python/example.env b/python/example.env index 4d3baa0..5064c96 100644 --- a/python/example.env +++ b/python/example.env @@ -1,5 +1,5 @@ -MINIO_ACCESS_KEY="minioadmin" -MINIO_SECRET_KEY="minioadmin" +SANDCRAWLER_BLOB_ACCESS_KEY="minioadmin" +SANDCRAWLER_BLOB_SECRET_KEY="minioadmin"  IA_ACCESS_KEY="dummy"  IA_SECRET_KEY="dummy"  CDX_AUTH_TOKEN="dummy" diff --git a/python/persist_tool.py b/python/persist_tool.py index 66e02aa..69e9374 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -1,7 +1,7 @@  #!/usr/bin/env python3  """ -Commands for backfilling content from bulk files into postgresql and s3 (minio). +Commands for backfilling content from bulk files into postgresql and s3 (seaweedfs).  Normally this is done by workers (in sandcrawler_worker.py) consuming from  Kafka feeds, but sometimes we have bulk processing output we want to backfill. @@ -120,16 +120,16 @@ def main():          help="postgresql database connection string",          default="postgres:///sandcrawler")      parser.add_argument('--s3-url', -        help="S3 (minio) backend URL", +        help="S3 (seaweedfs) backend URL",          default="localhost:9000")      parser.add_argument('--s3-access-key', -        help="S3 (minio) credential", -        default=os.environ.get('MINIO_ACCESS_KEY')) +        help="S3 (seaweedfs) credential", +        default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))      parser.add_argument('--s3-secret-key', -        help="S3 (minio) credential", -        default=os.environ.get('MINIO_SECRET_KEY')) +        help="S3 (seaweedfs) credential", +        default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))      parser.add_argument('--s3-bucket', -        help="S3 (minio) bucket to persist into", +        help="S3 (seaweedfs) bucket to persist into",          default="sandcrawler-dev")      subparsers = parser.add_subparsers() @@ -144,7 +144,7 @@ def main():          
help="ignore mimetype filtering; insert all content types (eg, assuming pre-filtered)")      sub_grobid = subparsers.add_parser('grobid', -        help="backfill a grobid JSON ('pg') dump into postgresql and s3 (minio)") +        help="backfill a grobid JSON ('pg') dump into postgresql and s3 (seaweedfs)")      sub_grobid.set_defaults(func=run_grobid)      sub_grobid.add_argument('json_file',          help="grobid file to import from (or '-' for stdin)", @@ -180,7 +180,7 @@ def main():          type=str)      sub_pdftrio = subparsers.add_parser('pdftrio', -        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (minio)") +        help="backfill a pdftrio JSON ('pg') dump into postgresql and s3 (seaweedfs)")      sub_pdftrio.set_defaults(func=run_pdftrio)      sub_pdftrio.add_argument('json_file',          help="pdftrio file to import from (or '-' for stdin)", diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index 8b02211..c7deea1 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -17,8 +17,8 @@ class SandcrawlerMinioClient(object):          Example config:              host="localhost:9000", -            access_key=os.environ['MINIO_ACCESS_KEY'], -            secret_key=os.environ['MINIO_SECRET_KEY'], +            access_key=os.environ['SANDCRAWLER_BLOB_ACCESS_KEY'], +            secret_key=os.environ['SANDCRAWLER_BLOB_SECRET_KEY'],          """          self.mc = minio.Minio(              host_url, diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index fbc5273..aa05195 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -452,9 +452,11 @@ class PersistPdfTextWorker(SandcrawlerWorker):  class PersistThumbnailWorker(SandcrawlerWorker):      """ -    Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL table. +    Pushes text file to blob store (S3/seaweed/minio) and PDF metadata to SQL +    table. 
-    This worker *must* be used with raw kakfa mode. +    This worker *must* be used with raw kafka mode; thumbnails are *not* +    wrapped in JSON like most sandcrawler kafka messages.      """      def __init__(self, **kwargs): diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py index a653771..537398e 100755 --- a/python/sandcrawler_worker.py +++ b/python/sandcrawler_worker.py @@ -3,7 +3,7 @@  """  These are generally for continuously running workers that consume from Kafka.  Outputs might either be pushed back into Kafka, or directly into sandcrawler-db -or minio. +or S3 (SeaweedFS).  """  import os @@ -242,16 +242,16 @@ def main():          help="postgresql database connection string",          default="postgres:///sandcrawler")      parser.add_argument('--s3-url', -        help="S3 (minio) backend URL", +        help="S3 (seaweedfs) backend URL",          default="localhost:9000")      parser.add_argument('--s3-access-key', -        help="S3 (minio) credential", -        default=os.environ.get('MINIO_ACCESS_KEY')) +        help="S3 (seaweedfs) credential", +        default=os.environ.get('SANDCRAWLER_BLOB_ACCESS_KEY') or os.environ.get('MINIO_ACCESS_KEY'))      parser.add_argument('--s3-secret-key', -        help="S3 (minio) credential", -        default=os.environ.get('MINIO_SECRET_KEY')) +        help="S3 (seaweedfs) credential", +        default=os.environ.get('SANDCRAWLER_BLOB_SECRET_KEY') or os.environ.get('MINIO_SECRET_KEY'))      parser.add_argument('--s3-bucket', -        help="S3 (minio) bucket to persist into", +        help="S3 (seaweedfs) bucket to persist into",          default="sandcrawler-dev")      subparsers = parser.add_subparsers() @@ -264,7 +264,7 @@ def main():      sub_pdf_extract.set_defaults(func=run_pdf_extract)      sub_persist_grobid = subparsers.add_parser('persist-grobid', -        help="daemon that consumes GROBID output from Kafka and pushes to minio and postgres") +        help="daemon that consumes GROBID 
output from Kafka and pushes to S3 (seaweedfs) and postgres")      sub_persist_grobid.add_argument('--s3-only',          action='store_true',          help="only upload TEI-XML to S3 (don't write to database)") @@ -274,7 +274,7 @@ def main():      sub_persist_grobid.set_defaults(func=run_persist_grobid)      sub_persist_pdftext = subparsers.add_parser('persist-pdftext', -        help="daemon that consumes pdftext output from Kafka and pushes to minio and postgres") +        help="daemon that consumes pdftext output from Kafka and pushes to S3 (seaweedfs) and postgres")      sub_persist_pdftext.add_argument('--s3-only',          action='store_true',          help="only upload TEI-XML to S3 (don't write to database)") @@ -284,7 +284,7 @@ def main():      sub_persist_pdftext.set_defaults(func=run_persist_pdftext)      sub_persist_thumbnail = subparsers.add_parser('persist-thumbnail', -        help="daemon that consumes thumbnail output from Kafka and pushes to minio and postgres") +        help="daemon that consumes thumbnail output from Kafka and pushes to S3 (seaweedfs) and postgres")      sub_persist_thumbnail.set_defaults(func=run_persist_thumbnail)      sub_persist_pdftrio = subparsers.add_parser('persist-pdftrio',  | 
