diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 20:39:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-20 20:39:02 -0700 |
commit | 5a508d61daf23a4bfa337c4229bbb6795b69fbd2 (patch) | |
tree | 3a8e744411c2db215d666cd600d4679c5a16e9a9 /fatcat_scholar/work_pipeline.py | |
parent | c71314e46dcf18905d1957579a211bb47c520d57 (diff) | |
download | fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.tar.gz fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.zip |
fixes from manual testing
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- | fatcat_scholar/work_pipeline.py | 21 |
1 file changed, 13 insertions, 8 deletions
```diff
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index c93cb29..9ce72b1 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -2,6 +2,7 @@
 import os
 import io
 import sys
+import minio
 import argparse
 from pydantic import BaseModel, validator
 from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -88,14 +89,17 @@ class WorkPipeline():
         if not grobid_meta or grobid_meta['status'] != 'success':
             return None
         #print(grobid_meta)
-        grobid_xml = self.sandcrawler_s3_client.get_blob(
-            folder="grobid",
-            sha1hex=fe.sha1,
-            extension=".tei.xml",
-            prefix="",
-            bucket="sandcrawler",
-        )
-        #print(grobid_xml)
+        try:
+            grobid_xml = self.sandcrawler_s3_client.get_blob(
+                folder="grobid",
+                sha1hex=fe.sha1,
+                extension=".tei.xml",
+                prefix="",
+                bucket="sandcrawler",
+            )
+            #print(grobid_xml)
+        except minio.error.NoSuchKey:
+            return None
         return dict(
             tei_xml=grobid_xml,
             release_ident=release_ident,
@@ -338,6 +342,7 @@ def main():
             access_key=os.environ.get('MINIO_ACCESS_KEY'),
             secret_key=os.environ.get('MINIO_SECRET_KEY'),
         ),
+        fulltext_cache_dir=args.fulltext_cache_dir,
     )
 
     if args.func == 'run_releases':
```