summary | refs | log | tree | commit | diff | stats
path: root/fatcat_scholar/work_pipeline.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-05-20 20:39:02 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-20 20:39:02 -0700
commit: 5a508d61daf23a4bfa337c4229bbb6795b69fbd2 (patch)
tree: 3a8e744411c2db215d666cd600d4679c5a16e9a9 /fatcat_scholar/work_pipeline.py
parent: c71314e46dcf18905d1957579a211bb47c520d57 (diff)
download: fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.tar.gz
download: fatcat-scholar-5a508d61daf23a4bfa337c4229bbb6795b69fbd2.zip
fixes from manual testing
Diffstat (limited to 'fatcat_scholar/work_pipeline.py')
-rw-r--r-- fatcat_scholar/work_pipeline.py | 21
1 files changed, 13 insertions, 8 deletions
diff --git a/fatcat_scholar/work_pipeline.py b/fatcat_scholar/work_pipeline.py
index c93cb29..9ce72b1 100644
--- a/fatcat_scholar/work_pipeline.py
+++ b/fatcat_scholar/work_pipeline.py
@@ -2,6 +2,7 @@
import os
import io
import sys
+import minio
import argparse
from pydantic import BaseModel, validator
from typing import List, Dict, Tuple, Optional, Any, Sequence
@@ -88,14 +89,17 @@ class WorkPipeline():
if not grobid_meta or grobid_meta['status'] != 'success':
return None
#print(grobid_meta)
- grobid_xml = self.sandcrawler_s3_client.get_blob(
- folder="grobid",
- sha1hex=fe.sha1,
- extension=".tei.xml",
- prefix="",
- bucket="sandcrawler",
- )
- #print(grobid_xml)
+ try:
+ grobid_xml = self.sandcrawler_s3_client.get_blob(
+ folder="grobid",
+ sha1hex=fe.sha1,
+ extension=".tei.xml",
+ prefix="",
+ bucket="sandcrawler",
+ )
+ #print(grobid_xml)
+ except minio.error.NoSuchKey:
+ return None
return dict(
tei_xml=grobid_xml,
release_ident=release_ident,
@@ -338,6 +342,7 @@ def main():
access_key=os.environ.get('MINIO_ACCESS_KEY'),
secret_key=os.environ.get('MINIO_SECRET_KEY'),
),
+ fulltext_cache_dir=args.fulltext_cache_dir,
)
if args.func == 'run_releases':