diff options
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- | python/sandcrawler/grobid.py | 8 |
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 6dbed16..1f957da 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -11,6 +11,8 @@ from .ia import WaybackClient
 from .misc import gen_file_metadata, requests_retry_session
 from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
 
 def clean_crossref_unstructured(raw: str) -> str:
     """
@@ -86,6 +88,12 @@ class GrobidClient(object):
         """
         assert blob
 
+        if len(blob) > MAX_GROBID_BLOB_SIZE:
+            return {
+                "status": "blob-too-large",
+                "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+            }
+
         if consolidate_mode is None:
             consolidate_mode = self.consolidate_mode
         assert consolidate_mode is not None