From 89b5f51e57d3a0cc043640262e396e28297e7c00 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 7 Dec 2021 19:44:53 -0800 Subject: grobid: set a maximum file size (256 MByte) --- python/sandcrawler/grobid.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py index 6dbed16..1f957da 100644 --- a/python/sandcrawler/grobid.py +++ b/python/sandcrawler/grobid.py @@ -11,6 +11,8 @@ from .ia import WaybackClient from .misc import gen_file_metadata, requests_retry_session from .workers import SandcrawlerFetchWorker, SandcrawlerWorker +MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte + def clean_crossref_unstructured(raw: str) -> str: """ @@ -86,6 +88,12 @@ class GrobidClient(object): """ assert blob + if len(blob) > MAX_GROBID_BLOB_SIZE: + return { + "status": "blob-too-large", + "error_msg": f"Not going to process very large file ({len(blob)} bytes)", + } + if consolidate_mode is None: consolidate_mode = self.consolidate_mode assert consolidate_mode is not None -- cgit v1.2.3