aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--python/sandcrawler/grobid.py8
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 6dbed16..1f957da 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -11,6 +11,8 @@ from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
def clean_crossref_unstructured(raw: str) -> str:
"""
@@ -86,6 +88,12 @@ class GrobidClient(object):
"""
assert blob
+ if len(blob) > MAX_GROBID_BLOB_SIZE:
+ return {
+ "status": "blob-too-large",
+ "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+ }
+
if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
assert consolidate_mode is not None