about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-12-07 19:44:53 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-12-07 19:44:53 -0800
commit 89b5f51e57d3a0cc043640262e396e28297e7c00 (patch)
tree adc729b2255eafc58de16f3c91c4f4ea2608ad09
parent 833f9bb5181419ca9f5af0f9ba0e2e047ee164d4 (diff)
download sandcrawler-89b5f51e57d3a0cc043640262e396e28297e7c00.tar.gz
sandcrawler-89b5f51e57d3a0cc043640262e396e28297e7c00.zip
grobid: set a maximum file size (256 MByte)
-rw-r--r-- python/sandcrawler/grobid.py 8
1 files changed, 8 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 6dbed16..1f957da 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -11,6 +11,8 @@ from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024 # ~256 MByte
+
def clean_crossref_unstructured(raw: str) -> str:
"""
@@ -86,6 +88,12 @@ class GrobidClient(object):
"""
assert blob
+ if len(blob) > MAX_GROBID_BLOB_SIZE:
+ return {
+ "status": "blob-too-large",
+ "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+ }
+
if consolidate_mode is None:
consolidate_mode = self.consolidate_mode
assert consolidate_mode is not None