author | Bryan Newbold <bnewbold@archive.org> | 2021-12-07 19:44:53 -0800
committer | Bryan Newbold <bnewbold@archive.org> | 2021-12-07 19:44:53 -0800
commit | 89b5f51e57d3a0cc043640262e396e28297e7c00
tree | adc729b2255eafc58de16f3c91c4f4ea2608ad09
parent | 833f9bb5181419ca9f5af0f9ba0e2e047ee164d4
grobid: set a maximum file size (256 MByte)
-rw-r--r-- | python/sandcrawler/grobid.py | 8 |
1 file changed, 8 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 6dbed16..1f957da 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -11,6 +11,8 @@ from .ia import WaybackClient
 from .misc import gen_file_metadata, requests_retry_session
 from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
+MAX_GROBID_BLOB_SIZE: int = 256 * 1024 * 1024  # ~256 MByte
+
 
 def clean_crossref_unstructured(raw: str) -> str:
     """
@@ -86,6 +88,12 @@ class GrobidClient(object):
         """
         assert blob
 
+        if len(blob) > MAX_GROBID_BLOB_SIZE:
+            return {
+                "status": "blob-too-large",
+                "error_msg": f"Not going to process very large file ({len(blob)} bytes)",
+            }
+
         if consolidate_mode is None:
             consolidate_mode = self.consolidate_mode
         assert consolidate_mode is not None
```
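The guard rejects oversized inputs locally, before any request reaches the GROBID service. Below is a minimal sketch of how a caller might hit and handle the new status; the method name `process_fulltext`, the `host_url` constructor argument, and the oversized test blob are illustrative assumptions based on the surrounding context, not guaranteed by this commit.

```python
# Sketch only: assumes the guarded method is GrobidClient.process_fulltext()
# and that GrobidClient accepts a host_url argument (both inferred from
# context, not confirmed by this diff).
from sandcrawler.grobid import MAX_GROBID_BLOB_SIZE, GrobidClient

client = GrobidClient(host_url="http://localhost:8070")

# A blob one byte over the 256 MByte limit; note this allocates ~256 MB of RAM.
oversized = b"\x00" * (MAX_GROBID_BLOB_SIZE + 1)

result = client.process_fulltext(oversized)
if result["status"] == "blob-too-large":
    # error_msg carries the byte count, e.g.
    # "Not going to process very large file (268435457 bytes)"
    print(result["error_msg"])
```

Returning a structured status dict rather than raising appears consistent with how other non-success outcomes are reported by the client, so downstream workers can record the failure without special-casing oversized files.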