about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-04-27 17:05:56 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-04-27 17:05:59 -0700
commit: 3e6ec2fbf71518397353d7ea25bb474ba2278a58 (patch)
tree: 23c057cc7552521f9c5bd7bd5531135c525d90a4
parent: 433da53798b095188d3112aa3f4b509d92a3adec (diff)
download: sandcrawler-3e6ec2fbf71518397353d7ea25bb474ba2278a58.tar.gz
download: sandcrawler-3e6ec2fbf71518397353d7ea25bb474ba2278a58.zip
ingest: cap max body size to ~128 MByte
Should resolve 'magic' OOM errors in production.
-rw-r--r-- python/sandcrawler/ingest.py 6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index abcc156..eb8e256 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -26,6 +26,8 @@ from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
+MAX_BODY_SIZE_BYTES = 128*1024*1024
+
class IngestFileWorker(SandcrawlerWorker):
"""
High level flow is to look in history first, then go to live web if
@@ -576,6 +578,10 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = 'null-body'
return result
+ if len(resource.body) > MAX_BODY_SIZE_BYTES:
+ result['status'] = 'body-too-large'
+ return result
+
file_meta = gen_file_metadata(resource.body)
try:
file_meta, resource = fix_transfer_encoding(file_meta, resource)