aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/persist.py | 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d753380..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -101,6 +101,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if raw["ingest_type"] not in ("pdf", "xml", "html"):
self.counts["skip-ingest-type"] += 1
return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
+ return None
request = {
"ingest_type": raw["ingest_type"],
"base_url": raw["base_url"],