From a04468041cd81ad90aa76ec15788a5ffacb6eec2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 28 Sep 2022 18:06:48 -0700
Subject: persist: skip huge URLs and fix some minor doc typos

---
 python/sandcrawler/persist.py | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d753380..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -101,6 +101,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if raw["ingest_type"] not in ("pdf", "xml", "html"):
             self.counts["skip-ingest-type"] += 1
             return None
+        # limit on base_url length
+        if len(raw["base_url"]) > 1500:
+            self.counts["skip-url-too-long"] += 1
+            return None
         request = {
             "ingest_type": raw["ingest_type"],
             "base_url": raw["base_url"],
-- 
cgit v1.2.3