about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-09-28 18:06:48 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-09-28 18:06:51 -0700
commit a04468041cd81ad90aa76ec15788a5ffacb6eec2 (patch)
tree d7d1670341833711d2b227f6b61d0f7a8b7ad58a /python
parent ca5d6ac9145f962aa4d52b1ab060f918415b3a57 (diff)
download sandcrawler-a04468041cd81ad90aa76ec15788a5ffacb6eec2.tar.gz
sandcrawler-a04468041cd81ad90aa76ec15788a5ffacb6eec2.zip
persist: skip huge URLs
and fix some minor doc typos
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/persist.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d753380..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -101,6 +101,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if raw["ingest_type"] not in ("pdf", "xml", "html"):
self.counts["skip-ingest-type"] += 1
return None
+ # limit on base_url length
+ if len(raw["base_url"]) > 1500:
+ self.counts["skip-url-too-long"] += 1
+ return None
request = {
"ingest_type": raw["ingest_type"],
"base_url": raw["base_url"],