diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-09-28 18:06:48 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-09-28 18:06:51 -0700 |
commit | a04468041cd81ad90aa76ec15788a5ffacb6eec2 (patch) | |
tree | d7d1670341833711d2b227f6b61d0f7a8b7ad58a /python | |
parent | ca5d6ac9145f962aa4d52b1ab060f918415b3a57 (diff) | |
download | sandcrawler-a04468041cd81ad90aa76ec15788a5ffacb6eec2.tar.gz sandcrawler-a04468041cd81ad90aa76ec15788a5ffacb6eec2.zip |
persist: skip huge URLs
and fix some minor doc typos
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/persist.py | 4 |
1 file changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index d753380..f682572 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -101,6 +101,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
         if raw["ingest_type"] not in ("pdf", "xml", "html"):
             self.counts["skip-ingest-type"] += 1
             return None
+        # limit on base_url length
+        if len(raw["base_url"]) > 1500:
+            self.counts["skip-url-too-long"] += 1
+            return None
         request = {
             "ingest_type": raw["ingest_type"],
             "base_url": raw["base_url"],