From 433da53798b095188d3112aa3f4b509d92a3adec Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 12 Apr 2021 14:51:32 -0700
Subject: persist: skip very long URLs

---
 python/sandcrawler/persist.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 0fd54a4..a388b90 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -157,6 +157,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
             if result['terminal_status_code']:
                 result['terminal_status_code'] = int(result['terminal_status_code'])
             result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
+            if len(result['terminal_url']) > 2048:
+                # postgresql13 doesn't like extremely large URLs in b-tree index
+                self.counts['skip-huge-url'] += 1
+                return None
         return result
 
     def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]:
-- 
cgit v1.2.3