diff options
-rw-r--r-- | python/sandcrawler/persist.py | 4 |
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 0fd54a4..a388b90 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -157,6 +157,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if result['terminal_status_code']: result['terminal_status_code'] = int(result['terminal_status_code']) result['terminal_sha1hex'] = terminal.get('terminal_sha1hex') + if len(result['terminal_url']) > 2048: + # postgresql13 doesn't like extremely large URLs in b-tree index + self.counts['skip-huge-url'] += 1 + return None return result def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]: |