aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-04-12 14:51:32 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-04-12 14:51:32 -0700
commit 433da53798b095188d3112aa3f4b509d92a3adec (patch)
tree 952833288c53886da382973c37d9d5a8d0ba5429
parent 03c1a78a47eae5ad7d864c3f8e22dfdff9f68934 (diff)
download sandcrawler-433da53798b095188d3112aa3f4b509d92a3adec.tar.gz
sandcrawler-433da53798b095188d3112aa3f4b509d92a3adec.zip
persist: skip very long URLs
-rw-r--r-- python/sandcrawler/persist.py 4
1 files changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 0fd54a4..a388b90 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -157,6 +157,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
if result['terminal_status_code']:
result['terminal_status_code'] = int(result['terminal_status_code'])
result['terminal_sha1hex'] = terminal.get('terminal_sha1hex')
+ if len(result['terminal_url']) > 2048:
+ # postgresql13 doesn't like extremely large URLs in b-tree index
+ self.counts['skip-huge-url'] += 1
+ return None
return result
def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]: