diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-04-12 14:51:32 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-04-12 14:51:32 -0700 |
commit | 433da53798b095188d3112aa3f4b509d92a3adec (patch) | |
tree | 952833288c53886da382973c37d9d5a8d0ba5429 | |
parent | 03c1a78a47eae5ad7d864c3f8e22dfdff9f68934 (diff) | |
download | sandcrawler-433da53798b095188d3112aa3f4b509d92a3adec.tar.gz sandcrawler-433da53798b095188d3112aa3f4b509d92a3adec.zip |
persist: skip very long URLs
-rw-r--r-- | python/sandcrawler/persist.py | 4 |
1 file changed, 4 insertions(+), 0 deletions(-)
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 0fd54a4..a388b90 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -157,6 +157,10 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): if result['terminal_status_code']: result['terminal_status_code'] = int(result['terminal_status_code']) result['terminal_sha1hex'] = terminal.get('terminal_sha1hex') + if len(result['terminal_url']) > 2048: + # postgresql13 doesn't like extremely large URLs in b-tree index + self.counts['skip-huge-url'] += 1 + return None return result def result_to_html_meta(self, record: dict) -> Optional[HtmlMetaRow]: |