about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:02:32 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:02:32 -0800
commit: 7cc0b6fd1a7b9e3b9bb2c8ee85e319a7aae9c5e8 (patch)
tree: f5c762dc61917cb60b01810a2cef7779dc52a7f2
parent: f31f6ba5917403b85892c7345a08b8de93dae501 (diff)
download: sandcrawler-7cc0b6fd1a7b9e3b9bb2c8ee85e319a7aae9c5e8.tar.gz
download: sandcrawler-7cc0b6fd1a7b9e3b9bb2c8ee85e319a7aae9c5e8.zip
disable CDX best lookup 'collapse'; leave comment
-rw-r--r-- python/sandcrawler/ia.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 3c1d2f9..096d5d4 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -168,7 +168,9 @@ class CdxApiClient:
'matchType': 'exact',
'limit': -25,
'output': 'json',
- 'collapse': 'timestamp:6',
+ # Collapsing seems efficient, but is complex; would need to include
+ # other filters and status code in filter
+ #'collapse': 'timestamp:6',
'filter': '!mimetype:warc/revisit',
}
if max_age_days: