aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-12-30 11:40:40 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-12-30 11:40:40 -0800
commit: 44b91bed5a0fb3dab46fd0641691bb9e6b5b6078 (patch)
tree: a9c18c11b318f9fe2920e1f2621d10fbcf7d78fb /python/fatcat_tools/importers
parent: 13cb25d3a49186645335db43468bbf6340404fe1 (diff)
download: fatcat-44b91bed5a0fb3dab46fd0641691bb9e6b5b6078.tar.gz
download: fatcat-44b91bed5a0fb3dab46fd0641691bb9e6b5b6078.zip
web ingest: terminal URL mismatch as skip, not assert
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- python/fatcat_tools/importers/ingest.py 4
1 files changed, 3 insertions, 1 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 7a98775d..04ff8db6 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
if 'revisit_cdx' in row:
terminal_cdx = row['revisit_cdx']
assert terminal_cdx['surt']
- assert terminal_cdx['url'] == terminal['terminal_url']
+ if terminal_cdx['url'] != terminal['terminal_url']:
+ self.counts['skip-terminal-url-mismatch'] += 1
+ return None
wc_cdx = []
# primary resource first