diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-30 11:40:40 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-30 11:40:40 -0800 |
commit | 44b91bed5a0fb3dab46fd0641691bb9e6b5b6078 (patch) | |
tree | a9c18c11b318f9fe2920e1f2621d10fbcf7d78fb | |
parent | 13cb25d3a49186645335db43468bbf6340404fe1 (diff) | |
download | fatcat-44b91bed5a0fb3dab46fd0641691bb9e6b5b6078.tar.gz fatcat-44b91bed5a0fb3dab46fd0641691bb9e6b5b6078.zip |
web ingest: terminal URL mismatch as skip, not assert
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 4 |
1 file changed, 3 insertions, 1 deletion
```diff
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 7a98775d..04ff8db6 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -437,7 +437,9 @@ class IngestWebResultImporter(IngestFileResultImporter):
         if 'revisit_cdx' in row:
             terminal_cdx = row['revisit_cdx']
         assert terminal_cdx['surt']
-        assert terminal_cdx['url'] == terminal['terminal_url']
+        if terminal_cdx['url'] != terminal['terminal_url']:
+            self.counts['skip-terminal-url-mismatch'] += 1
+            return None
 
         wc_cdx = []
         # primary resource first
```