diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-15 14:13:34 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-15 14:13:34 -0800 |
commit | 689da76d1c759d6368d760b4a1fa942e16095a40 (patch) | |
tree | ca78cd6841875b3c7d55d046b3c7a206e604b60f /python/fatcat_tools/importers | |
parent | a02d51650bb5a3165ec89e822f43ff98807d01c3 (diff) | |
download | fatcat-689da76d1c759d6368d760b4a1fa942e16095a40.tar.gz fatcat-689da76d1c759d6368d760b4a1fa942e16095a40.zip |
ingest: improve tests, support old ingest results
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 15 |
1 files changed, 12 insertions, 3 deletions
```diff
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 3d391bd8..82a33aaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -112,9 +112,18 @@ class IngestFileResultImporter(EntityImporter):
 
         terminal = row.get('terminal')
         if not terminal:
-            # TODO: support archive.org hits?
-            self.counts['skip-no-terminal'] += 1
-            return None
+            # support old cdx-only ingest results
+            cdx = row.get('cdx')
+            if not cdx:
+                # TODO: support archive.org hits?
+                self.counts['skip-no-terminal'] += 1
+                return None
+            else:
+                terminal = {
+                    'terminal_url': cdx['url'],
+                    'terminal_dt': cdx['datetime'],
+                    'terminal_status_code': cdx.get('status_code') or cdx.get('http_status'),
+                }
 
         # work around old schema
         if not 'terminal_url' in terminal:
```