diff options
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 17 |
1 files changed, 10 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 12e14e52..bb410b63 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -46,18 +46,21 @@ class IngestFileResultImporter(EntityImporter): The current logic is intentionally conservative as a first step. """ - if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200: - self.counts['skip-grobid'] += 1 + if row.get('hit') != True: + self.counts['skip-hit'] += 1 return False - if self.ingest_request_source_whitelist and row.get('ingest_request_source') not in self.ingest_request_source_whitelist: + if self.ingest_request_source_whitelist and row['request'].get('ingest_request_source') not in self.ingest_request_source_whitelist: self.counts['skip-ingest_request_source'] += 1 return False - if row.get('hit') == True and row.get('file_meta'): - return True - else: - self.counts['skip-hit'] += 1 + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False + if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: + self.counts['skip-grobid'] += 1 return False + return True + def parse_record(self, row): request = row['request'] |