From 8627c8cb97b0960315cd9b308a4b39bb2febf558 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 15 Nov 2019 17:49:05 -0800 Subject: re-order ingest want() for better stats --- python/fatcat_tools/importers/ingest.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'python/fatcat_tools/importers') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index 12e14e52..bb410b63 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -46,18 +46,21 @@ class IngestFileResultImporter(EntityImporter): The current logic is intentionally conservative as a first step. """ - if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200: - self.counts['skip-grobid'] += 1 + if row.get('hit') != True: + self.counts['skip-hit'] += 1 return False - if self.ingest_request_source_whitelist and row.get('ingest_request_source') not in self.ingest_request_source_whitelist: + if self.ingest_request_source_whitelist and row['request'].get('ingest_request_source') not in self.ingest_request_source_whitelist: self.counts['skip-ingest_request_source'] += 1 return False - if row.get('hit') == True and row.get('file_meta'): - return True - else: - self.counts['skip-hit'] += 1 + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False + if self.require_grobid and row.get('grobid', {}).get('status_code') != 200: + self.counts['skip-grobid'] += 1 return False + return True + def parse_record(self, row): request = row['request'] -- cgit v1.2.3