summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2019-11-15 17:49:05 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2019-11-15 17:49:05 -0800
commit: 8627c8cb97b0960315cd9b308a4b39bb2febf558 (patch)
tree: b995c2e0d5be65d7cffd28fd3399f4ec979202d3
parent: 4693394d69667570a81126ea727e9ad0ed8e1582 (diff)
download: fatcat-8627c8cb97b0960315cd9b308a4b39bb2febf558.tar.gz
download: fatcat-8627c8cb97b0960315cd9b308a4b39bb2febf558.zip
re-order ingest want() for better stats
-rw-r--r-- python/fatcat_tools/importers/ingest.py | 17
1 files changed, 10 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 12e14e52..bb410b63 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -46,18 +46,21 @@ class IngestFileResultImporter(EntityImporter):
The current logic is intentionally conservative as a first step.
"""
- if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200:
- self.counts['skip-grobid'] += 1
+ if row.get('hit') != True:
+ self.counts['skip-hit'] += 1
return False
- if self.ingest_request_source_whitelist and row.get('ingest_request_source') not in self.ingest_request_source_whitelist:
+ if self.ingest_request_source_whitelist and row['request'].get('ingest_request_source') not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
- if row.get('hit') == True and row.get('file_meta'):
- return True
- else:
- self.counts['skip-hit'] += 1
+ if not row.get('file_meta'):
+ self.counts['skip-file-meta'] += 1
+ return False
+ if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
+ self.counts['skip-grobid'] += 1
return False
+ return True
+
def parse_record(self, row):
request = row['request']