diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-15 17:49:05 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-11-15 17:49:05 -0800 |
commit | 8627c8cb97b0960315cd9b308a4b39bb2febf558 (patch) | |
tree | b995c2e0d5be65d7cffd28fd3399f4ec979202d3 | |
parent | 4693394d69667570a81126ea727e9ad0ed8e1582 (diff) | |
download | fatcat-8627c8cb97b0960315cd9b308a4b39bb2febf558.tar.gz fatcat-8627c8cb97b0960315cd9b308a4b39bb2febf558.zip |
re-order ingest want() for better stats
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 17 |
1 files changed, 10 insertions, 7 deletions
```diff
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 12e14e52..bb410b63 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -46,18 +46,21 @@ class IngestFileResultImporter(EntityImporter):
 
         The current logic is intentionally conservative as a first step.
         """
-        if self.require_grobid and not row.get('grobid', {}).get('status_code') == 200:
-            self.counts['skip-grobid'] += 1
+        if row.get('hit') != True:
+            self.counts['skip-hit'] += 1
             return False
-        if self.ingest_request_source_whitelist and row.get('ingest_request_source') not in self.ingest_request_source_whitelist:
+        if self.ingest_request_source_whitelist and row['request'].get('ingest_request_source') not in self.ingest_request_source_whitelist:
             self.counts['skip-ingest_request_source'] += 1
             return False
-        if row.get('hit') == True and row.get('file_meta'):
-            return True
-        else:
-            self.counts['skip-hit'] += 1
+        if not row.get('file_meta'):
+            self.counts['skip-file-meta'] += 1
+            return False
+        if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
+            self.counts['skip-grobid'] += 1
             return False
 
+        return True
+
     def parse_record(self, row):
 
         request = row['request']
```