From 6e0736cebcb2b1e5ddbae03127572ad9d1ffca49 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 1 Oct 2021 15:11:38 -0700 Subject: ingest importer behavior tweaks - change order of 'want()' checks, so that result counts are clearer - don't require GROBID success for file imports with SPN --- python/fatcat_tools/importers/ingest.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'python') diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index ae3e147a..fc02058b 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -324,7 +324,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter): eg_extra = kwargs.pop('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileSavePaperNow') kwargs['submit_mode'] = submit_mode - kwargs['require_grobid'] = True + kwargs['require_grobid'] = False kwargs['do_updates'] = False super().__init__(api, editgroup_description=eg_desc, @@ -333,9 +333,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def want(self, row): - if not self.want_file(row): - return False - source = row['request'].get('ingest_request_source') if not source: self.counts['skip-ingest_request_source'] += 1 @@ -343,10 +340,14 @@ class SavePaperNowFileImporter(IngestFileResultImporter): if not source.startswith('savepapernow'): self.counts['skip-not-savepapernow'] += 1 return False + if row.get('hit') != True: self.counts['skip-hit'] += 1 return False + if not self.want_file(row): + return False + return True def insert_batch(self, batch): @@ -390,14 +391,13 @@ class IngestWebResultImporter(IngestFileResultImporter): if not self.want_ingest(row): return False - if not row.get('file_meta'): - self.counts['skip-file-meta'] += 1 - return False - # webcapture-specific filters if row['request'].get('ingest_type') != 'html': self.counts['skip-ingest-type'] += 1 return False + if not row.get('file_meta'): + self.counts['skip-file-meta'] += 1 + return False if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"): self.counts['skip-mimetype'] += 1 return False -- cgit v1.2.3