diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-01 15:11:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-01 15:11:40 -0700 |
commit | 6e0736cebcb2b1e5ddbae03127572ad9d1ffca49 (patch) | |
tree | 03f797b0ffd5c6ca8c627bfeca62dfb574eccaee /python/fatcat_tools | |
parent | dd219464cfc90b9b469fd851b48b08668ff17ba8 (diff) | |
download | fatcat-6e0736cebcb2b1e5ddbae03127572ad9d1ffca49.tar.gz fatcat-6e0736cebcb2b1e5ddbae03127572ad9d1ffca49.zip |
ingest importer behavior tweaks
- change order of 'want()' checks, so that result counts are clearer
- don't require GROBID success for file imports with SPN (Save Paper Now)
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 16 |
1 file changed, 8 insertions, 8 deletions
```diff
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index ae3e147a..fc02058b 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -324,7 +324,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
         eg_extra = kwargs.pop('editgroup_extra', dict())
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileSavePaperNow')
         kwargs['submit_mode'] = submit_mode
-        kwargs['require_grobid'] = True
+        kwargs['require_grobid'] = False
         kwargs['do_updates'] = False
         super().__init__(api,
             editgroup_description=eg_desc,
@@ -333,9 +333,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
 
     def want(self, row):
 
-        if not self.want_file(row):
-            return False
-
         source = row['request'].get('ingest_request_source')
         if not source:
             self.counts['skip-ingest_request_source'] += 1
@@ -343,10 +340,14 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
         if not source.startswith('savepapernow'):
             self.counts['skip-not-savepapernow'] += 1
             return False
+
         if row.get('hit') != True:
             self.counts['skip-hit'] += 1
             return False
 
+        if not self.want_file(row):
+            return False
+
         return True
 
     def insert_batch(self, batch):
@@ -390,14 +391,13 @@ class IngestWebResultImporter(IngestFileResultImporter):
         if not self.want_ingest(row):
             return False
 
-        if not row.get('file_meta'):
-            self.counts['skip-file-meta'] += 1
-            return False
-
         # webcapture-specific filters
         if row['request'].get('ingest_type') != 'html':
             self.counts['skip-ingest-type'] += 1
             return False
+        if not row.get('file_meta'):
+            self.counts['skip-file-meta'] += 1
+            return False
         if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
             self.counts['skip-mimetype'] += 1
             return False
```