diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-01 15:11:38 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-01 15:11:40 -0700 | 
| commit | 6e0736cebcb2b1e5ddbae03127572ad9d1ffca49 (patch) | |
| tree | 03f797b0ffd5c6ca8c627bfeca62dfb574eccaee /python | |
| parent | dd219464cfc90b9b469fd851b48b08668ff17ba8 (diff) | |
| download | fatcat-6e0736cebcb2b1e5ddbae03127572ad9d1ffca49.tar.gz fatcat-6e0736cebcb2b1e5ddbae03127572ad9d1ffca49.zip | |
ingest importer behavior tweaks
- change order of 'want()' checks, so that result counts are clearer
- don't require GROBID success for Save Paper Now (SPN) file imports
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/importers/ingest.py | 16 | 
1 file changed, 8 insertions, 8 deletions
```diff
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index ae3e147a..fc02058b 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -324,7 +324,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
         eg_extra = kwargs.pop('editgroup_extra', dict())
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileSavePaperNow')
         kwargs['submit_mode'] = submit_mode
-        kwargs['require_grobid'] = True
+        kwargs['require_grobid'] = False
         kwargs['do_updates'] = False
         super().__init__(api,
             editgroup_description=eg_desc,
@@ -333,9 +333,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
 
     def want(self, row):
 
-        if not self.want_file(row):
-            return False
-
         source = row['request'].get('ingest_request_source')
         if not source:
             self.counts['skip-ingest_request_source'] += 1
@@ -343,10 +340,14 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
         if not source.startswith('savepapernow'):
             self.counts['skip-not-savepapernow'] += 1
             return False
+
         if row.get('hit') != True:
             self.counts['skip-hit'] += 1
             return False
 
+        if not self.want_file(row):
+            return False
+
         return True
 
     def insert_batch(self, batch):
@@ -390,14 +391,13 @@ class IngestWebResultImporter(IngestFileResultImporter):
         if not self.want_ingest(row):
             return False
 
-        if not row.get('file_meta'):
-            self.counts['skip-file-meta'] += 1
-            return False
-
         # webcapture-specific filters
         if row['request'].get('ingest_type') != 'html':
             self.counts['skip-ingest-type'] += 1
             return False
+        if not row.get('file_meta'):
+            self.counts['skip-file-meta'] += 1
+            return False
         if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
             self.counts['skip-mimetype'] += 1
             return False
```
