aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2021-10-01 15:11:38 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2021-10-01 15:11:40 -0700
commit 6e0736cebcb2b1e5ddbae03127572ad9d1ffca49 (patch)
tree 03f797b0ffd5c6ca8c627bfeca62dfb574eccaee /python
parent dd219464cfc90b9b469fd851b48b08668ff17ba8 (diff)
download fatcat-6e0736cebcb2b1e5ddbae03127572ad9d1ffca49.tar.gz
fatcat-6e0736cebcb2b1e5ddbae03127572ad9d1ffca49.zip
ingest importer behavior tweaks

- change order of 'want()' checks, so that result counts are clearer
- don't require GROBID success for file imports with SPN
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/importers/ingest.py 16
1 files changed, 8 insertions, 8 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index ae3e147a..fc02058b 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -324,7 +324,7 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
eg_extra = kwargs.pop('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileSavePaperNow')
kwargs['submit_mode'] = submit_mode
- kwargs['require_grobid'] = True
+ kwargs['require_grobid'] = False
kwargs['do_updates'] = False
super().__init__(api,
editgroup_description=eg_desc,
@@ -333,9 +333,6 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def want(self, row):
- if not self.want_file(row):
- return False
-
source = row['request'].get('ingest_request_source')
if not source:
self.counts['skip-ingest_request_source'] += 1
@@ -343,10 +340,14 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
if not source.startswith('savepapernow'):
self.counts['skip-not-savepapernow'] += 1
return False
+
if row.get('hit') != True:
self.counts['skip-hit'] += 1
return False
+ if not self.want_file(row):
+ return False
+
return True
def insert_batch(self, batch):
@@ -390,14 +391,13 @@ class IngestWebResultImporter(IngestFileResultImporter):
if not self.want_ingest(row):
return False
- if not row.get('file_meta'):
- self.counts['skip-file-meta'] += 1
- return False
-
# webcapture-specific filters
if row['request'].get('ingest_type') != 'html':
self.counts['skip-ingest-type'] += 1
return False
+ if not row.get('file_meta'):
+ self.counts['skip-file-meta'] += 1
+ return False
if row['file_meta'].get('mimetype') not in ("text/html", "application/xhtml+xml"):
self.counts['skip-mimetype'] += 1
return False