aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2019-12-12 10:15:05 -0800
committerBryan Newbold <bnewbold@robocracy.org>2019-12-12 17:50:39 -0800
commit9be73bd3b5323cb5a1ac3b63c392d343c18a5a8c (patch)
tree7b11e326c9be7667b3df676ba4295a1017a6df66 /python/fatcat_tools/importers
parent12d1df8d0e0c55bd71f5348d10836c135a0d6210 (diff)
downloadfatcat-9be73bd3b5323cb5a1ac3b63c392d343c18a5a8c.tar.gz
fatcat-9be73bd3b5323cb5a1ac3b63c392d343c18a5a8c.zip
savepapernow result importer
Based on ingest-file-results importer
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--python/fatcat_tools/importers/__init__.py2
-rw-r--r--python/fatcat_tools/importers/ingest.py67
2 files changed, 65 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index 025a111c..bb9c5b17 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -26,4 +26,4 @@ from .orcid import OrcidImporter
from .arabesque import ArabesqueMatchImporter, ARABESQUE_MATCH_WHERE_CLAUSE
from .wayback_static import auto_wayback_static
from .cdl_dash_dat import auto_cdl_dash_dat
-from .ingest import IngestFileResultImporter
+from .ingest import IngestFileResultImporter, SavePaperNowFileImporter
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index deb4ef51..e5484048 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -11,8 +11,7 @@ class IngestFileResultImporter(EntityImporter):
def __init__(self, api, require_grobid=True, **kwargs):
- eg_desc = kwargs.pop('editgroup_description',
- "Files crawled from web using sandcrawler ingest tool")
+ eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
eg_extra = kwargs.pop('editgroup_extra', dict())
eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
super().__init__(api,
@@ -53,9 +52,14 @@ class IngestFileResultImporter(EntityImporter):
if row.get('hit') != True:
self.counts['skip-hit'] += 1
return False
- if self.ingest_request_source_whitelist and row['request'].get('ingest_request_source') not in self.ingest_request_source_whitelist:
+ source = row['request'].get('ingest_request_source')
+ if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
+ if source.startswith('savepapernow'):
+ # never process async savepapernow requests
+ self.counts['skip-savepapernow'] += 1
+ return False
if not row.get('file_meta'):
self.counts['skip-file-meta'] += 1
return False
@@ -167,3 +171,60 @@ class IngestFileResultImporter(EntityImporter):
extra=self.editgroup_extra),
entity_list=batch))
+
+class SavePaperNowFileImporter(IngestFileResultImporter):
+ """
+ This worker ingests from the same feed as IngestFileResultImporter, but
+ only imports files from anonymous save-paper-now requests, and "submits"
+ them for further human review (as opposed to accepting by default).
+ """
+
+ def __init__(self, api, submit_mode=True, **kwargs):
+
+ eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request"
+ eg_extra = kwargs.pop('editgroup_extra', dict())
+ eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileSavePaperNow')
+ kwargs['submit_mode'] = submit_mode
+ kwargs['require_grobid'] = True
+ kwargs['do_updates'] = False
+ super().__init__(api,
+ editgroup_description=eg_desc,
+ editgroup_extra=eg_extra,
+ **kwargs)
+
+ def want(self, row):
+
+ source = row['request'].get('ingest_request_source')
+ if not source.startswith('savepapernow'):
+ self.counts['skip-not-savepapernow'] += 1
+ return False
+ if row.get('hit') != True:
+ self.counts['skip-hit'] += 1
+ return False
+ if not row.get('file_meta'):
+ self.counts['skip-file-meta'] += 1
+ return False
+ if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
+ self.counts['skip-grobid'] += 1
+ return False
+
+ return True
+
+ def insert_batch(self, batch):
+ """
+ Usually running in submit_mode, so we can't use auto_batch method
+ """
+ if self.submit_mode:
+ eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra))
+ for fe in batch:
+ self.api.create_file(eg.editgroup_id, fe)
+ self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
+ else:
+ self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+ editgroup=fatcat_openapi_client.Editgroup(
+ description=self.editgroup_description,
+ extra=self.editgroup_extra),
+ entity_list=batch))
+