diff options
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 85 |
1 files changed, 78 insertions, 7 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index deb4ef51..c47f0aa7 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -11,8 +11,7 @@ class IngestFileResultImporter(EntityImporter):
 
     def __init__(self, api, require_grobid=True, **kwargs):
 
-        eg_desc = kwargs.pop('editgroup_description',
-            "Files crawled from web using sandcrawler ingest tool")
+        eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled from web using sandcrawler ingest tool"
         eg_extra = kwargs.pop('editgroup_extra', dict())
         eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileResultImporter')
         super().__init__(api,
@@ -21,7 +20,6 @@ class IngestFileResultImporter(EntityImporter):
             **kwargs)
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
-        self.default_mimetype = kwargs.get("default_mimetype", None)
         self.do_updates = kwargs.get("do_updates", False)
         self.require_grobid = require_grobid
         if self.require_grobid:
@@ -53,9 +51,14 @@ class IngestFileResultImporter(EntityImporter):
         if row.get('hit') != True:
             self.counts['skip-hit'] += 1
             return False
-        if self.ingest_request_source_whitelist and row['request'].get('ingest_request_source') not in self.ingest_request_source_whitelist:
+        source = row['request'].get('ingest_request_source')
+        if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
             self.counts['skip-ingest_request_source'] += 1
             return False
+        if source.startswith('savepapernow'):
+            # never process async savepapernow requests
+            self.counts['skip-savepapernow'] += 1
+            return False
         if not row.get('file_meta'):
             self.counts['skip-file-meta'] += 1
             return False
@@ -123,16 +126,21 @@ class IngestFileResultImporter(EntityImporter):
             sha1=file_meta['sha1hex'],
             sha256=file_meta['sha256hex'],
             size=file_meta['size_bytes'],
-            mimetype=file_meta['mimetype'] or self.default_mimetype,
+            mimetype=file_meta['mimetype'],
             release_ids=[release_ident],
             urls=urls,
         )
         if fatcat and fatcat.get('edit_extra'):
             fe.edit_extra = fatcat['edit_extra']
+        else:
+            fe.edit_extra = dict()
         if request.get('ingest_request_source'):
-            if not fe.edit_extra:
-                fe.edit_extra = dict()
             fe.edit_extra['ingest_request_source'] = request['ingest_request_source']
+        if request.get('link_source') and request.get('link_source_id'):
+            fe.edit_extra['link_source'] = request['link_source']
+            fe.edit_extra['link_source_id'] = request['link_source_id']
+        if not fe.edit_extra:
+            fe.edit_extra = None
         return fe
 
     def try_update(self, fe):
@@ -152,6 +160,12 @@ class IngestFileResultImporter(EntityImporter):
             self.counts['exists'] += 1
             return False
 
+        # check for existing edits-in-progress with same file hash
+        for other in self._entity_queue:
+            if other.sha1 == fe.sha1:
+                self.counts['skip-in-queue'] += 1
+                return False
+
         if not self.do_updates:
             self.counts['skip-update-disabled'] += 1
             return False
@@ -167,3 +181,60 @@ class IngestFileResultImporter(EntityImporter):
                     extra=self.editgroup_extra),
                 entity_list=batch))
 
+
+class SavePaperNowFileImporter(IngestFileResultImporter):
+    """
+    This worker ingests from the same feed as IngestFileResultImporter, but
+    only imports files from anonymous save-paper-now requests, and "submits"
+    them for further human review (as opposed to accepting by default).
+    """
+
+    def __init__(self, api, submit_mode=True, **kwargs):
+
+        eg_desc = kwargs.pop('editgroup_description', None) or "Files crawled after a public 'Save Paper Now' request"
+        eg_extra = kwargs.pop('editgroup_extra', dict())
+        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.IngestFileSavePaperNow')
+        kwargs['submit_mode'] = submit_mode
+        kwargs['require_grobid'] = True
+        kwargs['do_updates'] = False
+        super().__init__(api,
+            editgroup_description=eg_desc,
+            editgroup_extra=eg_extra,
+            **kwargs)
+
+    def want(self, row):
+
+        source = row['request'].get('ingest_request_source')
+        if not source.startswith('savepapernow'):
+            self.counts['skip-not-savepapernow'] += 1
+            return False
+        if row.get('hit') != True:
+            self.counts['skip-hit'] += 1
+            return False
+        if not row.get('file_meta'):
+            self.counts['skip-file-meta'] += 1
+            return False
+        if self.require_grobid and row.get('grobid', {}).get('status_code') != 200:
+            self.counts['skip-grobid'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
+        """
+        Usually running in submit_mode, so we can't use auto_batch method
+        """
+        if self.submit_mode:
+            eg = self.api.create_editgroup(fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra))
+            for fe in batch:
+                self.api.create_file(eg.editgroup_id, fe)
+            self.api.update_editgroup(eg.editgroup_id, eg, submit=True)
+        else:
+            self.api.create_file_auto_batch(fatcat_openapi_client.FileAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description,
+                    extra=self.editgroup_extra),
+                entity_list=batch))
+