diff options
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 19 |
1 file changed, 13 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index c47f0aa7..33c40eff 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -29,6 +29,7 @@ class IngestFileResultImporter(EntityImporter): self.ingest_request_source_whitelist = [ 'fatcat-changelog', 'fatcat-ingest-container', + 'arabesque', ] if kwargs.get('skip_source_whitelist', False): self.ingest_request_source_whitelist = [] @@ -55,6 +56,10 @@ class IngestFileResultImporter(EntityImporter): if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist: self.counts['skip-ingest_request_source'] += 1 return False + if source.startswith('arabesque'): + if row['reqeust'].get('link_source') not in ('arxiv', 'pmc'): + self.counts['skip-arabesque-source'] += 1 + return False if source.startswith('savepapernow'): # never process async savepapernow requests self.counts['skip-savepapernow'] += 1 @@ -152,20 +157,22 @@ class IngestFileResultImporter(EntityImporter): if err.status != 404: raise err + # check for existing edits-in-progress with same file hash + for other in self._entity_queue: + if other.sha1 == fe.sha1: + self.counts['skip-in-queue'] += 1 + return False + if not existing: return True + # the following checks all assume there is an existing item + if (fe.release_ids[0] in existing.release_ids) and existing.urls: # TODO: could still, in theory update with the new URL? self.counts['exists'] += 1 return False - # check for existing edits-in-progress with same file hash - for other in self._entity_queue: - if other.sha1 == fe.sha1: - self.counts['skip-in-queue'] += 1 - return False - if not self.do_updates: self.counts['skip-update-disabled'] += 1 return False |