summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools/importers/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r-- python/fatcat_tools/importers/ingest.py 19
1 files changed, 13 insertions, 6 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index c47f0aa7..33c40eff 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -29,6 +29,7 @@ class IngestFileResultImporter(EntityImporter):
self.ingest_request_source_whitelist = [
'fatcat-changelog',
'fatcat-ingest-container',
+ 'arabesque',
]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
@@ -55,6 +56,10 @@ class IngestFileResultImporter(EntityImporter):
if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
+ if source.startswith('arabesque'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc'):
+ self.counts['skip-arabesque-source'] += 1
+ return False
if source.startswith('savepapernow'):
# never process async savepapernow requests
self.counts['skip-savepapernow'] += 1
@@ -152,20 +157,22 @@ class IngestFileResultImporter(EntityImporter):
if err.status != 404:
raise err
+ # check for existing edits-in-progress with same file hash
+ for other in self._entity_queue:
+ if other.sha1 == fe.sha1:
+ self.counts['skip-in-queue'] += 1
+ return False
+
if not existing:
return True
+ # the following checks all assume there is an existing item
+
if (fe.release_ids[0] in existing.release_ids) and existing.urls:
# TODO: could still, in theory update with the new URL?
self.counts['exists'] += 1
return False
- # check for existing edits-in-progress with same file hash
- for other in self._entity_queue:
- if other.sha1 == fe.sha1:
- self.counts['skip-in-queue'] += 1
- return False
-
if not self.do_updates:
self.counts['skip-update-disabled'] += 1
return False