summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/importers/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r--python/fatcat_tools/importers/ingest.py25
1 files changed, 21 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index bdfd2835..4772bfaa 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter):
'fatcat-ingest-container',
'fatcat-ingest',
'arabesque',
+ 'mag-corpus',
+ 'mag',
+ 'unpaywall-corpus',
+ 'unpaywall',
+ 's2-corpus',
+ 's2',
]
if kwargs.get('skip_source_whitelist', False):
self.ingest_request_source_whitelist = []
@@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter):
self.counts['skip-hit'] += 1
return False
source = row['request'].get('ingest_request_source')
+ if not source:
+ self.counts['skip-ingest_request_source'] += 1
+ return False
if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist:
self.counts['skip-ingest_request_source'] += 1
return False
if source.startswith('arabesque'):
- if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'):
+ if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'):
self.counts['skip-arabesque-source'] += 1
return False
if source.startswith('savepapernow'):
@@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter):
if not 'terminal_dt' in terminal:
terminal['terminal_dt'] = terminal['dt']
assert len(terminal['terminal_dt']) == 14
- url = make_rel_url(terminal['terminal_url'], self.default_link_rel)
+
+ default_rel = self.default_link_rel
+ if request.get('link_source') == 'doi':
+ default_rel = 'publisher'
+ default_rel = request.get('rel', default_rel)
+ url = make_rel_url(terminal['terminal_url'], default_rel)
if not url:
self.counts['skip-url'] += 1
@@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter):
release_ids=[release_ident],
urls=urls,
)
- if fatcat and fatcat.get('edit_extra'):
- fe.edit_extra = fatcat['edit_extra']
+ if request.get('edit_extra'):
+ fe.edit_extra = request['edit_extra']
else:
fe.edit_extra = dict()
if request.get('ingest_request_source'):
@@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter):
def want(self, row):
source = row['request'].get('ingest_request_source')
+ if not source:
+ self.counts['skip-ingest_request_source'] += 1
+ return False
if not source.startswith('savepapernow'):
self.counts['skip-not-savepapernow'] += 1
return False