diff options
Diffstat (limited to 'python/fatcat_tools/importers/ingest.py')
-rw-r--r-- | python/fatcat_tools/importers/ingest.py | 25 |
1 files changed, 21 insertions, 4 deletions
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py index bdfd2835..4772bfaa 100644 --- a/python/fatcat_tools/importers/ingest.py +++ b/python/fatcat_tools/importers/ingest.py @@ -31,6 +31,12 @@ class IngestFileResultImporter(EntityImporter): 'fatcat-ingest-container', 'fatcat-ingest', 'arabesque', + 'mag-corpus', + 'mag', + 'unpaywall-corpus', + 'unpaywall', + 's2-corpus', + 's2', ] if kwargs.get('skip_source_whitelist', False): self.ingest_request_source_whitelist = [] @@ -54,11 +60,14 @@ class IngestFileResultImporter(EntityImporter): self.counts['skip-hit'] += 1 return False source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False if self.ingest_request_source_whitelist and source not in self.ingest_request_source_whitelist: self.counts['skip-ingest_request_source'] += 1 return False if source.startswith('arabesque'): - if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi'): + if row['request'].get('link_source') not in ('arxiv', 'pmc', 'unpaywall', 'doi', 'mag', 's2'): self.counts['skip-arabesque-source'] += 1 return False if source.startswith('savepapernow'): @@ -131,7 +140,12 @@ class IngestFileResultImporter(EntityImporter): if not 'terminal_dt' in terminal: terminal['terminal_dt'] = terminal['dt'] assert len(terminal['terminal_dt']) == 14 - url = make_rel_url(terminal['terminal_url'], self.default_link_rel) + + default_rel = self.default_link_rel + if request.get('link_source') == 'doi': + default_rel = 'publisher' + default_rel = request.get('rel', default_rel) + url = make_rel_url(terminal['terminal_url'], default_rel) if not url: self.counts['skip-url'] += 1 @@ -152,8 +166,8 @@ class IngestFileResultImporter(EntityImporter): release_ids=[release_ident], urls=urls, ) - if fatcat and fatcat.get('edit_extra'): - fe.edit_extra = fatcat['edit_extra'] + if request.get('edit_extra'): + fe.edit_extra = request['edit_extra'] else: fe.edit_extra = dict() if request.get('ingest_request_source'): @@ -229,6 +243,9 @@ class SavePaperNowFileImporter(IngestFileResultImporter): def want(self, row): source = row['request'].get('ingest_request_source') + if not source: + self.counts['skip-ingest_request_source'] += 1 + return False if not source.startswith('savepapernow'): self.counts['skip-not-savepapernow'] += 1 return False |