From c72da804be4b0c90352ce3f760cdf1113c8b3098 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 8 Apr 2020 15:08:23 -0700
Subject: special-case arxiv/medrxiv/biorxiv container names

---
 fatcat_covid19/transform.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fatcat_covid19')

diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 0eb1b0a..9616c57 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -168,6 +168,17 @@ def fulltext_to_elasticsearch(row, force_bool=True):
     if t['doi']:
         t['doi_prefix'] = t['doi'].split('/')[0]
 
+    # special-case medrxiv/biorxiv content
+    if not t.get('release_stage') and not t.get('container_name') and t.get('doi', '').startswith('10.1101/20'):
+        t['container_name'] = 'biorXiv / medrXiv'
+        t['release_stage'] = 'draft'
+        if t.get('release_type') in ['post', None]:
+            t['release_type'] = 'article-journal'
+
+    # special-case arxiv
+    if not t.get('container_name') and t.get('arxiv_id'):
+        t['container_name'] = 'arXiv'
+
     # then the fulltext stuff
     t['fulltext']['status'] = row.get('fulltext_status', 'none')
     if 'fulltext_file' in row:
-- 
cgit v1.2.3