diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-08 15:08:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-08 15:08:23 -0700 |
commit | c72da804be4b0c90352ce3f760cdf1113c8b3098 (patch) | |
tree | 5e85da32b9617e415e774978c2355a3b93b28e2c /fatcat_covid19 | |
parent | b92aef0bb235e8e38af2fbc25e33639f3654b191 (diff) | |
download | fatcat-covid19-c72da804be4b0c90352ce3f760cdf1113c8b3098.tar.gz fatcat-covid19-c72da804be4b0c90352ce3f760cdf1113c8b3098.zip |
special-case arxiv/medrxiv/biorxiv container names
Diffstat (limited to 'fatcat_covid19')
-rw-r--r-- | fatcat_covid19/transform.py | 11 |
1 file changed, 11 insertions, 0 deletions
```diff
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 0eb1b0a..9616c57 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -168,6 +168,17 @@ def fulltext_to_elasticsearch(row, force_bool=True):
     if t['doi']:
         t['doi_prefix'] = t['doi'].split('/')[0]
 
+    # special-case medrxiv/biorxiv content
+    if not t.get('release_stage') and not t.get('container_name') and t.get('doi', '').startswith('10.1101/20'):
+        t['container_name'] = 'biorXiv / medrXiv'
+        t['release_stage'] = 'draft'
+        if t.get('release_type') in ['post', None]:
+            t['release_type'] = 'article-journal'
+
+    # special-case arxiv
+    if not t.get('container_name') and t.get('arxiv_id'):
+        t['container_name'] = 'arXiv'
+
     # then the fulltext stuff
     t['fulltext']['status'] = row.get('fulltext_status', 'none')
     if 'fulltext_file' in row:
```