aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-08 15:08:23 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-08 15:08:23 -0700
commit c72da804be4b0c90352ce3f760cdf1113c8b3098 (patch)
tree 5e85da32b9617e415e774978c2355a3b93b28e2c
parent b92aef0bb235e8e38af2fbc25e33639f3654b191 (diff)
download fatcat-covid19-c72da804be4b0c90352ce3f760cdf1113c8b3098.tar.gz
fatcat-covid19-c72da804be4b0c90352ce3f760cdf1113c8b3098.zip
special-case arxiv/medrxiv/biorxiv container names
-rw-r--r-- fatcat_covid19/transform.py 11
1 files changed, 11 insertions, 0 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 0eb1b0a..9616c57 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -168,6 +168,17 @@ def fulltext_to_elasticsearch(row, force_bool=True):
if t['doi']:
t['doi_prefix'] = t['doi'].split('/')[0]
+ # special-case medrxiv/biorxiv content
+ if not t.get('release_stage') and not t.get('container_name') and t.get('doi', '').startswith('10.1101/20'):
+ t['container_name'] = 'biorXiv / medrXiv'
+ t['release_stage'] = 'draft'
+ if t.get('release_type') in ['post', None]:
+ t['release_type'] = 'article-journal'
+
+ # special-case arxiv
+ if not t.get('container_name') and t.get('arxiv_id'):
+ t['container_name'] = 'arXiv'
+
# then the fulltext stuff
t['fulltext']['status'] = row.get('fulltext_status', 'none')
if 'fulltext_file' in row: