From 09e8e07602d04acef6fada5c82f1ae6268d03d3f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 9 Apr 2020 17:50:25 -0700 Subject: transform hacks for new fatcat documents --- fatcat_covid19/transform.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index 1002c10..338b3e8 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -169,7 +169,7 @@ def fulltext_to_elasticsearch(row, force_bool=True): t['doi_prefix'] = t['doi'].split('/')[0] # special-case medrxiv/biorxiv content - if not t.get('release_stage') and not t.get('container_name') and t.get('doi', '').startswith('10.1101/20'): + if not t.get('release_stage') and not t.get('container_name') and (t.get('doi') or '').startswith('10.1101/20'): t['container_name'] = 'biorXiv/medrXiv' t['release_stage'] = 'draft' if t.get('release_type') in ['post', None]: @@ -228,12 +228,27 @@ def fulltext_to_elasticsearch(row, force_bool=True): abstracts.append(paper['abstract']) if not t['license']: t['license'] = paper.get('license') or None + + if 'fatcat_hit' in row: + t['source_tags'].append('fatcat') t['abstract'] = abstracts t['abstract_lang'] = list(set(abstract_langs)) t['source_tags'] = list(set(t['source_tags'])) + ### filter out some documents + # figures, component of a larger work, drop it + if t['title'].lower().startswith('figure') and not t.get('release_type'): + return None + + ### clean up some documents + # protein databank + if t.get('doi_prefix') == '10.2210' and not t.get('release_type'): + t['release_type'] = 'dataset' + t['release_stage'] = 'published' + t['container_name'] = 'Protein Data Bank' + return t def transform_es_file(json_input, json_output): -- cgit v1.2.3