diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-09 17:50:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-09 17:50:25 -0700 |
commit | 09e8e07602d04acef6fada5c82f1ae6268d03d3f (patch) | |
tree | cfefd1835d701027be929d553891b6f28a9d9968 /fatcat_covid19 | |
parent | 71b7423fd251a27bdeb08545746cb4e5a0d33c9e (diff) | |
download | fatcat-covid19-09e8e07602d04acef6fada5c82f1ae6268d03d3f.tar.gz fatcat-covid19-09e8e07602d04acef6fada5c82f1ae6268d03d3f.zip |
transform hacks for new fatcat documents
Diffstat (limited to 'fatcat_covid19')
-rw-r--r-- | fatcat_covid19/transform.py | 17 |
1 files changed, 16 insertions, 1 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py index 1002c10..338b3e8 100644 --- a/fatcat_covid19/transform.py +++ b/fatcat_covid19/transform.py @@ -169,7 +169,7 @@ def fulltext_to_elasticsearch(row, force_bool=True): t['doi_prefix'] = t['doi'].split('/')[0] # special-case medrxiv/biorxiv content - if not t.get('release_stage') and not t.get('container_name') and t.get('doi', '').startswith('10.1101/20'): + if not t.get('release_stage') and not t.get('container_name') and (t.get('doi') or '').startswith('10.1101/20'): t['container_name'] = 'biorXiv/medrXiv' t['release_stage'] = 'draft' if t.get('release_type') in ['post', None]: @@ -228,12 +228,27 @@ def fulltext_to_elasticsearch(row, force_bool=True): abstracts.append(paper['abstract']) if not t['license']: t['license'] = paper.get('license') or None + + if 'fatcat_hit' in row: + t['source_tags'].append('fatcat') t['abstract'] = abstracts t['abstract_lang'] = list(set(abstract_langs)) t['source_tags'] = list(set(t['source_tags'])) + ### filter out some documents + # figures, component of a larger work, drop it + if t['title'].lower().startswith('figure') and not t.get('release_type'): + return None + + ### clean up some documents + # protein databank + if t.get('doi_prefix') == '10.2210' and not t.get('release_type'): + t['release_type'] = 'dataset' + t['release_stage'] = 'published' + t['container_name'] = 'Protein Data Bank' + return t def transform_es_file(json_input, json_output): |