about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-09 17:50:25 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-09 17:50:25 -0700
commit 09e8e07602d04acef6fada5c82f1ae6268d03d3f (patch)
tree cfefd1835d701027be929d553891b6f28a9d9968
parent 71b7423fd251a27bdeb08545746cb4e5a0d33c9e (diff)
download fatcat-covid19-09e8e07602d04acef6fada5c82f1ae6268d03d3f.tar.gz
fatcat-covid19-09e8e07602d04acef6fada5c82f1ae6268d03d3f.zip
transform hacks for new fatcat documents
-rw-r--r-- fatcat_covid19/transform.py 17
1 files changed, 16 insertions, 1 deletions
diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
index 1002c10..338b3e8 100644
--- a/fatcat_covid19/transform.py
+++ b/fatcat_covid19/transform.py
@@ -169,7 +169,7 @@ def fulltext_to_elasticsearch(row, force_bool=True):
t['doi_prefix'] = t['doi'].split('/')[0]
# special-case medrxiv/biorxiv content
- if not t.get('release_stage') and not t.get('container_name') and t.get('doi', '').startswith('10.1101/20'):
+ if not t.get('release_stage') and not t.get('container_name') and (t.get('doi') or '').startswith('10.1101/20'):
t['container_name'] = 'biorXiv/medrXiv'
t['release_stage'] = 'draft'
if t.get('release_type') in ['post', None]:
@@ -228,12 +228,27 @@ def fulltext_to_elasticsearch(row, force_bool=True):
abstracts.append(paper['abstract'])
if not t['license']:
t['license'] = paper.get('license') or None
+
+ if 'fatcat_hit' in row:
+ t['source_tags'].append('fatcat')
t['abstract'] = abstracts
t['abstract_lang'] = list(set(abstract_langs))
t['source_tags'] = list(set(t['source_tags']))
+ ### filter out some documents
+ # figures, component of a larger work, drop it
+ if t['title'].lower().startswith('figure') and not t.get('release_type'):
+ return None
+
+ ### clean up some documents
+ # protein databank
+ if t.get('doi_prefix') == '10.2210' and not t.get('release_type'):
+ t['release_type'] = 'dataset'
+ t['release_stage'] = 'published'
+ t['container_name'] = 'Protein Data Bank'
+
return t
def transform_es_file(json_input, json_output):