diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-21 16:56:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-21 16:56:01 -0700 |
commit | 86d15bda26280437ac7a853e73d460d0bf9dd418 (patch) | |
tree | cfd8347bb1f4e98cdab67cebb4637421458673a9 /extra/elasticsearch/transform_release.py | |
parent | d495df1f76c44b7e09db2fb8b93615ffcdf6b818 (diff) | |
download | fatcat-86d15bda26280437ac7a853e73d460d0bf9dd418.tar.gz fatcat-86d15bda26280437ac7a853e73d460d0bf9dd418.zip |
first pass at a release elastic schema
Diffstat (limited to 'extra/elasticsearch/transform_release.py')
-rwxr-xr-x | extra/elasticsearch/transform_release.py | 79 |
1 files changed, 79 insertions, 0 deletions
# diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py (new file, mode 100755)
#!/usr/bin/env python3
#
# Reads one JSON release entity per line on stdin and prints one flattened
# JSON document per line on stdout. Entities that are not "active" are dropped.

import sys
import json


def transform(m):
    """Flatten a release entity dict into a flat document dict.

    Args:
        m: Release entity as parsed from JSON. Must contain 'state'; active
            entities must also contain 'ident', 'revision' and 'title'.
            Every other field is optional and may be missing or null.

    Returns:
        A flat dict of scalar fields (plus the 'author_names' list), or None
        when m['state'] is not 'active'.

    Raises:
        KeyError: if 'state' (or, for active entities, 'ident', 'revision'
            or 'title') is missing.
    """
    if m['state'] != 'active':
        return None

    # First, the easy ones (direct copy)
    t = {
        'ident': m['ident'],
        'revision': m['revision'],
        'title': m['title'],
        'release_date': m.get('release_date'),
        'release_type': m.get('release_type'),
        'release_status': m.get('release_status'),
        'language': m.get('language'),
        'doi': m.get('doi'),
        'pmid': m.get('pmid'),
        'pmcid': m.get('pmcid'),
        'isbn13': m.get('isbn13'),
        'core_id': m.get('core_id'),
        'wikidata_qid': m.get('wikidata_qid'),
    }

    container = m.get('container')
    if container:
        t['publisher'] = container.get('publisher')
        t['container_title'] = container.get('title')
        t['container_issnl'] = container.get('issnl')
        container_extra = container.get('extra')
        if container_extra:
            t['container_is_oa'] = container_extra.get('is_oa')
            t['container_is_kept'] = container_extra.get('is_kept')
            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
    else:
        # No expanded container: fall back to release-level fields.
        t['publisher'] = m.get('publisher')
        t['container_title'] = m.get('container_title')

    # `x.get(k) or default` rather than `x.get(k, default)`: the key may be
    # present with an explicit JSON null, which .get()'s default won't cover.
    files = m.get('files') or []
    t['file_count'] = len(files)
    in_wa = False
    in_ia = False
    t['file_pdf_url'] = None
    for f in files:
        is_pdf = 'pdf' in (f.get('mimetype') or '')
        for url in f.get('urls') or []:
            if url.get('rel', '') == 'webarchive':
                in_wa = True
            link = url.get('url')
            if not link:
                # Nothing to inspect or record for an entry without a URL.
                continue
            if '//web.archive.org/' in link or '//archive.org/' in link:
                in_ia = True
                if is_pdf:
                    # archive.org PDF links always win over whatever was
                    # recorded before.
                    t['file_pdf_url'] = link
            if is_pdf and not t['file_pdf_url']:
                # Otherwise keep the first PDF link seen.
                t['file_pdf_url'] = link
    t['file_in_webarchive'] = in_wa
    t['file_in_ia'] = in_ia

    extra = m.get('extra')
    if extra:
        t['in_shadow'] = extra.get('in_shadow')
    # Must be read from the input entity; the output dict never has 'abstracts'.
    t['any_abstract'] = bool(m.get('abstracts'))

    # NOTE(review): the original built this list and then discarded it.
    # Emitting it under 'author_names' - confirm the key against the index
    # mapping.
    t['author_names'] = [
        contrib.get('raw_name')
        for contrib in m.get('contribs') or []
        if contrib.get('raw_name')
    ]
    return t


def run():
    """Transform JSON lines from stdin and print the results to stdout.

    Blank lines are skipped; entities for which transform() returns None
    produce no output.
    """
    for line in sys.stdin:
        if not line.strip():
            continue
        obj = transform(json.loads(line))
        if obj:
            print(json.dumps(obj))


if __name__ == "__main__":
    run()