summaryrefslogtreecommitdiffstats
path: root/extra/elasticsearch/transform_release.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-09-21 16:56:01 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-09-21 16:56:01 -0700
commit86d15bda26280437ac7a853e73d460d0bf9dd418 (patch)
treecfd8347bb1f4e98cdab67cebb4637421458673a9 /extra/elasticsearch/transform_release.py
parentd495df1f76c44b7e09db2fb8b93615ffcdf6b818 (diff)
downloadfatcat-86d15bda26280437ac7a853e73d460d0bf9dd418.tar.gz
fatcat-86d15bda26280437ac7a853e73d460d0bf9dd418.zip
first pass at a release elastic schema
Diffstat (limited to 'extra/elasticsearch/transform_release.py')
-rwxr-xr-xextra/elasticsearch/transform_release.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/extra/elasticsearch/transform_release.py b/extra/elasticsearch/transform_release.py
new file mode 100755
index 00000000..30449e18
--- /dev/null
+++ b/extra/elasticsearch/transform_release.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+
+def transform(m):
+
+ if m['state'] != 'active':
+ return None
+
+ # First, the easy ones (direct copy)
+ t = dict(
+ ident = m['ident'],
+ revision = m['revision'],
+ title = m['title'],
+ release_date = m.get('release_date'),
+ release_type = m.get('release_type'),
+ release_status = m.get('release_status'),
+ language = m.get('language'),
+ doi = m.get('doi'),
+ pmid = m.get('pmid'),
+ pmcid = m.get('pmcid'),
+ isbn13 = m.get('isbn13'),
+ core_id = m.get('core_id'),
+ wikidata_qid = m.get('wikidata_qid')
+ )
+
+ container = m.get('container')
+ if container:
+ t['publisher'] = countainer.get('publisher')
+ t['container_title'] = countainer.get('title')
+ t['container_issnl'] = countainer.get('issnl')
+ container_extra = container.get('extra')
+ if container_extra:
+ t['container_is_oa'] = container_extra.get('is_oa')
+ t['container_is_kept'] = container_extra.get('is_kept')
+ t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+ else:
+ t['publisher'] = m.get('publisher')
+ t['container_title'] = m.get('container_title')
+
+ files = m.get('files', [])
+ t['file_count'] = len(files)
+ in_wa = False
+ in_ia = False
+ t['file_pdf_url'] = None
+ for f in files:
+ is_pdf = 'pdf' in f.get('mimetype', '')
+ for url in f.get('urls', []):
+ if url.get('rel', '') == 'webarchive':
+ in_wa = True
+ if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']:
+ in_ia = True
+ if is_pdf:
+ t['file_pdf_url'] = url['url']
+ if not t['file_pdf_url'] and is_pdf:
+ t['file_pdf_url'] = url['url']
+ t['file_in_webarchive'] = in_wa
+ t['file_in_ia'] = in_ia
+
+ extra = m.get('extra')
+ if extra:
+ t['in_shadow'] = extra.get('in_shadow')
+ t['any_abstract'] = bool(t.get('abstracts'))
+
+ author_names = []
+ for contrib in m.get('contribs', []):
+ if contrib.get('raw_name'):
+ author_names.append(contrib.get('raw_name'))
+ return t
+
+def run():
+ for line in sys.stdin:
+ obj = transform(json.loads(line))
+ if obj:
+ print(json.dumps(obj))
+
+if __name__=="__main__":
+ run()