From 7bf8ae73b8b5dfca4d17f353cdbec669e69bbbec Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 3 Apr 2020 15:06:11 -0700
Subject: refactor elastic transform into CLI tool

---
 covid19_tool.py             |  14 +++
 elastic_transform.py        | 223 --------------------------------------------
 fatcat_covid19/transform.py | 204 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 218 insertions(+), 223 deletions(-)
 delete mode 100755 elastic_transform.py
 create mode 100644 fatcat_covid19/transform.py

diff --git a/covid19_tool.py b/covid19_tool.py
index 23a2c6c..1cf8dce 100755
--- a/covid19_tool.py
+++ b/covid19_tool.py
@@ -10,6 +10,8 @@
 import sys
 import argparse
 
 from fatcat_covid19.webface import app
+from fatcat_covid19.derivatives import enrich_derivatives_file
+from fatcat_covid19.transform import transform_es_file
 
 def main():
@@ -58,6 +60,16 @@
         help="directory to look for files (in 'pdf' subdirectory)",
         default="fulltext_web")
 
+    sub_transform_es = subparsers.add_parser('transform-es',
+        help="transform fulltext JSON to elasticsearch schema JSON")
+    sub_transform_es.add_argument('json_file',
+        help="input JSON rows file (fulltext)",
+        type=argparse.FileType('r'))
+    sub_transform_es.add_argument('--json-output',
+        help="file to write to",
+        type=argparse.FileType('w'),
+        default=sys.stdout)
+
     args = parser.parse_args()
 
     if args.action == 'webface':
@@ -65,6 +77,8 @@
     if args.action == 'derivatives':
         enrich_derivatives_file(args.json_file, args.json_output,
             args.base_dir)
+    elif args.action == 'transform-es':
+        transform_es_file(args.json_file, args.json_output)
     else:
         print("tell me what to do!")
         sys.exit(-1)

diff --git a/elastic_transform.py b/elastic_transform.py
deleted file mode 100755
index 04fba33..0000000
--- a/elastic_transform.py
+++ /dev/null
@@ -1,223 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Takes *enriched* JSON objects which include fatcat metadata and fulltext
-content, and outputs JSON lines in fatcat_fulltext schema.
-"""
-
-import sys
-import json
-import argparse
-import datetime
-
-from fatcat_covid19.common import *
-
-
-def fulltext_to_elasticsearch(row, force_bool=True):
-    """
-    Converts from fulltext content and release model/schema to elasticsearch
-    oriented schema.
-
-    Returns: dict
-    Raises exception on error (never returns None)
-    """
-
-    if not 'fatcat_release' in row:
-        # skip papers that don't match to a fatcat release
-        return None
-
-    release = row['fatcat_release']
-
-    abstracts = []
-    abstract_langs = []
-
-    # first, easy fatcat metadata
-    t = {
-        'fatcat_ident': release['ident'],
-        'fatcat_revision': release['revision'],
-        'fulltext': dict(),
-    }
-    BIBLIO_KEYS = [
-        'work_id',
-        'title',
-        'subtitle',
-        'original_title',
-        'release_type',
-        'release_stage',
-        'release_year',
-        'release_date',
-        'withdrawn_status',
-        'language',
-        'volume',
-        'issue',
-        'pages',
-        'number',
-        'license',
-    ]
-    EXT_IDS = [
-        'doi',
-        'pmid',
-        'pmcid',
-        'isbn13',
-        'wikidata_qid',
-        'arxiv_id',
-        'jstor_id',
-        'mag_id',
-    ]
-    for key in BIBLIO_KEYS:
-        t[key] = release.get(key) or None
-    for key in EXT_IDS:
-        t[key] = release['ext_ids'].get(key) or None
-
-    t['contrib_count'] = len(release['contribs'] or [])
-
-    if release.get('abstracts'):
-        for a in release['abstracts']:
-            abstracts.append(a['content'])
-            abstract_langs.append(a['lang'])
-
-    contrib_names = []
-    contrib_affiliations = []
-    creator_ids = []
-    for c in (release['contribs'] or []):
-        if c.get('raw_name'):
-            contrib_names.append(c['raw_name'])
-        elif c.get('surname'):
-            contrib_names.append(c['surname'])
-        if c.get('creator_id'):
-            creator_ids.append(c['creator_id'])
-        if c.get('raw_affiliation'):
-            contrib_affiliations.append(c['raw_affiliation'])
-    t['contrib_names'] = contrib_names
-    t['creator_ids'] = creator_ids
-    t['affiliations'] = contrib_affiliations
-
-    container = release.get('container')
-    if container:
-        t['publisher'] = container.get('publisher')
-        t['container_name'] = container.get('name')
-        t['container_original_name'] = container.get('original_name')
-        # this is container.ident, not release.container_id, because there may
-        # be a redirect involved
-        t['container_id'] = container['ident']
-        t['container_issnl'] = container.get('issnl')
-        t['container_type'] = container.get('container_type')
-        if container.get('extra'):
-            c_extra = container['extra']
-            if c_extra.get('country'):
-                t['country_code'] = c_extra['country']
-                t['country_code_upper'] = c_extra['country'].upper()
-
-    # fall back to release-level container metadata if container not linked or
-    # missing context
-    if not t.get('publisher'):
-        t['publisher'] = release.get('publisher')
-    if not t.get('container_name') and release.get('extra'):
-        t['container_name'] = release['extra'].get('container_name')
-
-    extra = release['extra'] or dict()
-    if extra:
-        if not t.get('container_name'):
-            t['container_name'] = extra.get('container_name')
-        # backwards compatible subtitle fetching
-        if not t['subtitle'] and extra.get('subtitle'):
-            if type(extra['subtitle']) == list:
-                t['subtitle'] = extra['subtitle'][0]
-            else:
-                t['subtitle'] = extra['subtitle']
-
-    t['first_page'] = None
-    if release.get('pages'):
-        first = release['pages'].split('-')[0]
-        first = first.replace('p', '')
-        if first.isdigit():
-            t['first_page'] = first
-    # TODO: non-numerical first pages
-
-    t['doi_registrar'] = None
-    if extra and t['doi']:
-        for k in ('crossref', 'datacite', 'jalc'):
-            if k in extra:
-                t['doi_registrar'] = k
-        if not 'doi_registrar' in t:
-            t['doi_registrar'] = 'crossref'
-
-    if t['doi']:
-        t['doi_prefix'] = t['doi'].split('/')[0]
-
-    # then the fulltext stuff
-    t['fulltext']['status'] = row.get('fulltext_status', 'none')
-    if 'fulltext_file' in row:
-        full = row['fulltext_file']
-        t['fulltext']['sha1'] = full['sha1']
-        t['fulltext']['pdf_url'] = "/" + full['pdf_path']
-        if full.get('pdftotext_path'):
-            t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path']
-        if full.get('thumbnail_path'):
-            t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
-        if full.get('grobid_xml_path'):
-            t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
-
-    if 'fulltext_grobid' in row:
-        grobid = row['fulltext_grobid']
-        if grobid.get('abstract'):
-            abstracts.append(grobid['abstract'])
-            abstract_langs.append(grobid['language_code'])
-        t['fulltext']['abstract'] = grobid.get('abstract', None)
-        t['fulltext']['body'] = grobid.get('body', None)
-        t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None)
-        t['fulltext']['annex'] = grobid.get('annex', None)
-        t['fulltext']['lang'] = grobid.get('language_code', None)
-    elif 'fulltext_pdftotext' in row:
-        pdftotext = row['fulltext_pdftotext']
-        t['fulltext']['body'] = pdftotext.get('body', None)
-
-    # then other metadata stuff
-    if row.get('source_tags'):
-        # will get set-uniq at the end
-        t['source_tags'] = row['source_tags']
-    else:
-        t['source_tags'] = []
-
-    if 'cord19_paper' in row:
-        t['source_tags'].append('cord19')
-        paper = row['cord19_paper']
-        t['cord19_uid'] = paper['cord_uid']
-        if paper.get('who_covidence_id'):
-            t['who_covidence_id'] = paper['who_covidence_id']
-            t['source_tags'].append('who')
-        if paper.get('abstract') and not abstracts:
-            abstracts.append(paper['abstract'])
-        if not t['license']:
-            t['license'] = paper.get('license') or None
-
-    t['abstract'] = abstracts
-    t['abstract_lang'] = list(set(abstract_langs))
-
-    t['source_tags'] = list(set(t['source_tags']))
-
-    return t
-
-def run(args):
-    for l in args.json_file:
-        l = json.loads(l)
-        result = fulltext_to_elasticsearch(l, args)
-        if result:
-            print(json.dumps(result, sort_keys=True))
-
-def main():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('json_file',
-        help="fulltext content input",
-        type=argparse.FileType('r'))
-    subparsers = parser.add_subparsers()
-
-    args = parser.parse_args()
-    args.session = requests_retry_session()
-
-    run(args)
-
-if __name__ == '__main__':
-    main()
-

diff --git a/fatcat_covid19/transform.py b/fatcat_covid19/transform.py
new file mode 100644
index 0000000..c31c9f4
--- /dev/null
+++ b/fatcat_covid19/transform.py
@@ -0,0 +1,204 @@
+
+import sys
+import json
+import argparse
+import datetime
+
+from fatcat_covid19.common import *
+
+
+def fulltext_to_elasticsearch(row, force_bool=True):
+    """
+    Converts from fulltext content and release model/schema to elasticsearch
+    oriented schema.
+
+    Returns: dict, or None if the row is not matched to a fatcat release.
+    Raises an exception on any other error.
+    """
+
+    if 'fatcat_release' not in row:
+        # skip papers that don't match to a fatcat release
+        return None
+
+    release = row['fatcat_release']
+
+    abstracts = []
+    abstract_langs = []
+
+    # first, easy fatcat metadata
+    t = {
+        'fatcat_ident': release['ident'],
+        'fatcat_revision': release['revision'],
+        'fulltext': dict(),
+    }
+    BIBLIO_KEYS = [
+        'work_id',
+        'title',
+        'subtitle',
+        'original_title',
+        'release_type',
+        'release_stage',
+        'release_year',
+        'release_date',
+        'withdrawn_status',
+        'language',
+        'volume',
+        'issue',
+        'pages',
+        'number',
+        'license',
+    ]
+    EXT_IDS = [
+        'doi',
+        'pmid',
+        'pmcid',
+        'isbn13',
+        'wikidata_qid',
+        'arxiv_id',
+        'jstor_id',
+        'mag_id',
+    ]
+    for key in BIBLIO_KEYS:
+        t[key] = release.get(key) or None
+    for key in EXT_IDS:
+        t[key] = release['ext_ids'].get(key) or None
+
+    t['contrib_count'] = len(release['contribs'] or [])
+
+    if release.get('abstracts'):
+        for a in release['abstracts']:
+            abstracts.append(a['content'])
+            abstract_langs.append(a['lang'])
+
+    contrib_names = []
+    contrib_affiliations = []
+    creator_ids = []
+    for c in (release['contribs'] or []):
+        if c.get('raw_name'):
+            contrib_names.append(c['raw_name'])
+        elif c.get('surname'):
+            contrib_names.append(c['surname'])
+        if c.get('creator_id'):
+            creator_ids.append(c['creator_id'])
+        if c.get('raw_affiliation'):
+            contrib_affiliations.append(c['raw_affiliation'])
+    t['contrib_names'] = contrib_names
+    t['creator_ids'] = creator_ids
+    t['affiliations'] = contrib_affiliations
+
+    container = release.get('container')
+    if container:
+        t['publisher'] = container.get('publisher')
+        t['container_name'] = container.get('name')
+        t['container_original_name'] = container.get('original_name')
+        # this is container.ident, not release.container_id, because there may
+        # be a redirect involved
+        t['container_id'] = container['ident']
+        t['container_issnl'] = container.get('issnl')
+        t['container_type'] = container.get('container_type')
+        if container.get('extra'):
+            c_extra = container['extra']
+            if c_extra.get('country'):
+                t['country_code'] = c_extra['country']
+                t['country_code_upper'] = c_extra['country'].upper()
+
+    # fall back to release-level container metadata if container not linked or
+    # missing context
+    if not t.get('publisher'):
+        t['publisher'] = release.get('publisher')
+    if not t.get('container_name') and release.get('extra'):
+        t['container_name'] = release['extra'].get('container_name')
+
+    extra = release['extra'] or dict()
+    if extra:
+        if not t.get('container_name'):
+            t['container_name'] = extra.get('container_name')
+        # backwards compatible subtitle fetching
+        if not t['subtitle'] and extra.get('subtitle'):
+            if isinstance(extra['subtitle'], list):
+                t['subtitle'] = extra['subtitle'][0]
+            else:
+                t['subtitle'] = extra['subtitle']
+
+    t['first_page'] = None
+    if release.get('pages'):
+        first = release['pages'].split('-')[0]
+        first = first.replace('p', '')
+        if first.isdigit():
+            t['first_page'] = first
+    # TODO: non-numerical first pages
+
+    t['doi_registrar'] = None
+    if extra and t['doi']:
+        for k in ('crossref', 'datacite', 'jalc'):
+            if k in extra:
+                t['doi_registrar'] = k
+        if not t['doi_registrar']:
+            t['doi_registrar'] = 'crossref'
+
+    if t['doi']:
+        t['doi_prefix'] = t['doi'].split('/')[0]
+
+    # then the fulltext stuff
+    t['fulltext']['status'] = row.get('fulltext_status', 'none')
+    if 'fulltext_file' in row:
+        full = row['fulltext_file']
+        t['fulltext']['sha1'] = full['sha1']
+        t['fulltext']['pdf_url'] = "/" + full['pdf_path']
+        if full.get('pdftotext_path'):
+            t['fulltext']['pdftotext_url'] = "/" + full['pdftotext_path']
+        if full.get('thumbnail_path'):
+            t['fulltext']['thumbnail_url'] = "/" + full['thumbnail_path']
+        if full.get('grobid_xml_path'):
+            t['fulltext']['grobid_xml_url'] = "/" + full['grobid_xml_path']
+
+    if 'fulltext_grobid' in row:
+        grobid = row['fulltext_grobid']
+        if grobid.get('abstract'):
+            abstracts.append(grobid['abstract'])
+            abstract_langs.append(grobid['language_code'])
+        t['fulltext']['abstract'] = grobid.get('abstract', None)
+        t['fulltext']['body'] = grobid.get('body', None)
+        t['fulltext']['acknowledgement'] = grobid.get('acknowledgement', None)
+        t['fulltext']['annex'] = grobid.get('annex', None)
+        t['fulltext']['lang'] = grobid.get('language_code', None)
+    elif 'fulltext_pdftotext' in row:
+        pdftotext = row['fulltext_pdftotext']
+        t['fulltext']['body'] = pdftotext.get('body', None)
+
+    # then other metadata stuff
+    if row.get('source_tags'):
+        # will get set-uniq at the end
+        t['source_tags'] = row['source_tags']
+    else:
+        t['source_tags'] = []
+
+    if 'cord19_paper' in row:
+        t['source_tags'].append('cord19')
+        paper = row['cord19_paper']
+        t['cord19_uid'] = paper['cord_uid']
+        if paper.get('who_covidence_id'):
+            t['who_covidence_id'] = paper['who_covidence_id']
+            t['source_tags'].append('who')
+        if paper.get('abstract') and not abstracts:
+            abstracts.append(paper['abstract'])
+        if not t['license']:
+            t['license'] = paper.get('license') or None
+
+    t['abstract'] = abstracts
+    t['abstract_lang'] = list(set(abstract_langs))
+
+    t['source_tags'] = list(set(t['source_tags']))
+
+    return t
+
+def transform_es_file(json_input, json_output):
+    """
+    Takes *enriched* JSON objects which include fatcat metadata and fulltext
+    content, and outputs JSON lines in fatcat_fulltext schema.
+    """
+    for l in json_input:
+        l = json.loads(l)
+        result = fulltext_to_elasticsearch(l)
+        if result:
+            print(json.dumps(result, sort_keys=True), file=json_output)
--
cgit v1.2.3
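
For reference, a minimal usage sketch of the new subcommand once this patch is
applied (the input and output file names here are hypothetical):

    python3 covid19_tool.py transform-es fulltext.enriched.json --json-output covid19.es.json

Each input line is an enriched fulltext JSON object; each output line is a
JSON document in the elasticsearch-oriented schema, written to stdout unless
--json-output is given.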