# NOTE(review): this chunk arrived whitespace-mangled — the whole module is collapsed onto one
# physical line with every newline/indent stripped, and it is TRUNCATED mid-expression at the
# end of the JATS-abstract condition (the text stops at `... == 'application/xml+jats' or "`,
# inside an unterminated string). The code is reproduced byte-identical below; restore the
# original formatting from the upstream source before making any code change here.
#
# What the visible portion establishes:
#   * UNWANTED_ABSTRACT_PREFIXES — boilerplate prefixes, deliberately ordered long-to-short
#     (per the inline comment), presumably stripped from abstract text by code past the
#     truncation point — TODO confirm against the full file.
#   * fulltext_to_elasticsearch(row, force_bool=True) — maps a fulltext row plus its matched
#     fatcat release metadata into a flat Elasticsearch-oriented dict:
#       - returns None (skips the row) when row has no 'fatcat_release' key;
#       - copies BIBLIO_KEYS fields from the release, normalizing falsy values to None;
#       - copies EXT_IDS from release['ext_ids'], suffixing 'arxiv'/'jstor'/'mag' with "_id";
#       - sets 'contrib_count' from len(release['contribs'] or []);
#       - begins iterating release['abstracts'] (JATS clean-up hack) — body cut off here.
#
# NOTE(review): the docstring says "Raises exception on error (never returns None)", but the
# first guard clause explicitly `return None` for rows without 'fatcat_release' — the docstring
# and the behavior contradict each other; confirm the intended contract against callers before
# fixing either side.
# NOTE(review): `force_bool` is accepted but never used in the visible portion — presumably
# consumed past the truncation point; verify.
import sys import json import argparse import datetime from bs4 import BeautifulSoup from fatcat_covid19.common import * UNWANTED_ABSTRACT_PREFIXES = [ # roughly sort this long to short 'Abstract No Abstract ', 'Publisher Summary ', 'Abstract ', 'ABSTRACT ', 'Summary ', 'Background: ', 'Background ', ] def fulltext_to_elasticsearch(row, force_bool=True): """ Converts from fulltext content and release model/schema to elasticsearch oriented schema. Returns: dict Raises exception on error (never returns None) """ if not 'fatcat_release' in row: # skip papers that don't match to a fatcat release return None release = row['fatcat_release'] abstracts = [] abstract_langs = [] # first, easy fatcat metadata t = { 'fatcat_ident': release['ident'], 'fatcat_revision': release['revision'], 'fulltext': dict(), } BIBLIO_KEYS = [ 'work_id', 'title', 'subtitle', 'original_title', 'release_type', 'release_stage', 'release_year', 'release_date', 'withdrawn_status', 'language', 'volume', 'issue', 'pages', 'number', 'license', ] EXT_IDS = [ 'doi', 'pmid', 'pmcid', 'isbn13', 'wikidata_qid', 'arxiv', 'jstor', 'mag', ] for key in BIBLIO_KEYS: t[key] = release.get(key) or None for key in EXT_IDS: if key in ['arxiv', 'jstor', 'mag']: t[key + "_id"] = release['ext_ids'].get(key) or None else: t[key] = release['ext_ids'].get(key) or None t['contrib_count'] = len(release['contribs'] or []) if release.get('abstracts'): for a in release['abstracts']: # hack to (partially) clean up common JATS abstract display case if a.get('mimetype') == 'application/xml+jats' or "