import collections

from fatcat_client import ReleaseEntity, ApiClient


def entity_to_dict(entity):
    """
    Hack to take advantage of the code-generated serialization code
    """
    ac = ApiClient()
    return ac.sanitize_for_serialization(entity)


def entity_from_json(json_str, entity_type):
    """
    Hack to take advantage of the code-generated deserialization code
    """
    ac = ApiClient()
    # deserialize() expects a response-like object with a .data attribute;
    # fake one with a namedtuple class
    thing = collections.namedtuple('Thing', ['data'])
    thing.data = json_str
    return ac.deserialize(thing, entity_type)


def release_to_elasticsearch(release):
    """
    Converts from an entity model/schema to an elasticsearch-oriented schema.

    Returns: dict
    Raises an exception on error (never returns None)
    """

    if release.state in ('redirect', 'deleted'):
        return dict(
            ident = release.ident,
            state = release.state,
        )
    elif release.state != 'active':
        raise ValueError("Unhandled release state: {}".format(release.state))

    # First, the easy ones (direct copy)
    t = dict(
        ident = release.ident,
        state = release.state,
        revision = release.revision,
        title = release.title,
        original_title = release.original_title,
        release_type = release.release_type,
        release_status = release.release_status,
        language = release.language,
        license = release.license_slug,
        doi = release.doi,
        pmid = release.pmid,
        pmcid = release.pmcid,
        isbn13 = release.isbn13,
        wikidata_qid = release.wikidata_qid,
        core_id = release.core_id,
        arxiv_id = release.arxiv_id,
        jstor_id = release.jstor_id,
    )

    is_oa = None
    is_longtail_oa = None
    is_preserved = False
    in_kbart = None
    in_web = False
    in_dweb = False
    in_ia = False
    in_shadow = False

    if release.release_date:
        # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
        t['release_date'] = release.release_date.isoformat()
        if release.release_year is None:
            t['release_year'] = release.release_date.year
    if release.release_year is not None:
        t['release_year'] = release.release_year

    t['any_abstract'] = len(release.abstracts or []) > 0
    t['ref_count'] = len(release.refs or [])
    t['contrib_count'] = len(release.contribs or [])
    contrib_names = []
    for c in (release.contribs or []):
        if c.raw_name:
            contrib_names.append(c.raw_name)
    t['contrib_names'] = contrib_names

    container = release.container
    if container:
        t['publisher'] = container.publisher
        t['container_name'] = container.name
        t['container_issnl'] = container.issnl
        t['container_type'] = container.container_type
        if container.extra:
            if container.extra.get('is_oa') or container.extra.get('in_doaj'):
                is_oa = True
            if container.extra.get('in_kbart'):
                # TODO: better KBART check goes here
                in_kbart = True
            if container.extra.get('ia'):
                # TODO: container longtail check goes here
                # TODO: sim/microfilm check goes here
                pass
            # TODO: SHERPA/Romeo goes here
    else:
        t['publisher'] = release.publisher

    files = release.files or []
    t['file_count'] = len(files)
    t['fileset_count'] = len(release.filesets or [])
    t['webcapture_count'] = len(release.webcaptures or [])
    any_pdf_url = None
    good_pdf_url = None
    best_pdf_url = None
    ia_pdf_url = None
    for f in files:
        if f.extra and f.extra.get('shadows'):
            # TODO: shadow check goes here
            in_shadow = True
        is_pdf = 'pdf' in (f.mimetype or '')
        for url in (f.urls or []):
            if url.url.lower().startswith('http'):
                in_web = True
            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
                # TODO: not sure what rel will be
                in_dweb = True
            if is_pdf:
                any_pdf_url = url.url
            if is_pdf and url.rel in ('webarchive', 'repository'):
                is_preserved = True
                good_pdf_url = url.url
            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
                in_ia = True
                if is_pdf:
                    best_pdf_url = url.url
                    ia_pdf_url = url.url
    # here is where we bake-in priority; IA-specific
    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
    t['ia_pdf_url'] = ia_pdf_url

    if release.license_slug:
        # TODO: more/better checks here, particularly strict *not* OA licenses
        if release.license_slug.startswith("CC-"):
            is_oa = True

    extra = release.extra or dict()
    if extra:
        # TODO: longtail OA check from GROBID here
        if extra.get('in_kbart'):
            # NOTE: not actually setting this anywhere
            in_kbart = True
        if extra.get('is_oa'):
            # NOTE: not actually setting this anywhere
            is_oa = True
        if extra.get('grobid'):
            if not t.get('container_name'):
                t['container_name'] = extra['grobid'].get('container_name')
            if extra['grobid'].get('longtail_oa'):
                is_longtail_oa = True
        if extra.get('crossref'):
            if extra['crossref'].get('archive'):
                # all crossref archives are KBART, I believe
                in_kbart = True

    if is_longtail_oa:
        is_oa = True
    t['is_oa'] = is_oa
    t['is_longtail_oa'] = is_longtail_oa
    t['in_kbart'] = in_kbart
    t['in_web'] = in_web
    t['in_dweb'] = in_dweb
    t['in_ia'] = in_ia
    t['is_preserved'] = is_preserved or in_ia or in_kbart
    return t
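

# Minimal usage sketch (hypothetical wiring, not part of the module proper):
# read a single JSON-serialized release entity from stdin, round-trip it
# through the code-generated deserializer, and print the elasticsearch-oriented
# document. Assumes stdin carries exactly one release entity JSON object.
if __name__ == '__main__':
    import json
    import sys

    release = entity_from_json(sys.stdin.read(), ReleaseEntity)
    print(json.dumps(release_to_elasticsearch(release)))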