From 642501efc92e31ac438fe0f70820415c825a3802 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 23 Jan 2019 21:07:01 -0800
Subject: start changes to release ES schema

---
 python/fatcat_tools/transforms.py | 120 ++++++++++++++++++++++++++++----------
 1 file changed, 90 insertions(+), 30 deletions(-)

(limited to 'python/fatcat_tools')

diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index b1fd7e68..2493b1ab 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -1,4 +1,5 @@
+
 import collections
 
 from fatcat_client import ReleaseEntity, ApiClient
 
@@ -40,17 +41,29 @@ def release_to_elasticsearch(release):
         state = release.state,
         revision = release.revision,
         title = release.title,
+        original_title = release.original_title,
         release_type = release.release_type,
         release_status = release.release_status,
         language = release.language,
+        license = release.license_slug,
         doi = release.doi,
         pmid = release.pmid,
         pmcid = release.pmcid,
         isbn13 = release.isbn13,
+        wikidata_qid = release.wikidata_qid,
         core_id = release.core_id,
-        wikidata_qid = release.wikidata_qid
+        arxiv_id = release.arxiv_id,
+        jstor_id = release.jstor_id,
     )
 
+    is_oa = None
+    is_longtail_oa = None
+    in_kbart = None
+    in_web = False
+    in_dweb = False
+    in_ia = False
+    in_shadow = False
+
     if release.release_date:
         # .isoformat() results in, eg, '2010-10-22' (YYYY-MM-DD)
         t['release_date'] = release.release_date.isoformat()
@@ -59,52 +72,99 @@ def release_to_elasticsearch(release):
     if release.release_year is not None:
         t['release_year'] = release.release_year
 
+    t['any_abstract'] = len(release.abstracts) > 0
+    t['ref_count'] = len(release.refs or [])
+    t['contrib_count'] = len(release.contribs or [])
+    contrib_names = []
+    for c in (release.contribs or []):
+        if c.raw_name:
+            contrib_names.append(c.raw_name)
+    t['contrib_names'] = contrib_names
+
     container = release.container
-    container_is_kept = False
     if container:
         t['publisher'] = container.publisher
         t['container_name'] = container.name
         t['container_issnl'] = container.issnl
-        container_extra = container.extra
-        if container_extra:
-            t['container_is_oa'] = container_extra.get('is_oa')
-            container_is_kept = container_extra.get('is_kept', False)
-            t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa')
+        t['container_type'] = container.container_type
+        if container.extra:
+            if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+                is_oa = True
+            if container.extra.get('in_kbart'):
+                # TODO: better KBART check goes here
+                in_kbart = True
+            if container.extra.get('ia'):
+                # TODO: container longtail check goes here
+                # TODO: sim/microfilm check goes here
+                pass
+            # TODO: SHERPA/Romeo goes here
     else:
         t['publisher'] = release.publisher
 
     files = release.files or []
     t['file_count'] = len(files)
-    in_wa = False
-    in_ia = False
-    t['file_pdf_url'] = None
+    t['fileset_count'] = len(release.filesets or [])
+    t['webcapture_count'] = len(release.webcaptures or [])
+    any_pdf_url = None
+    good_pdf_url = None
+    best_pdf_url = None
+    ia_pdf_url = None
     for f in files:
+        if f.extra and f.extra.get('shadows'):
+            # TODO: shadow check goes here
+            in_shadow = True
         is_pdf = 'pdf' in (f.mimetype or '')
         for url in (f.urls or []):
-            if url.rel == 'webarchive':
-                in_wa = True
-            if '//web.archive.org/' in (url.url or '') or '//archive.org/' in (url.url or ''):
+            if url.url.lower().startswith('http'):
+                in_web = True
+            if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
+                # TODO: not sure what rel will be
+                in_dweb = True
+            if is_pdf:
+                any_pdf_url = url.url
+            if is_pdf and url.rel in ('webarchive', 'repository'):
+                is_preserved = True
+                good_pdf_url = url.url
+            if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
                 in_ia = True
                 if is_pdf:
-                    t['file_pdf_url'] = url.url
-        if not t['file_pdf_url'] and is_pdf:
-            t['file_pdf_url'] = url.url
-    t['file_in_webarchive'] = in_wa
-    t['file_in_ia'] = in_ia
+                    best_pdf_url = url.url
+                    ia_pdf_url = url.url
+    # here is where we bake-in priority; IA-specific
+    t['best_pdf_url'] = best_pdf_url or good_pdf_url or any_pdf_url
+    t['ia_pdf_url'] = ia_pdf_url
+
+    if release.license_slug:
+        # TODO: more/better checks here, particularly strict *not* OA licenses
+        if release.license_slug.startswith("CC-"):
+            is_oa = True
 
     extra = release.extra or dict()
     if extra:
-        t['in_shadow'] = extra.get('in_shadow')
-        if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'):
-            t['container_is_longtail_oa'] = True
-    t['any_abstract'] = len(release.abstracts) > 0
-    t['is_kept'] = container_is_kept or extra.get('is_kept', False)
+        # TODO: longtail OA check from GROBID here
+        if extra.get('in_kbart'):
+            # NOTE: not actually setting this anywhere
+            in_kbart = True
+        if extra.get('is_oa'):
+            # NOTE: not actually setting this anywhere
+            is_oa = True
+        if extra.get('grobid'):
+            if not t.get('container_name'):
+                t['container_name'] = extra['grobid'].get('container_name')
+            if extra['grobid'].get('longtail_oa'):
+                is_longtail_oa = True
+        if extra.get('crossref'):
+            if extra['crossref'].get('archive'):
+                # all crossref archives are KBART, I believe
+                in_kbart = True
 
-    t['ref_count'] = len(release.refs or [])
-    t['contrib_count'] = len(release.contribs or [])
-    contrib_names = []
-    for c in (release.contribs or []):
-        if c.raw_name:
-            contrib_names.append(c.raw_name)
-    t['contrib_names'] = contrib_names
+    if is_longtail_oa:
+        is_oa = True
+    t['is_oa'] = is_oa
+    t['is_longtail_oa'] = is_longtail_oa
+    t['in_kbart'] = in_kbart
+    t['in_web'] = in_web
+    t['in_dweb'] = in_dweb
+    t['in_ia'] = in_ia
+    t['is_preserved'] = in_ia or in_kbart
     return t
-- 
cgit v1.2.3
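
For reference, a minimal usage sketch of the transform this patch modifies. The input file name is hypothetical, and the `entity_from_json` helper's signature is an assumption based on this module's existing deserialization code; treat this as an illustration, not part of the commit:

    # Illustrative sketch (not part of this commit): deserialize a release
    # entity from JSON, then transform it into the flat Elasticsearch
    # document shape that release_to_elasticsearch() builds above.
    import json

    from fatcat_client import ReleaseEntity
    from fatcat_tools.transforms import entity_from_json, release_to_elasticsearch

    # 'release_entity.json' is a hypothetical file holding one release entity
    with open('release_entity.json', 'r') as f:
        release = entity_from_json(f.read(), ReleaseEntity)

    es_doc = release_to_elasticsearch(release)
    # es_doc is a plain dict with flags like is_oa, in_ia, best_pdf_url,
    # ready to serialize and index
    print(json.dumps(es_doc))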