diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-29 20:39:22 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-29 21:59:05 -0800 |
commit | e047fbe1a9c495e86a6757d44eb32c9109a1b753 (patch) | |
tree | d4e1e256248993ea6897dc40055d2a7242ca6526 | |
parent | 8e8b447a1d142b7815498ffa02263c34207973b4 (diff) | |
download | fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.tar.gz fatcat-e047fbe1a9c495e86a6757d44eb32c9109a1b753.zip |
ES release schema updates
-rw-r--r-- | extra/elasticsearch/release_schema.json | 69 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 81 |
2 files changed, 122 insertions, 28 deletions
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 85026060..98a1c28e 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -27,48 +27,62 @@ "mappings": { "release": { "properties": { - "ident": { "type": "keyword" }, + "ident": { "type": "keyword", "doc_values": false }, "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "work_id": { "type": "keyword" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "revision": { "type": "keyword", "doc_values": false }, + "work_id": { "type": "keyword", "doc_values": false }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, - "release_year": { "type": "integer" }, - "release_type": { "type": "keyword" }, + "release_year": { "type": "integer", "copy_to": "biblio" }, + "release_type": { "type": "keyword", "copy_to": "biblio" }, "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword" }, + "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": "keyword" }, - "isbn13": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "axiv_id": { "type": "keyword" }, - "jstor_id": { "type": "keyword" }, - "ark_id": { "type": "keyword" }, - "mag_id": { "type": "keyword" }, + "volume": { "type": "keyword", "copy_to": "biblio" }, + "issue": { "type": "keyword", "copy_to": "biblio" }, + "pages": { "type": "keyword", "copy_to": "biblio" }, + "first_page": { "type": "keyword" }, + "number": { "type": "keyword", "copy_to": "biblio" }, + "doi": { "type": "keyword", "doc_values": false }, + "doi_prefix": { "type": "keyword" }, + "doi_registrar": { "type": "keyword" }, + "pmid": { "type": "keyword", "doc_values": false }, + "pmcid": { "type": "keyword", "doc_values": false }, + "isbn13": { "type": "keyword", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "doc_values": false }, + "core_id": { "type": "keyword", "doc_values": false }, + "axiv_id": { "type": "keyword", "doc_values": false }, + "jstor_id": { "type": "keyword", "doc_values": false }, + "ark_id": { "type": "keyword", "doc_values": false }, + "mag_id": { "type": "keyword", "doc_values": false }, "license": { "type": "keyword" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "container_id": { "type": "keyword" }, "container_issnl": { "type": "keyword" }, "container_type": { "type": "keyword" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "affiliation_rors": { "type": "keyword" }, "creator_ids": { "type": "keyword" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, + "ref_release_ids": { "type": "keyword" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, "any_abstract": { "type": "boolean" }, - "best_pdf_url": { "type": "keyword" }, - "ia_pdf_url": { "type": "keyword" }, + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "best_pdf_url": { "type": "keyword", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "doc_values": false }, "is_oa": { "type": "boolean" }, + "oa_color": { "type": "keyword" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -79,7 +93,13 @@ "in_ia_sim": { "type": "boolean" }, "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, + "is_retracted": { "type": "boolean" }, + "preservation": { "type": "keyword" }, + "affilation": { "type": "alias", "path": "affiliations" }, + "ror": { "type": "alias", "path": "affiliation_rors" }, + "creator_id": { "type": "alias", "path": "creator_id" }, + "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, "author": { "type": "alias", "path": "contrib_names" }, "journal": { "type": "alias", "path": "container_name" }, "date": { "type": "alias", "path": "release_date" }, @@ -90,6 +110,9 @@ "lang": { "type": "alias", "path": "language" }, "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, "release_status": { "type": "alias", "path": "release_stage" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "retracted": { "type": "alias", "path": "is_retracted" }, "is_kept": { "type": "alias", "path": "in_kbart" } } } diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index edc68748..b997796d 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -50,6 +50,10 @@ def release_to_elasticsearch(entity, force_bool=True): release_stage = release.release_stage, withdrawn_status = release.withdrawn_status, language = release.language, + volume = release.volume, + issue = release.issue, + pages = release.pages, + number = release.number, license = release.license_slug, doi = release.ext_ids.doi, pmid = release.ext_ids.pmid, @@ -72,7 +76,7 @@ def release_to_elasticsearch(entity, force_bool=True): in_dweb = False in_ia = False in_ia_sim = False - in_shadow = False + in_shadows = False release_year = release.release_year if release.release_date: @@ -85,11 +89,15 @@ def release_to_elasticsearch(entity, force_bool=True): t['any_abstract'] = len(release.abstracts or []) > 0 t['ref_count'] = len(release.refs or []) - t['ref_linked_count'] = 0 - if release.refs: - t['ref_linked_count'] = len([1 for ref in release.refs if ref.target_release_id]) + ref_release_ids = [] + for r in (release.refs or []): + if r.target_release_id: + ref_release_ids.append(r.target_release_id) + t['ref_release_ids'] = ref_release_ids + t['ref_linked_count'] = len(ref_release_ids) t['contrib_count'] = len(release.contribs or []) contrib_names = [] + contrib_affiliations = [] creator_ids = [] for c in (release.contribs or []): if c.raw_name: @@ -98,8 +106,14 @@ def release_to_elasticsearch(entity, force_bool=True): contrib_names.append(c.surname) if c.creator_id: creator_ids.append(c.creator_id) + if c.raw_affiliation: + contrib_affiliations.append(c.raw_affiliation) t['contrib_names'] = contrib_names t['creator_ids'] = creator_ids + t['affiliations'] = contrib_affiliations + + # TODO: mapping... probably by lookup? + t['affiliation_rors'] = None container = release.container if container: @@ -140,8 +154,13 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True - else: + + # fall back to release-level container metadata if container not linked or + # missing context + if not t.get('publisher'): t['publisher'] = release.publisher + if not t.get('container_name') and release.extra: + t['container_name'] = release.extra.get('container_name') if release.ext_ids.jstor or (release.ext_ids.doi and release.ext_ids.doi.startswith('10.2307/')): in_jstor = True @@ -203,6 +222,46 @@ def release_to_elasticsearch(entity, force_bool=True): if extra['crossref'].get('archive'): # all crossref archives are KBART, I believe in_kbart = True + # backwards compatible subtitle fetching + if not t['subtitle'] and extra.get('subtitle'): + if type(extra['subtitle']) == list: + t['subtitle'] = extra['subtitle'][0] + else: + t['subtitle'] = extra['subtitle'] + + t['first_page'] = None + if release.pages: + first = release.pages.split('-')[0] + first = first.replace('p', '') + if release.pages.isdigit(): + t['first_page'] = release.pages + # TODO: non-numerical first pages + + t['ia_microfilm_url'] = None + if in_ia_sim: + # TODO: determine URL somehow? I think this is in flux. Will probably + # need extra metadata in the container extra field. + # special case as a demo for now. + if release.container_id == "hl5g6d5msjcl7hlbyyvcsbhc2u" \ + and release.year in (2011, 2013) \ + and release.volume.isdigit() \ + and t['first_page']: + t['ia_microfilm_url'] = "https://archive.org/details/sim_bjog_{}-{:02d}/page/n{}".format( + release.year, + release.volume - 1, + t['first_page'], + ) + + t['doi_registrar'] = None + if extra and t['doi']: + for k in ('crossref', 'datacite', 'jalc'): + if k in extra: + t['doi_registrar'] = k + if not 'doi_registrar' in t: + t['doi_registrar'] = 'crossref' + + if t['doi']: + t['doi_prefix'] = t['doi'].split('/')[0] if is_longtail_oa: is_oa = True @@ -215,6 +274,7 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_jstor'] = bool(in_jstor) t['in_web'] = bool(in_web) t['in_dweb'] = bool(in_dweb) + t['in_shadows'] = bool(in_shadows) else: t['is_oa'] = is_oa t['is_longtail_oa'] = is_longtail_oa @@ -223,9 +283,20 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_jstor'] = in_jstor t['in_web'] = in_web t['in_dweb'] = in_dweb + t['in_shadows'] = in_shadows t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) + + if in_ia: + t['preservation'] = 'bright' + elif in_kbart or in_jstor: + t['preservation'] = 'dark_only' + elif in_shadows: + t['preservation'] = 'shadows_only' + else: + t['preservation'] = 'none' + return t def container_to_elasticsearch(entity, force_bool=True): |