From d5d83762063b8ec7f512c20567f46c03f2e6b542 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 29 Jan 2020 21:57:32 -0800 Subject: update ES docs and proposal --- proposals/2020_elasticsearch_schemas.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'proposals') diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 83db884f..5fb28d19 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -14,8 +14,6 @@ Simple additions: - pages - `first_page` (parsed from pages) (?) - number -- `in_shadow` -- OA license slug (?) - `doi_prefix` - `doi_registrar` (based on extra) - `first_author` (surname; for matching) @@ -25,6 +23,8 @@ Simple additions: - referenced releases idents - contrib creator idents +Add affiliations, both as raw strings and ROR identifiers. + ## Preservation Summary Field @@ -128,8 +128,8 @@ hit does not}"). ## Container Fields -- `all_issns` -- `release_count` +- `issn` (all issns) +- `original_name` The `release_count` would not be indexed (left null) by default, and would be "patched" in to entities by a separate script (periodically?). -- cgit v1.2.3 From 81e0784813500a39955c20278140e25d7940d9c6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 22:04:35 -0800 Subject: improve is_oa flag accuracy Particularly, the ezb=green match seems mostly incorrect. Note that pmcid being assigned could still be in an embargo window? --- proposals/2020_elasticsearch_schemas.md | 4 ++-- python/fatcat_tools/transforms/elasticsearch.py | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'proposals') diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 5fb28d19..c3e79073 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which is: - `bright` -- `dark_only` -- `shadow_only` +- `dark` +- `shadows_only` - `none` Note that these don't align with OA color or work-level preservation (aka, no diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 8581febd..87e054ec 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -149,9 +149,6 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('road'): if c_extra['road'].get('as_of'): is_oa = True - if c_extra.get('ezb'): - if c_extra['ezb'].get('color') == 'green': - is_oa = True if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True @@ -210,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True): # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): is_oa = True + if release.license_slug.startswith("ARXIV-"): + is_oa = True extra = release.extra or dict() if extra: @@ -293,10 +292,10 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) - if in_ia: + if in_ia or t.get('pmcid') or t.get('arxiv_id'): t['preservation'] = 'bright' elif in_kbart or in_jstor: - t['preservation'] = 'dark_only' + t['preservation'] = 'dark' elif in_shadows: t['preservation'] = 'shadows_only' else: @@ -367,9 +366,6 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('road'): if extra['road'].get('as_of'): in_road = True - if extra.get('ezb'): - if extra['ezb'].get('color') == 'green': - is_oa = True if extra.get('szczepanski'): if extra['szczepanski'].get('as_of'): is_oa = True -- cgit v1.2.3