From 81e0784813500a39955c20278140e25d7940d9c6 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 26 Feb 2020 22:04:35 -0800 Subject: improve is_oa flag accuracy Particularly, the ezb=green match seems mostly incorrect. Note that pmcid being assigned could still be in an embargo window? --- proposals/2020_elasticsearch_schemas.md | 4 ++-- python/fatcat_tools/transforms/elasticsearch.py | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 5fb28d19..c3e79073 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which is: - `bright` -- `dark_only` -- `shadow_only` +- `dark` +- `shadows_only` - `none` Note that these don't align with OA color or work-level preservation (aka, no diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 8581febd..87e054ec 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -149,9 +149,6 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('road'): if c_extra['road'].get('as_of'): is_oa = True - if c_extra.get('ezb'): - if c_extra['ezb'].get('color') == 'green': - is_oa = True if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True @@ -210,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True): # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): is_oa = True + if release.license_slug.startswith("ARXIV-"): + is_oa = True extra = release.extra or dict() if extra: @@ -293,10 +292,10 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) - if in_ia: + if in_ia or t.get('pmcid') or t.get('arxiv_id'): t['preservation'] = 'bright' elif in_kbart or in_jstor: - t['preservation'] = 'dark_only' + t['preservation'] = 'dark' elif in_shadows: t['preservation'] = 'shadows_only' else: @@ -367,9 +366,6 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('road'): if extra['road'].get('as_of'): in_road = True - if extra.get('ezb'): - if extra['ezb'].get('color') == 'green': - is_oa = True if extra.get('szczepanski'): if extra['szczepanski'].get('as_of'): is_oa = True -- cgit v1.2.3