diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-02-26 22:04:35 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-02-26 22:05:33 -0800 |
commit | 81e0784813500a39955c20278140e25d7940d9c6 (patch) | |
tree | a48b51d85bfa27441cf0de2e8689c43cd4e3d048 | |
parent | 0ab3f66664fd4cc63cf9040e351d725c6a5c22b9 (diff) | |
download | fatcat-81e0784813500a39955c20278140e25d7940d9c6.tar.gz fatcat-81e0784813500a39955c20278140e25d7940d9c6.zip |
improve is_oa flag accuracy
Particularly, the ezb=green match seems mostly incorrect.
Note that a release with a pmcid assigned might still be within an embargo window.
-rw-r--r-- | proposals/2020_elasticsearch_schemas.md | 4 | ||||
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 12 |
2 files changed, 6 insertions, 10 deletions
diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md index 5fb28d19..c3e79073 100644 --- a/proposals/2020_elasticsearch_schemas.md +++ b/proposals/2020_elasticsearch_schemas.md @@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which is: - `bright` -- `dark_only` -- `shadow_only` +- `dark` +- `shadows_only` - `none` Note that these don't align with OA color or work-level preservation (aka, no diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 8581febd..87e054ec 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -149,9 +149,6 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('road'): if c_extra['road'].get('as_of'): is_oa = True - if c_extra.get('ezb'): - if c_extra['ezb'].get('color') == 'green': - is_oa = True if c_extra.get('szczepanski'): if c_extra['szczepanski'].get('as_of'): is_oa = True @@ -210,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True): # TODO: more/better checks here, particularly strict *not* OA licenses if release.license_slug.startswith("CC-"): is_oa = True + if release.license_slug.startswith("ARXIV-"): + is_oa = True extra = release.extra or dict() if extra: @@ -293,10 +292,10 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_ia'] = bool(in_ia) t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) - if in_ia: + if in_ia or t.get('pmcid') or t.get('arxiv_id'): t['preservation'] = 'bright' elif in_kbart or in_jstor: - t['preservation'] = 'dark_only' + t['preservation'] = 'dark' elif in_shadows: t['preservation'] = 'shadows_only' else: @@ -367,9 +366,6 @@ def container_to_elasticsearch(entity, force_bool=True): if extra.get('road'): if extra['road'].get('as_of'): in_road = True - if extra.get('ezb'): - if extra['ezb'].get('color') == 'green': - is_oa = True if 
extra.get('szczepanski'): if extra['szczepanski'].get('as_of'): is_oa = True |