summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-02-26 22:04:35 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-02-26 22:05:33 -0800
commit 81e0784813500a39955c20278140e25d7940d9c6 (patch)
tree a48b51d85bfa27441cf0de2e8689c43cd4e3d048
parent 0ab3f66664fd4cc63cf9040e351d725c6a5c22b9 (diff)
download fatcat-81e0784813500a39955c20278140e25d7940d9c6.tar.gz
fatcat-81e0784813500a39955c20278140e25d7940d9c6.zip
improve is_oa flag accuracy
In particular, the ezb=green match seems to be mostly incorrect. Note that a release with a pmcid assigned might still be within an embargo window.
-rw-r--r-- proposals/2020_elasticsearch_schemas.md 4
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py 12
2 files changed, 6 insertions, 10 deletions
diff --git a/proposals/2020_elasticsearch_schemas.md b/proposals/2020_elasticsearch_schemas.md
index 5fb28d19..c3e79073 100644
--- a/proposals/2020_elasticsearch_schemas.md
+++ b/proposals/2020_elasticsearch_schemas.md
@@ -33,8 +33,8 @@ status (from `in_kbart`, `in_ia`, etc) to a `preservation_status` flag which
is:
- `bright`
-- `dark_only`
-- `shadow_only`
+- `dark`
+- `shadows_only`
- `none`
Note that these don't align with OA color or work-level preservation (aka, no
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 8581febd..87e054ec 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -149,9 +149,6 @@ def release_to_elasticsearch(entity, force_bool=True):
if c_extra.get('road'):
if c_extra['road'].get('as_of'):
is_oa = True
- if c_extra.get('ezb'):
- if c_extra['ezb'].get('color') == 'green':
- is_oa = True
if c_extra.get('szczepanski'):
if c_extra['szczepanski'].get('as_of'):
is_oa = True
@@ -210,6 +207,8 @@ def release_to_elasticsearch(entity, force_bool=True):
# TODO: more/better checks here, particularly strict *not* OA licenses
if release.license_slug.startswith("CC-"):
is_oa = True
+ if release.license_slug.startswith("ARXIV-"):
+ is_oa = True
extra = release.extra or dict()
if extra:
@@ -293,10 +292,10 @@ def release_to_elasticsearch(entity, force_bool=True):
t['in_ia'] = bool(in_ia)
t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
- if in_ia:
+ if in_ia or t.get('pmcid') or t.get('arxiv_id'):
t['preservation'] = 'bright'
elif in_kbart or in_jstor:
- t['preservation'] = 'dark_only'
+ t['preservation'] = 'dark'
elif in_shadows:
t['preservation'] = 'shadows_only'
else:
@@ -367,9 +366,6 @@ def container_to_elasticsearch(entity, force_bool=True):
if extra.get('road'):
if extra['road'].get('as_of'):
in_road = True
- if extra.get('ezb'):
- if extra['ezb'].get('color') == 'green':
- is_oa = True
if extra.get('szczepanski'):
if extra['szczepanski'].get('as_of'):
is_oa = True