summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-08-07 20:05:29 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-08-07 20:05:31 -0700
commit: de0fb59f0e36d8079649feefb7592189d8f7c6ed (patch)
tree: bd2c09a4421ef934810f9bc412cf443a14afd6c8
parent: 4f80b87722d64f27c985f0040ea177269b6e028b (diff)
download: fatcat-de0fb59f0e36d8079649feefb7592189d8f7c6ed.tar.gz
fatcat-de0fb59f0e36d8079649feefb7592189d8f7c6ed.zip
release ES transform tweaks
Pass through publisher_type from container extra metadata (the ES field already existed; the value comes from newer chocula metadata).
Count arXiv and PMCID papers which haven't been crawled (by IA) as "dark", not "bright".
-rw-r--r-- python/fatcat_tools/transforms/elasticsearch.py | 8
1 file changed, 5 insertions, 3 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 2eb18fbf..a618992c 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -162,6 +162,8 @@ def release_to_elasticsearch(entity, force_bool=True):
if c_extra.get('country'):
t['country_code'] = c_extra['country']
t['country_code_upper'] = c_extra['country'].upper()
+ if c_extra.get('publisher_type'):
+ t['publisher_type'] = c_extra['publisher_type']
# fall back to release-level container metadata if container not linked or
# missing context
@@ -297,11 +299,11 @@ def release_to_elasticsearch(entity, force_bool=True):
t['in_shadows'] = in_shadows
t['in_ia'] = bool(in_ia)
- t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+ t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'))
- if in_ia or t.get('pmcid') or t.get('arxiv_id'):
+ if in_ia:
t['preservation'] = 'bright'
- elif in_kbart or in_jstor:
+ elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'):
t['preservation'] = 'dark'
elif in_shadows:
t['preservation'] = 'shadows_only'