diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-07 20:05:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-07 20:05:31 -0700 |
commit | de0fb59f0e36d8079649feefb7592189d8f7c6ed (patch) | |
tree | bd2c09a4421ef934810f9bc412cf443a14afd6c8 | |
parent | 4f80b87722d64f27c985f0040ea177269b6e028b (diff) | |
download | fatcat-de0fb59f0e36d8079649feefb7592189d8f7c6ed.tar.gz fatcat-de0fb59f0e36d8079649feefb7592189d8f7c6ed.zip |
release ES transform tweaks
pass through publisher_type from the container's extra metadata (the ES
field already existed; the value comes from newer chocula metadata)
count arXiv and PMCID papers which haven't been crawled (by IA) as
"dark" rather than "bright"
-rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 8 |
1 files changed, 5 insertions, 3 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 2eb18fbf..a618992c 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -162,6 +162,8 @@ def release_to_elasticsearch(entity, force_bool=True): if c_extra.get('country'): t['country_code'] = c_extra['country'] t['country_code_upper'] = c_extra['country'].upper() + if c_extra.get('publisher_type'): + t['publisher_type'] = c_extra['publisher_type'] # fall back to release-level container metadata if container not linked or # missing context @@ -297,11 +299,11 @@ def release_to_elasticsearch(entity, force_bool=True): t['in_shadows'] = in_shadows t['in_ia'] = bool(in_ia) - t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor) + t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id')) - if in_ia or t.get('pmcid') or t.get('arxiv_id'): + if in_ia: t['preservation'] = 'bright' - elif in_kbart or in_jstor: + elif in_kbart or in_jstor or t.get('pmcid') or t.get('arxiv_id'): t['preservation'] = 'dark' elif in_shadows: t['preservation'] = 'shadows_only' |