summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-07-23 14:47:56 -0700
committerBryan Newbold <bnewbold@robocracy.org>2020-07-23 14:48:00 -0700
commita05ecb7959bc57b8f1c3607e1c941e8e25d9a87b (patch)
tree1ea23eddff08b4e0e098a6019a1c118ed240ac3f /python/fatcat_tools/transforms
parent6a87b3d2e1b315d35ccaa13457571d73afaf5e6b (diff)
downloadfatcat-a05ecb7959bc57b8f1c3607e1c941e8e25d9a87b.tar.gz
fatcat-a05ecb7959bc57b8f1c3607e1c941e8e25d9a87b.zip
make in_kbart transform inclusive of last year
Frequently when looking at preservation coverage of journals, the current year shows as "un-preserved" when in fact there is robust KBART (keepers, eg CLOCKSS/Portico) coverage. This is partially because we don't update containers with KBART year spans very frequently (which is on us), and partially because KBART reports are often a bit out of date (eg, they don't show coverage for the current year). For that matter, they probably take a few months to update the previous year as well, but that is a larger time span to fudge over. With this patch, Portico/LOCKSS/etc coverage for "last year" also counts as coverage of publications dated "this year". Note that for this to be effective/correct, it is assumed that we will update containers with coverage year spans at least once a year, and that we will re-index all releases at least once a year.
Diffstat (limited to 'python/fatcat_tools/transforms')
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py9
1 files changed, 9 insertions, 0 deletions
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 8ec9c164..0b04db86 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,4 +1,6 @@
+import datetime
+
import tldextract
@@ -114,6 +116,7 @@ def release_to_elasticsearch(entity, force_bool=True):
# TODO: mapping... probably by lookup?
t['affiliation_rors'] = None
+ this_year = datetime.date.today().year
container = release.container
if container:
t['publisher'] = container.publisher
@@ -130,6 +133,12 @@ def release_to_elasticsearch(entity, force_bool=True):
in_kbart = in_jstor
for archive in ('portico', 'lockss', 'clockss'):
in_kbart = in_kbart or check_kbart(release_year, c_extra['kbart'].get(archive))
+ # KBART coverage reports are often not yet updated for the
+ # current year. So for current-year publications, consider
+ # coverage of *last* year to also count as Keeper
+ # coverage.
+ if not in_kbart and release_year == this_year:
+ in_kbart = in_kbart or check_kbart(this_year - 1, c_extra['kbart'].get(archive))
if c_extra.get('ia'):
if c_extra['ia'].get('sim') and release_year: