From 097fa7660c60e6c52ac2adbdd82fe64c122b1e42 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 26 Dec 2019 17:25:09 +0100
Subject: datacite: limit abstract length

---
 python/fatcat_tools/importers/datacite.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'python/fatcat_tools/importers')

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 26520164..66f812e2 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -16,6 +16,10 @@ import sqlite3
 import sys
 from fatcat_tools.transforms import entity_to_dict
 
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
     'Journal': 'journal',
@@ -450,6 +454,8 @@ class DataciteImporter(EntityImporter):
             if len(desc.get('description', '')) < 10:
                 continue
             text = desc.get('description')
+            if len(text) > MAX_ABSTRACT_LENGTH:
+                text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
             lang = None
             if self.lang_detect:
                 try:
-- 
cgit v1.2.3