diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-26 17:25:09 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:32 +0100 | 
| commit | 097fa7660c60e6c52ac2adbdd82fe64c122b1e42 (patch) | |
| tree | 586ddf2fe3ef027cb9adca6122454baf79c5d42e | |
| parent | a57919b05d8b1f24041713e85b7fa4322c0591c6 (diff) | |
| download | fatcat-097fa7660c60e6c52ac2adbdd82fe64c122b1e42.tar.gz fatcat-097fa7660c60e6c52ac2adbdd82fe64c122b1e42.zip | |
datacite: limit abstract length
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 6 | 
1 file changed, 6 insertions, 0 deletions
```diff
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 26520164..66f812e2 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -16,6 +16,10 @@ import sqlite3
 import sys
 from fatcat_tools.transforms import entity_to_dict
 
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
     'Journal': 'journal',
@@ -450,6 +454,8 @@ class DataciteImporter(EntityImporter):
             if len(desc.get('description', '')) < 10:
                 continue
             text = desc.get('description')
+            if len(text) > MAX_ABSTRACT_LENGTH:
+                text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
             lang = None
             if self.lang_detect:
                 try:
```
