From 6a7591103c7b7d985ad22199138af9378de697f4 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 9 Jan 2020 15:36:09 +0100 Subject: datacite: abstracts may be strings or list of strings --- python/fatcat_tools/importers/datacite.py | 17 +++++- python/tests/files/datacite/datacite_doc_27.json | 60 ++++++++++++++++++++++ python/tests/files/datacite/datacite_doc_28.json | 60 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_27.json | 33 ++++++++++++ .../tests/files/datacite/datacite_result_28.json | 33 ++++++++++++ python/tests/import_datacite.py | 2 +- 6 files changed, 202 insertions(+), 3 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_27.json create mode 100644 python/tests/files/datacite/datacite_doc_28.json create mode 100644 python/tests/files/datacite/datacite_result_27.json create mode 100644 python/tests/files/datacite/datacite_result_28.json diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f1e17d1..4128b3ca 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -468,11 +468,24 @@ class DataciteImporter(EntityImporter): for desc in descs: if not desc.get('descriptionType') == 'Abstract': continue - if len(desc.get('description', '') or '') < 10: - continue + + # Description may be a string or list. text = desc.get('description', '') + if not text: + continue + if isinstance(text, list): + try: + text = "\n".join(text) + except TypeError as err: + continue # Bail out if it is not a list of strings. + + # Limit length. + if len(text) < 10: + continue if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + + # Detect language. 
lang = None try: lang = langdetect.detect(text) diff --git a/python/tests/files/datacite/datacite_doc_27.json b/python/tests/files/datacite/datacite_doc_27.json new file mode 100644 index 00000000..ff9c00f4 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_27.json @@ -0,0 +1,60 @@ +{ + "attributes": { + "contributors": [ + { + "affiliation": [], + "contributorType": "Editor", + "familyName": "Wemmer", + "givenName": "David", + "name": "Wemmer, David", + "nameType": "Personal" + } + ], + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [ + {"description": "Hello World", "descriptionType": "Abstract"} + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_28.json b/python/tests/files/datacite/datacite_doc_28.json new file mode 100644 index 00000000..0d03ecc8 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_28.json @@ -0,0 +1,60 @@ +{ + "attributes": { + "contributors": [ + { + "affiliation": [], + "contributorType": "Editor", + "familyName": "Wemmer", + "givenName": "David", + "name": "Wemmer, David", + "nameType": "Personal" + } + ], + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + 
"dateType": "Issued" + } + ], + "descriptions": [ + {"description": ["Hello", "World"], "descriptionType": "Abstract"} + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json new file mode 100644 index 00000000..3d033e6a --- /dev/null +++ b/python/tests/files/datacite/datacite_result_27.json @@ -0,0 +1,33 @@ +{ + "abstracts": [{"content": "Hello World", "lang": "en", "mimetype": "text/plain"}], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + }, + { + "extra": { + "type": "Editor" + }, + "given_name": "David", + "raw_name": "David Wemmer", + "surname": "Wemmer" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "stub", + "release_year": 2017, + "subtitle": "DEF", + "title": "Additional file 123: ABC" +} diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json new file mode 100644 index 00000000..84bed9c8 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_28.json @@ -0,0 +1,33 @@ +{ + "abstracts": [{"content": "Hello\nWorld", "lang": "en", "mimetype": "text/plain"}], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + }, + { + "extra": { + "type": "Editor" + }, + "given_name": "David", + "raw_name": "David Wemmer", + "surname": "Wemmer" + } + ], + "ext_ids": 
{ + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "stub", + "release_year": 2017, + "subtitle": "DEF", + "title": "Additional file 123: ABC" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 89671cd2..a7d514ea 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(27): + for i in range(29): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) print('testing mapping from {} => {}'.format(src, dst)) -- cgit v1.2.3