diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-01-09 15:36:09 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-01-09 15:36:09 +0100 |
commit | 6a7591103c7b7d985ad22199138af9378de697f4 (patch) | |
tree | 8834991b7d70676972f7bc4f3e347d3051a2e82d /python | |
parent | ffd2597d5e962e3f3a2ea23c66a135bb737b2390 (diff) | |
download | fatcat-6a7591103c7b7d985ad22199138af9378de697f4.tar.gz fatcat-6a7591103c7b7d985ad22199138af9378de697f4.zip |
datacite: abstracts may be strings or lists of strings
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 17 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_doc_27.json | 60 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_doc_28.json | 60 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_27.json | 33 | ||||
-rw-r--r-- | python/tests/files/datacite/datacite_result_28.json | 33 | ||||
-rw-r--r-- | python/tests/import_datacite.py | 2 |
6 files changed, 202 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 2f1e17d1..4128b3ca 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -468,11 +468,24 @@ class DataciteImporter(EntityImporter): for desc in descs: if not desc.get('descriptionType') == 'Abstract': continue - if len(desc.get('description', '') or '') < 10: - continue + + # Description maybe a string or list. text = desc.get('description', '') + if not text: + continue + if isinstance(text, list): + try: + text = "\n".join(text) + except TypeError as err: + continue # Bail out, if it is not a list of strings. + + # Limit length. + if len(text) < 10: + continue if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + + # Detect language. lang = None try: lang = langdetect.detect(text) diff --git a/python/tests/files/datacite/datacite_doc_27.json b/python/tests/files/datacite/datacite_doc_27.json new file mode 100644 index 00000000..ff9c00f4 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_27.json @@ -0,0 +1,60 @@ +{ + "attributes": { + "contributors": [ + { + "affiliation": [], + "contributorType": "Editor", + "familyName": "Wemmer", + "givenName": "David", + "name": "Wemmer, David", + "nameType": "Personal" + } + ], + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [ + {"description": "Hello World", "descriptionType": "Abstract"} + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": 
"misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_doc_28.json b/python/tests/files/datacite/datacite_doc_28.json new file mode 100644 index 00000000..0d03ecc8 --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_28.json @@ -0,0 +1,60 @@ +{ + "attributes": { + "contributors": [ + { + "affiliation": [], + "contributorType": "Editor", + "familyName": "Wemmer", + "givenName": "David", + "name": "Wemmer, David", + "nameType": "Personal" + } + ], + "creators": [ + { + "affiliation": [ + "Department of pataphysics" + ], + "name": "Anton Welch", + "nameIdentifiers": [] + } + ], + "dates": [ + { + "date": "2017-08-24", + "dateType": "Created" + }, + { + "date": "2019-08-04", + "dateType": "Updated" + }, + { + "date": "2017", + "dateType": "Issued" + } + ], + "descriptions": [ + {"description": ["Hello", "World"], "descriptionType": "Abstract"} + ], + "doi": "10.7916/d86x0cg1", + "isActive": true, + "language": "DE-CH", + "publicationYear": 2016, + "state": "findable", + "titles": [ + { + "title": "Additional file 123: ABC" + }, + { + "title": "DEF", + "titleType": "Subtitle" + } + ], + "types": { + "bibtex": "misc", + "citeproc": "article", + "ris": "GEN", + "schemaOrg": "CreativeWork" + } + } +} diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json new file mode 100644 index 00000000..3d033e6a --- /dev/null +++ b/python/tests/files/datacite/datacite_result_27.json @@ -0,0 +1,33 @@ +{ + "abstracts": [{"content": "Hello World", "lang": "en", "mimetype": "text/plain"}], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + }, + { + "extra": { + "type": "Editor" + }, + "given_name": "David", + "raw_name": "David Wemmer", + "surname": "Wemmer" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + 
"release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "stub", + "release_year": 2017, + "subtitle": "DEF", + "title": "Additional file 123: ABC" +} diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json new file mode 100644 index 00000000..84bed9c8 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_28.json @@ -0,0 +1,33 @@ +{ + "abstracts": [{"content": "Hello\nWorld", "lang": "en", "mimetype": "text/plain"}], + "contribs": [ + { + "index": 0, + "raw_affiliation": "Department of pataphysics", + "raw_name": "Anton Welch", + "role": "author" + }, + { + "extra": { + "type": "Editor" + }, + "given_name": "David", + "raw_name": "David Wemmer", + "surname": "Wemmer" + } + ], + "ext_ids": { + "doi": "10.7916/d86x0cg1" + }, + "extra": { + "datacite": {}, + "release_month": 8 + }, + "refs": [], + "release_date": "2017-08-24", + "release_stage": "published", + "release_type": "stub", + "release_year": 2017, + "subtitle": "DEF", + "title": "Additional file 123: ABC" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 89671cd2..a7d514ea 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(27): + for i in range(29): src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i) dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i) print('testing mapping from {} => {}'.format(src, dst)) |