From bdc4347acbbdb9f58b7c3abc2578a488de3d0a85 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 1 Oct 2021 16:56:59 +0200 Subject: datacite: skip empty abstracts Do not add abstracts where `clean` results in the empty string - this violates a constraint: `either abstract_sha1 or content is required` --- python/fatcat_tools/importers/datacite.py | 5 +- python/tests/files/datacite/datacite_doc_36.json | 65 ++++++++++++++++++++++ .../tests/files/datacite/datacite_result_36.json | 25 +++++++++ python/tests/import_datacite.py | 2 +- 4 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 python/tests/files/datacite/datacite_doc_36.json create mode 100644 python/tests/files/datacite/datacite_result_36.json (limited to 'python') diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 703dbc27..eb49596f 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -549,10 +549,13 @@ class DataciteImporter(EntityImporter): lang = langdetect.detect(text) except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + abstract_text = clean(text) + if not abstract_text: + continue abstracts.append( fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", - content=clean(text), + content=abstract_text, lang=lang, )) diff --git a/python/tests/files/datacite/datacite_doc_36.json b/python/tests/files/datacite/datacite_doc_36.json new file mode 100644 index 00000000..66aba00c --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_36.json @@ -0,0 +1,65 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "lang": "da" + }, + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": " ", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "illustrator" + } + ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_36.json b/python/tests/files/datacite/datacite_result_36.json new file mode 100644 index 00000000..8c958848 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_36.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 8b6797ef..edbb6617 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -400,7 +400,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(36): + for i in range(37): src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i) dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i) with open(src, "r") as f: -- cgit v1.2.3