diff options
author | bnewbold <bnewbold@archive.org> | 2021-10-02 00:55:56 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2021-10-02 00:55:56 +0000 |
commit | 491722e00548888e24fba6ec87d7fefa92e3538b (patch) | |
tree | 1b8c4ba4c23edb299fef488c346b7d2565bb9834 | |
parent | 519c7e77cf3a54b9620adef07fedac9b37a5f9f2 (diff) | |
parent | bdc4347acbbdb9f58b7c3abc2578a488de3d0a85 (diff) | |
download | fatcat-491722e00548888e24fba6ec87d7fefa92e3538b.tar.gz fatcat-491722e00548888e24fba6ec87d7fefa92e3538b.zip |
Merge branch 'martin-datacite-emtpy-abstract-sentry-94639' into 'master'
datacite: skip empty abstracts
See merge request webgroup/fatcat!119
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 5 |
-rw-r--r-- | python/tests/files/datacite/datacite_doc_36.json | 65 |
-rw-r--r-- | python/tests/files/datacite/datacite_result_36.json | 25 |
-rw-r--r-- | python/tests/import_datacite.py | 2 |
4 files changed, 95 insertions, 2 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 703dbc27..eb49596f 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -549,10 +549,13 @@ class DataciteImporter(EntityImporter): lang = langdetect.detect(text) except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + abstract_text = clean(text) + if not abstract_text: + continue abstracts.append( fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", - content=clean(text), + content=abstract_text, lang=lang, )) diff --git a/python/tests/files/datacite/datacite_doc_36.json b/python/tests/files/datacite/datacite_doc_36.json new file mode 100644 index 00000000..66aba00c --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_36.json @@ -0,0 +1,65 @@ +{ + "id": "10.17912/micropub.biology.000143", + "type": "dois", + "attributes": { + "doi": "10.17912/micropub.biology.000143", + "identifiers": null, + "creators": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": [], + "role": "author" + } + ], + "titles": [ + { + "lang": "da" + }, + { + "title": "Sample" + } + ], + "publisher": "microPublication Biology", + "publicationYear": 2019, + "types": { + "resourceTypeGeneral": "DataPaper" + }, + "relatedIdentifiers": [], + "sizes": [], + "formats": [], + "version": null, + "rightsList": [], + "descriptions": [ + { + "description": " ", + "descriptionType": "Abstract" + } + ], + "geoLocations": [], + "fundingReferences": [], + "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", + "created": "2019-08-19T14:43:08.000Z", + "registered": "2019-08-19T14:43:09.000Z", + "published": "2019", + "updated": "2019-11-09T12:32:02.000Z", + "contributors": [ + { + "name": "Paul Katz", + "givenName": "", + "familyName": "", + "affiliation": 
[], + "role": "illustrator" + } + ] + }, + "relationships": { + "client": { + "data": { + "id": "caltech.micropub", + "type": "clients" + } + } + } +} diff --git a/python/tests/files/datacite/datacite_result_36.json b/python/tests/files/datacite/datacite_result_36.json new file mode 100644 index 00000000..8c958848 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_36.json @@ -0,0 +1,25 @@ +{ + "abstracts": [], + "contribs": [ + { + "given_name": "", + "index": 0, + "raw_name": "Paul Katz", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.17912/micropub.biology.000143" + }, + "extra": { + "datacite": { + "resourceTypeGeneral": "DataPaper" + }, + "container_name": "microPublication Biology" + }, + "refs": [], + "release_stage": "published", + "release_year": 2019, + "publisher": "microPublication Biology", + "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 8b6797ef..edbb6617 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -400,7 +400,7 @@ def test_datacite_conversions(datacite_importer): for now. """ datacite_importer.debug = True - for i in range(36): + for i in range(37): src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i) dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i) with open(src, "r") as f: |