diff options
| author | bnewbold <bnewbold@archive.org> | 2021-10-02 00:55:56 +0000 | 
|---|---|---|
| committer | bnewbold <bnewbold@archive.org> | 2021-10-02 00:55:56 +0000 | 
| commit | 491722e00548888e24fba6ec87d7fefa92e3538b (patch) | |
| tree | 1b8c4ba4c23edb299fef488c346b7d2565bb9834 | |
| parent | 519c7e77cf3a54b9620adef07fedac9b37a5f9f2 (diff) | |
| parent | bdc4347acbbdb9f58b7c3abc2578a488de3d0a85 (diff) | |
| download | fatcat-491722e00548888e24fba6ec87d7fefa92e3538b.tar.gz fatcat-491722e00548888e24fba6ec87d7fefa92e3538b.zip | |
Merge branch 'martin-datacite-emtpy-abstract-sentry-94639' into 'master'
datacite: skip empty abstracts
See merge request webgroup/fatcat!119
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 5 |
| -rw-r--r-- | python/tests/files/datacite/datacite_doc_36.json | 65 |
| -rw-r--r-- | python/tests/files/datacite/datacite_result_36.json | 25 |
| -rw-r--r-- | python/tests/import_datacite.py | 2 | 
4 files changed, 95 insertions, 2 deletions
| diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 703dbc27..eb49596f 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -549,10 +549,13 @@ class DataciteImporter(EntityImporter):                  lang = langdetect.detect(text)              except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:                  print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) +            abstract_text = clean(text) +            if not abstract_text: +                continue              abstracts.append(                  fatcat_openapi_client.ReleaseAbstract(                      mimetype="text/plain", -                    content=clean(text), +                    content=abstract_text,                      lang=lang,                  )) diff --git a/python/tests/files/datacite/datacite_doc_36.json b/python/tests/files/datacite/datacite_doc_36.json new file mode 100644 index 00000000..66aba00c --- /dev/null +++ b/python/tests/files/datacite/datacite_doc_36.json @@ -0,0 +1,65 @@ +{ +  "id": "10.17912/micropub.biology.000143", +  "type": "dois", +  "attributes": { +    "doi": "10.17912/micropub.biology.000143", +    "identifiers": null, +    "creators": [ +      { +        "name": "Paul Katz", +        "givenName": "", +        "familyName": "", +        "affiliation": [], +        "role": "author" +      } +    ], +    "titles": [ +      { +        "lang": "da" +      }, +      { +        "title": "Sample" +      } +    ], +    "publisher": "microPublication Biology", +    "publicationYear": 2019, +    "types": { +      "resourceTypeGeneral": "DataPaper" +    }, +    "relatedIdentifiers": [], +    "sizes": [], +    "formats": [], +    "version": null, +    "rightsList": [], +    "descriptions": [ +      { +        "description": "                 ", +        "descriptionType": "Abstract" +      } +    
], +    "geoLocations": [], +    "fundingReferences": [], +    "url": "https://www.micropublication.org/journals/biology/micropub.biology.000143", +    "created": "2019-08-19T14:43:08.000Z", +    "registered": "2019-08-19T14:43:09.000Z", +    "published": "2019", +    "updated": "2019-11-09T12:32:02.000Z", +    "contributors": [ +      { +        "name": "Paul Katz", +        "givenName": "", +        "familyName": "", +        "affiliation": [], +        "role": "illustrator" +      } +    ] +  }, +  "relationships": { +    "client": { +      "data": { +        "id": "caltech.micropub", +        "type": "clients" +      } +    } +  } +} diff --git a/python/tests/files/datacite/datacite_result_36.json b/python/tests/files/datacite/datacite_result_36.json new file mode 100644 index 00000000..8c958848 --- /dev/null +++ b/python/tests/files/datacite/datacite_result_36.json @@ -0,0 +1,25 @@ +{ +  "abstracts": [], +  "contribs": [ +    { +      "given_name": "", +      "index": 0, +      "raw_name": "Paul Katz", +      "role": "author" +    } +  ], +  "ext_ids": { +    "doi": "10.17912/micropub.biology.000143" +  }, +  "extra": { +    "datacite": { +      "resourceTypeGeneral": "DataPaper" +    }, +    "container_name": "microPublication Biology" +  }, +  "refs": [], +  "release_stage": "published", +  "release_year": 2019, +  "publisher": "microPublication Biology", +  "title": "Sample" +} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 8b6797ef..edbb6617 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -400,7 +400,7 @@ def test_datacite_conversions(datacite_importer):      for now.      """      datacite_importer.debug = True -    for i in range(36): +    for i in range(37):          src = "tests/files/datacite/datacite_doc_{0:02d}.json".format(i)          dst = "tests/files/datacite/datacite_result_{0:02d}.json".format(i)          with open(src, "r") as f: | 
