aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2020-01-09 15:36:09 +0100
committerMartin Czygan <martin.czygan@gmail.com>2020-01-09 15:36:09 +0100
commit6a7591103c7b7d985ad22199138af9378de697f4 (patch)
tree8834991b7d70676972f7bc4f3e347d3051a2e82d
parentffd2597d5e962e3f3a2ea23c66a135bb737b2390 (diff)
downloadfatcat-6a7591103c7b7d985ad22199138af9378de697f4.tar.gz
fatcat-6a7591103c7b7d985ad22199138af9378de697f4.zip
datacite: abstracts may be strings or list of strings
-rw-r--r--python/fatcat_tools/importers/datacite.py17
-rw-r--r--python/tests/files/datacite/datacite_doc_27.json60
-rw-r--r--python/tests/files/datacite/datacite_doc_28.json60
-rw-r--r--python/tests/files/datacite/datacite_result_27.json33
-rw-r--r--python/tests/files/datacite/datacite_result_28.json33
-rw-r--r--python/tests/import_datacite.py2
6 files changed, 202 insertions, 3 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 2f1e17d1..4128b3ca 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -468,11 +468,24 @@ class DataciteImporter(EntityImporter):
for desc in descs:
if not desc.get('descriptionType') == 'Abstract':
continue
- if len(desc.get('description', '') or '') < 10:
- continue
+
+ # Description may be a string or a list of strings.
text = desc.get('description', '')
+ if not text:
+ continue
+ if isinstance(text, list):
+ try:
+ text = "\n".join(text)
+ except TypeError as err:
+ continue # Bail out, if it is not a list of strings.
+
+ # Limit length.
+ if len(text) < 10:
+ continue
if len(text) > MAX_ABSTRACT_LENGTH:
text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
+
+ # Detect language.
lang = None
try:
lang = langdetect.detect(text)
diff --git a/python/tests/files/datacite/datacite_doc_27.json b/python/tests/files/datacite/datacite_doc_27.json
new file mode 100644
index 00000000..ff9c00f4
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_27.json
@@ -0,0 +1,60 @@
+{
+ "attributes": {
+ "contributors": [
+ {
+ "affiliation": [],
+ "contributorType": "Editor",
+ "familyName": "Wemmer",
+ "givenName": "David",
+ "name": "Wemmer, David",
+ "nameType": "Personal"
+ }
+ ],
+ "creators": [
+ {
+ "affiliation": [
+ "Department of pataphysics"
+ ],
+ "name": "Anton Welch",
+ "nameIdentifiers": []
+ }
+ ],
+ "dates": [
+ {
+ "date": "2017-08-24",
+ "dateType": "Created"
+ },
+ {
+ "date": "2019-08-04",
+ "dateType": "Updated"
+ },
+ {
+ "date": "2017",
+ "dateType": "Issued"
+ }
+ ],
+ "descriptions": [
+ {"description": "Hello World", "descriptionType": "Abstract"}
+ ],
+ "doi": "10.7916/d86x0cg1",
+ "isActive": true,
+ "language": "DE-CH",
+ "publicationYear": 2016,
+ "state": "findable",
+ "titles": [
+ {
+ "title": "Additional file 123: ABC"
+ },
+ {
+ "title": "DEF",
+ "titleType": "Subtitle"
+ }
+ ],
+ "types": {
+ "bibtex": "misc",
+ "citeproc": "article",
+ "ris": "GEN",
+ "schemaOrg": "CreativeWork"
+ }
+ }
+}
diff --git a/python/tests/files/datacite/datacite_doc_28.json b/python/tests/files/datacite/datacite_doc_28.json
new file mode 100644
index 00000000..0d03ecc8
--- /dev/null
+++ b/python/tests/files/datacite/datacite_doc_28.json
@@ -0,0 +1,60 @@
+{
+ "attributes": {
+ "contributors": [
+ {
+ "affiliation": [],
+ "contributorType": "Editor",
+ "familyName": "Wemmer",
+ "givenName": "David",
+ "name": "Wemmer, David",
+ "nameType": "Personal"
+ }
+ ],
+ "creators": [
+ {
+ "affiliation": [
+ "Department of pataphysics"
+ ],
+ "name": "Anton Welch",
+ "nameIdentifiers": []
+ }
+ ],
+ "dates": [
+ {
+ "date": "2017-08-24",
+ "dateType": "Created"
+ },
+ {
+ "date": "2019-08-04",
+ "dateType": "Updated"
+ },
+ {
+ "date": "2017",
+ "dateType": "Issued"
+ }
+ ],
+ "descriptions": [
+ {"description": ["Hello", "World"], "descriptionType": "Abstract"}
+ ],
+ "doi": "10.7916/d86x0cg1",
+ "isActive": true,
+ "language": "DE-CH",
+ "publicationYear": 2016,
+ "state": "findable",
+ "titles": [
+ {
+ "title": "Additional file 123: ABC"
+ },
+ {
+ "title": "DEF",
+ "titleType": "Subtitle"
+ }
+ ],
+ "types": {
+ "bibtex": "misc",
+ "citeproc": "article",
+ "ris": "GEN",
+ "schemaOrg": "CreativeWork"
+ }
+ }
+}
diff --git a/python/tests/files/datacite/datacite_result_27.json b/python/tests/files/datacite/datacite_result_27.json
new file mode 100644
index 00000000..3d033e6a
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_27.json
@@ -0,0 +1,33 @@
+{
+ "abstracts": [{"content": "Hello World", "lang": "en", "mimetype": "text/plain"}],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_affiliation": "Department of pataphysics",
+ "raw_name": "Anton Welch",
+ "role": "author"
+ },
+ {
+ "extra": {
+ "type": "Editor"
+ },
+ "given_name": "David",
+ "raw_name": "David Wemmer",
+ "surname": "Wemmer"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.7916/d86x0cg1"
+ },
+ "extra": {
+ "datacite": {},
+ "release_month": 8
+ },
+ "refs": [],
+ "release_date": "2017-08-24",
+ "release_stage": "published",
+ "release_type": "stub",
+ "release_year": 2017,
+ "subtitle": "DEF",
+ "title": "Additional file 123: ABC"
+}
diff --git a/python/tests/files/datacite/datacite_result_28.json b/python/tests/files/datacite/datacite_result_28.json
new file mode 100644
index 00000000..84bed9c8
--- /dev/null
+++ b/python/tests/files/datacite/datacite_result_28.json
@@ -0,0 +1,33 @@
+{
+ "abstracts": [{"content": "Hello\nWorld", "lang": "en", "mimetype": "text/plain"}],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_affiliation": "Department of pataphysics",
+ "raw_name": "Anton Welch",
+ "role": "author"
+ },
+ {
+ "extra": {
+ "type": "Editor"
+ },
+ "given_name": "David",
+ "raw_name": "David Wemmer",
+ "surname": "Wemmer"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.7916/d86x0cg1"
+ },
+ "extra": {
+ "datacite": {},
+ "release_month": 8
+ },
+ "refs": [],
+ "release_date": "2017-08-24",
+ "release_stage": "published",
+ "release_type": "stub",
+ "release_year": 2017,
+ "subtitle": "DEF",
+ "title": "Additional file 123: ABC"
+}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 89671cd2..a7d514ea 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -287,7 +287,7 @@ def test_datacite_conversions(datacite_importer):
for now.
"""
datacite_importer.debug = True
- for i in range(27):
+ for i in range(29):
src = 'tests/files/datacite/datacite_doc_{0:02d}.json'.format(i)
dst = 'tests/files/datacite/datacite_result_{0:02d}.json'.format(i)
print('testing mapping from {} => {}'.format(src, dst))