aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2019-12-16 19:32:54 +0100
committerMartin Czygan <martin.czygan@gmail.com>2019-12-28 23:07:31 +0100
commit68a051abc45103f21284163d13c8893c31b4e8e4 (patch)
tree0868460593bc3b86cac8e146d841513859ccf96d /python
parent4a82a0763bf927248f22e47ab5187af4beff83ee (diff)
downloadfatcat-68a051abc45103f21284163d13c8893c31b4e8e4.tar.gz
fatcat-68a051abc45103f21284163d13c8893c31b4e8e4.zip
datacite: basic field mappings
Currently using two external libraries: dateparser and langcodes. Note: This commit includes lots of WIP docs and field stats in comments, which should be removed.
Diffstat (limited to 'python')
-rw-r--r--python/fatcat_tools/importers/datacite.py222
1 files changed, 181 insertions, 41 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index faa8e2be..e486ba90 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -6,6 +6,7 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
from .common import EntityImporter
import dateparser
+import langcodes
import datetime
import fatcat_openapi_client
import json
@@ -18,36 +19,132 @@ CONTAINER_TYPE_MAP = {
'Book Series': 'book-series',
}
+# The docs/guide should be the canonical home for these mappings; update there
+# first.
+#
+# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
+#
+# count | release_type
+# ----------+-------------------
+# 95030004 | article-journal
+# 13477878 | chapter
+# 5926811 | paper-conference
+# 2169642 | article
+# 1806415 | dataset
+# 1548614 | book
+# 1390304 |
+# 818351 | report
+# 815684 | entry
+# 307998 | standard
+# 297769 | thesis
+# 261426 | letter
+# 148093 | post
+# 122736 | editorial
+# 99225 | stub
+# 96219 | review-book
+# 22854 | peer_review
+# 19078 | interview
+# 16278 | article-newspaper
+# 3973 | speech
+# 3536 | legal_case
+# 2264 | abstract
+# 1626 | legislation
+# 1053 | retraction
+# 85 | component
+# (25 rows)
+#
+# Map datacite types, keyed by type scheme (ris, schemaOrg, ...), to CSL-ish types. None means TODO or remove.
+DATACITE_TYPE_MAP = {
+ 'ris': {
+ 'THES': 'thesis',
+ 'SOUND': None,
+ 'CHAP': 'chapter',
+ 'FIGURE': None,
+ 'RPRT': 'report',
+ 'JOUR': 'article-journal',
+ 'MPCT': None,
+ 'GEN': None,
+ 'BOOK': 'book',
+ 'DATA': 'dataset',
+ 'COMP': None,
+ },
+ 'schemaOrg': {
+ 'Dataset': 'dataset',
+ 'Book': 'book',
+ 'ScholarlyArticle': 'article',
+ 'ImageObject': 'graphic',
+ 'Collection': None,
+ 'MediaObject': None,
+ 'Event': None,
+ 'SoftwareSourceCode': None,
+ 'Chapter': 'chapter',
+ 'CreativeWork': None,
+ 'PublicationIssue': 'article',
+ 'AudioObject': None,
+ 'Thesis': 'thesis',
+ },
+ 'citeproc': {
+ 'dataset': 'dataset',
+ 'chapter': 'chapter',
+ 'article-journal': 'article-journal',
+ 'song': 'song',
+ 'article': 'article',
+ 'report': 'report',
+ 'graphic': 'graphic',
+ 'thesis': 'thesis',
+ 'book': 'book',
+ },
+ 'bibtex': {
+ 'phdthesis': 'thesis',
+ 'inbook': 'chapter',
+ 'misc': None,
+ 'article': 'article-journal',
+ 'book': 'book',
+ },
+ 'resourceTypeGeneral': {
+ 'Image': None,
+ 'Dataset': 'dataset',
+ 'PhysicalObject': None,
+ 'Collection': None,
+ 'Text': None,
+ 'Sound': None,
+ 'InteractiveResource': None,
+ 'Event': None,
+ 'Software': None,
+ 'Other': None,
+ 'Workflow': None,
+ 'Audiovisual': None,
+ }
+}
+
+
# TODO(martin): merge this with other maps, maybe.
LICENSE_SLUG_MAP = {
- "//creativecommons.org/licenses/by/2.0": "CC-BY",
+ "//creativecommons.org/licenses/by/2.0/": "CC-BY",
"//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
- "//creativecommons.org/licenses/by/3.0": "CC-BY",
+ "//creativecommons.org/licenses/by/3.0/": "CC-BY",
"//creativecommons.org/licenses/by/3.0/us": "CC-BY",
- "//creativecommons.org/licenses/by/4.0": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/deed.de": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/deed.en_US": "CC-BY",
- "//creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
- "//creativecommons.org/licenses/by-nc/2.0": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+ "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+ "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
+ "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
+ "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
+ "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+ "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
"//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
- "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+ "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
"//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
- "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-ND",
"//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-ND",
- "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
- "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-SA",
- "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+ "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+ "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
"//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
"//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
- "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+ "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
"//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
- "//creativecommons.org/licenses/CC-BY/4.0": "CC-BY",
- "//creativecommons.org/licenses/publicdomain/zero/1.0": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
- "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
+ "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
+ "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
+ "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
"//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
"//opensource.org/licenses/MIT": "MIT",
"//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
@@ -75,6 +172,7 @@ LICENSE_SLUG_MAP = {
# "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights
# "info:eu-repo/semantics/embargoedAccess": "",
# "info:eu-repo/semantics/openAccess": "",
+ # Note: Some URLs pointing to licensing terms are not in WB yet (but it would be nice to have them there).
}
class DataciteImporter(EntityImporter):
@@ -302,12 +400,12 @@ class DataciteImporter(EntityImporter):
# "identifierType": "ISSN"
# },
#
- # "attributes.container.type": [
- # "DataRepository",
- # "Journal",
- # "Series",
- # "Book Series"
- # ],
+ # "attributes.container.type": [
+ # "DataRepository",
+ # "Journal",
+ # "Series",
+ # "Book Series"
+ # ],
#
# "attributes.container.identifierType": [
# "Handle",
@@ -318,6 +416,7 @@ class DataciteImporter(EntityImporter):
# "URL",
# "ISSN"
# ],
+ #
container_id = None
container = attributes.get('container', {}) or {}
@@ -328,17 +427,18 @@ class DataciteImporter(EntityImporter):
if len(issn) == 8:
issn = issn[:4] + "-" + issn[4:]
issnl = self.issn2issnl(issn)
- container_id = self.lookup_issnl(issnl)
-
- if container_id is None and container.get('title'):
- ce = fatcat_openapi_client.ContainerEntity(
- issnl=issnl,
- container_type=container_type,
- name=container.get('title'),
- )
- ce_edit = self.create_container(ce)
- container_id = ce_edit.ident
- self._issnl_id_map[issnl] = container_id
+ if issnl is not None:
+ container_id = self.lookup_issnl(issnl)
+
+ if container_id is None and container.get('title'):
+ ce = fatcat_openapi_client.ContainerEntity(
+ issnl=issnl,
+ container_type=container_type,
+ name=container.get('title'),
+ )
+ ce_edit = self.create_container(ce)
+ container_id = ce_edit.ident
+ self._issnl_id_map[issnl] = container_id
# > License
#
@@ -376,11 +476,51 @@ class DataciteImporter(EntityImporter):
# "Workflow",
# "Audiovisual"
# ],
+ # "attributes.types.citeproc": [
+ # "dataset",
+ # "chapter",
+ # "article-journal",
+ # "song",
+ # "article",
+ # "report",
+ # "graphic",
+ # "thesis",
+ # "book"
+ # ],
+ #
+ # There is RIS, also.
+
+ # attributes.types.resourceType contains too many things for now.
+ for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
+ release_type = attributes.get('types', {}).get(typeType)
+ if release_type is not None:
+ break
+
+ # TODO(martin): Skip unmapped release_type entirely?
+ if release_type is None:
+ print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
+
+ # > Language.
+ # attributes.language
+
+ language = None
+ value = attributes.get('language', '') or '' # As it is written.
+ try:
+ language = langcodes.find(value).language
+ except LookupError:
+ try:
+ language = langcodes.get(value).language
+ except langcodes.tag_parser.LanguageTagError:
+ pass
- # > Extra information.
+ # > Extra information: license, subjects, ...
extra, extra_datacite = dict(), dict()
if license_extra:
- extra_datacite['license'] = license_extra
+ extra_datacite = {
+ 'license': license_extra,
+ }
+ if attributes.get('subjects'):
+ extra_datacite['subjects'] = attributes.get('subjects', [])
if extra_datacite:
extra['datacite'] = extra_datacite
@@ -389,7 +529,7 @@ class DataciteImporter(EntityImporter):
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
container_id=container_id,
- release_type=None,
+ release_type=release_type,
release_stage=None,
title=title, # attributes.titles, various titleType
subtitle=subtitle,
@@ -405,7 +545,7 @@ class DataciteImporter(EntityImporter):
volume=None,
issue=None,
pages=None,
- language=None,
+ language=language,
abstracts=None,
refs=None,
extra=extra,