Diffstat (limited to 'python/fatcat_tools/importers/datacite.py')
-rw-r--r--  python/fatcat_tools/importers/datacite.py  441
1 file changed, 175 insertions(+), 266 deletions(-)
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index e486ba90..4e117dde 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,5 +1,5 @@
"""
-WIP: Importer for datacite.org data.
+Prototype importer for datacite.org data.
Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
"""
@@ -8,9 +8,11 @@ from .common import EntityImporter
import dateparser
import langcodes
import datetime
+import langdetect
import fatcat_openapi_client
import json
import sys
+import hashlib
# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
CONTAINER_TYPE_MAP = {
@@ -20,40 +22,8 @@ CONTAINER_TYPE_MAP = {
}
# The docs/guide should be the canonical home for these mappings; update there
-# first.
-#
-# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
-#
-# count | release_type
-# ----------+-------------------
-# 95030004 | article-journal
-# 13477878 | chapter
-# 5926811 | paper-conference
-# 2169642 | article
-# 1806415 | dataset
-# 1548614 | book
-# 1390304 |
-# 818351 | report
-# 815684 | entry
-# 307998 | standard
-# 297769 | thesis
-# 261426 | letter
-# 148093 | post
-# 122736 | editorial
-# 99225 | stub
-# 96219 | review-book
-# 22854 | peer_review
-# 19078 | interview
-# 16278 | article-newspaper
-# 3973 | speech
-# 3536 | legal_case
-# 2264 | abstract
-# 1626 | legislation
-# 1053 | retraction
-# 85 | component
-# (25 rows)
-#
-# Map various datacite type types to CSL-ish types. None means TODO or remove.
+# first. Map various datacite type fields to CSL-ish types. None means TODO
+# or remove.
DATACITE_TYPE_MAP = {
'ris': {
'THES': 'thesis',
@@ -197,91 +167,17 @@ class DataciteImporter(EntityImporter):
def parse_record(self, obj):
"""
- TODO(martin): Map datacite to RE.
-
- WIP, notes:
-
- * Many subjects, should they end up in extra?
- * attributes.creators and attributes.contributors
-
- $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr
- 3963663 "Personal"
- 289795 null
- 8892 "Organizational"
-
- Shall we use issued, available?
-
- {
- "date": "2011-11-18",
- "dateType": "Accepted"
- },
- {
- "date": "2011-11-18",
- "dateType": "Available"
- },
- {
- "date": "2011-11-07",
- "dateType": "Copyrighted"
- },
- {
- "date": "2011-11-18",
- "dateType": "Issued"
- },
- {
- "date": "2011-11-07",
- "dateType": "Issued"
- }
-
- TODO(martin): Quick analysis of dates and stages.
+ Mapping datacite JSON to ReleaseEntity.
"""
-
if 'attributes' not in obj:
return None
attributes = obj['attributes']
- # > Contributors
- #
- # "attributes.creators[].contributorType": [
- # "author"
- # ],
- # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
- # "LCNA",
- # "GND",
- # "email",
- # "NAF",
- # "OSF",
- # "RRID",
- # "ORCID",
- # "SCOPUS",
- # "NRCPID",
- # "schema.org",
- # "GRID",
- # "MGDS",
- # "VIAF",
- # "JACoW-ID"
- # ],
- #
- # "https://orcid.org/0000-0002-9902-738X",
- # "http://jacow.org/JACoW-00001280",
- # "Wiebe_Peter",
- # "https://osf.io/https://osf.io/kjfuy/",
- # "http://www.viaf.org176549220",
- # "2239",
- # "Jeffries_Martin",
- # "https://orcid.org/0000-0002-1493-6630",
- # "0000-0002-6233-612X",
- #
- # "creators": [
- # {
- # "name": "Bögli, Hans",
- # "nameType": "Personal",
- # "givenName": "Hans",
- # "familyName": "Bögli",
- # "affiliation": []
- # }
- # ],
-
+ # Contributors. Many nameIdentifierSchemes exist, which we do not use yet:
+ # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
+ # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
+ # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
contribs = []
for i, c in enumerate(attributes['creators']):
@@ -304,15 +200,8 @@ class DataciteImporter(EntityImporter):
surname=c.get('familyName'),
))
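To make the creator mapping concrete, here is a minimal standalone sketch (not the importer itself), using the example creator from the notes removed above; it assumes the ReleaseContrib fields visible in this hunk plus a default "author" role:

    import fatcat_openapi_client

    creators = [{
        "name": "Bögli, Hans",
        "nameType": "Personal",
        "givenName": "Hans",
        "familyName": "Bögli",
        "affiliation": [],
    }]

    contribs = [
        fatcat_openapi_client.ReleaseContrib(
            index=i,
            raw_name=c.get('name'),
            given_name=c.get('givenName'),
            surname=c.get('familyName'),
            role='author',  # assumption: datacite creators map to authors
        )
        for i, c in enumerate(creators)
    ]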
- # > Title
- #
- # "attributes.titles[].titleType": [
- # "AlternativeTitle",
- # "Other",
- # "Subtitle",
- # null,
- # "TranslatedTitle"
- # ],
+ # Titles. Entries may come with "attributes.titles[].titleType", like
+ # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle".
title, subtitle = None, None
for entry in attributes.get('titles', []):
@@ -321,22 +210,13 @@ class DataciteImporter(EntityImporter):
if entry.get('titleType') == 'Subtitle':
subtitle = entry.get('title').strip()
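The branch that picks the main title is elided from this hunk; a sketch of the whole split, assuming untyped entries are the main title (sample strings borrowed from the container example removed below):

    titles = [
        {"title": "Angewandte Chemie ", "titleType": None},
        {"title": "International Edition", "titleType": "Subtitle"},
    ]

    title, subtitle = None, None
    for entry in titles:
        if entry.get('titleType') is None and title is None:
            title = entry.get('title', '').strip()
        if entry.get('titleType') == 'Subtitle':
            subtitle = entry.get('title', '').strip()

    assert (title, subtitle) == ("Angewandte Chemie", "International Edition")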
- # > Dates
- #
- # "attributes.dates[].dateType": [
- # "Accepted",
- # "Available"
- # "Collected",
- # "Copyrighted",
- # "Created",
- # "Issued",
- # "Submitted",
- # "Updated",
- # "Valid",
- # ],
- #
- # Different documents have different dates defined. Choose the topmost
- # available from prio list.
+ # Dates. Datacite carries a few internal dates (registered, created,
+ # updated) and a publicationYear (observed values range from 0 to 2554).
+ # We try to work with the typed date list in
+ # "attributes.dates[].dateType"; values: "Accepted", "Available",
+ # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
+ # "Updated", "Valid".
+ release_year, release_date = None, None
+
date_type_prio = (
'Valid',
'Issued',
@@ -348,14 +228,16 @@ class DataciteImporter(EntityImporter):
'Created',
'Updated',
)
-
- release_year, release_date = None, None
for prio in date_type_prio:
dates = attributes.get('dates', []) or [] # Never be None.
for item in dates:
if item.get('dateType') != prio:
continue
- result = dateparser.parse(item.get('date'))
+ try:
+ result = dateparser.parse(item.get('date'))
+ except TypeError as err:
+ print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr)
+ continue
if result is None:
# Unparsable date.
continue
@@ -369,56 +251,23 @@ class DataciteImporter(EntityImporter):
continue
break
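Condensed, the priority scan above behaves like this sketch (sample dates taken from the notes removed in this commit):

    import datetime
    import dateparser

    dates = [
        {"date": "2011-11-18", "dateType": "Accepted"},
        {"date": "2011-11-07", "dateType": "Issued"},
    ]

    release_date = None
    for prio in ('Valid', 'Issued', 'Available', 'Accepted'):
        for item in dates:
            if item.get('dateType') != prio:
                continue
            result = dateparser.parse(item.get('date'))
            if result is not None:
                release_date = result.date()
                break
        if release_date:
            break

    # "Issued" outranks "Accepted" in the priority list.
    assert release_date == datetime.date(2011, 11, 7)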
- # > Publisher
- #
- # A few NA values. A few bogus values.
- #
+ # Publisher. A few NA values. A few bogus values.
publisher = attributes.get('publisher')
- if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'):
+ if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
publisher = None
if publisher is not None and len(publisher) > 80:
- # Arbitrary magic value, TODO(martin): better heuristic.
- # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
- # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt,
- # Nutzungsrechte müssen durch den Nutzer abgeklärt werden",
- # TODO(martin): log misses.
+ # Arbitrary magic max length. TODO(martin): find a better heuristic
+ # and factor it out; first we have to log misses. Example:
+ # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
+ # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
+ # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
+ # werden"
publisher = None
- # > Container
- #
- # For the moment, only ISSN as container.
- #
- # "container": {
- # "type": "Journal",
- # "issue": "8",
- # "title": "Angewandte Chemie International Edition",
- # "volume": "57",
- # "lastPage": "2080",
- # "firstPage": "2077",
- # "identifier": "14337851",
- # "identifierType": "ISSN"
- # },
- #
- # "attributes.container.type": [
- # "DataRepository",
- # "Journal",
- # "Series",
- # "Book Series"
- # ],
- #
- # "attributes.container.identifierType": [
- # "Handle",
- # "ISBN",
- # "LISSN",
- # "DOI",
- # "EISSN",
- # "URL",
- # "ISSN"
- # ],
- #
-
+ # Container. For the moment, only ISSN as container.
container_id = None
+
container = attributes.get('container', {}) or {}
if container.get('type') in CONTAINER_TYPE_MAP.keys():
container_type = CONTAINER_TYPE_MAP.get(container['type'])
@@ -440,142 +289,202 @@ class DataciteImporter(EntityImporter):
container_id = ce_edit.ident
self._issnl_id_map[issnl] = container_id
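The lookup-then-create logic is partially elided in this hunk; the caching pattern it implements looks roughly like this (function and argument names are illustrative, not the importer's API):

    _issnl_id_map = {}

    def container_ident_for_issnl(issnl, lookup, create):
        # lookup/create stand in for the fatcat API calls in this hunk
        if issnl in _issnl_id_map:
            return _issnl_id_map[issnl]
        ident = lookup(issnl) or create(issnl)
        _issnl_id_map[issnl] = ident
        return ident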
- # > License
- #
- # attributes.rightsList[].rightsUri
- # attributes.rightsList[].rights
- # attributes.rightsList[].lang
- #
+ # Volume and issue.
+ volume = container.get('volume')
+ issue = container.get('issue')
+
+ # Pages.
+ pages = None
+
+ first_page = container.get('firstPage')
+ last_page = container.get('lastPage')
+
+ if first_page and last_page:
+ try:
+ # Raises ValueError if the page bounds are not integers.
+ int(first_page), int(last_page)
+ pages = '{}-{}'.format(first_page, last_page)
+ except ValueError as err:
+ print(err, file=sys.stderr)
+
+ if not pages and first_page:
+ pages = first_page
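As a sketch, the page handling reduces to the following (the '2077'/'2080' values come from the container example removed in this commit):

    def format_pages(first_page, last_page):
        if first_page and last_page:
            try:
                int(first_page), int(last_page)  # raises ValueError on non-ints
                return '{}-{}'.format(first_page, last_page)
            except ValueError:
                pass
        return first_page or None

    assert format_pages('2077', '2080') == '2077-2080'
    assert format_pages('S4', None) == 'S4'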
+ # License.
license_slug = None
license_extra = []
+
for l in attributes.get('rightsList', []):
slug = lookup_license_slug(l.get('rightsUri'))
if slug:
license_slug = slug
license_extra.append(l)
- # > Release type.
- #
- # Datacite has some fine granular typing (e.g. "Supplementary
- # Collection of Datasets", "Taxonomic treatment", "blog_entry", ...
- #
- # Additional, coarse: resourceTypeGeneral
- #
- # "attributes.types.resourceTypeGeneral": [
- # "Image",
- # "Dataset",
- # "PhysicalObject",
- # "Collection",
- # "Text",
- # "Sound",
- # "InteractiveResource",
- # "Event",
- # "Software",
- # "Other",
- # "Workflow",
- # "Audiovisual"
- # ],
- # "attributes.types.citeproc": [
- # "dataset",
- # "chapter",
- # "article-journal",
- # "song",
- # "article",
- # "report",
- # "graphic",
- # "thesis",
- # "book"
- # ],
- #
- # There is RIS, also.
-
- # attributes.types.resourceType contains too many things for now.
+ # Release type. Try to determine the release type from the variety of
+ # type fields supplied by datacite. "attributes.types.resourceType"
+ # contains too many distinct values (176 in a sample) to map for now;
+ # citeproc may be the closest fit, but is not always supplied.
for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
- release_type = attributes.get('types', {}).get(typeType)
+ value = attributes.get('types', {}).get(typeType)
+ release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
if release_type is not None:
break
- # TODO(martin): Skip unmapped release_type entirely?
if release_type is None:
print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
- # > Language.
- # attributes.language
-
+ # Language values are varied ("ger", "es", "English", "ENG", "en-us",
+ # "other", ...). Try to crush it with langcodes: "It may sound to you
+ # like langcodes solves a pretty boring problem. At one level, that's
+ # right. Sometimes you have a boring problem, and it's great when a
+ # library solves it for you." -- TODO(martin): We need more of these.
language = None
- value = attributes.get('language', '') or '' # As it is written.
+
+ value = attributes.get('language', '') or ''
try:
language = langcodes.find(value).language
except LookupError:
try:
language = langcodes.get(value).language
except langcodes.tag_parser.LanguageTagError:
+ print('could not determine language: {}'.format(value), file=sys.stderr)
+
+ # Abstracts appear in "attributes.descriptions[].descriptionType", some
+ # of the observed values: "Methods", "TechnicalInfo",
+ # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
+ # "Other" fields might contain references or related articles (with
+ # DOI). TODO(martin): maybe try to parse out some of those refs.
+ abstracts = []
+
+ for desc in attributes.get('descriptions', []):
+ if desc.get('descriptionType') != 'Abstract':
+ continue
+ if len(desc.get('description', '')) < 10:
+ continue
+ text = desc.get('description')
+ sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
+ lang = None
+ try:
+ lang = langdetect.detect(text)
+ except langdetect.lang_detect_exception.LangDetectException:
pass
+ abstracts.append(fatcat_openapi_client.ReleaseAbstract(
+ mimetype="text/plain",
+ content=text,
+ sha1=sha1,
+ lang=lang,
+ ))
+
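A sketch of the two language paths: langcodes normalizes declared values, while langdetect guesses from abstract text (detection output can vary, so it is only hinted at, not asserted):

    import langcodes
    import langdetect

    assert langcodes.find('German').language == 'de'
    assert langcodes.get('en-us').language == 'en'

    try:
        lang = langdetect.detect('Dieses Dokument beschreibt die Datenstruktur.')
    except langdetect.lang_detect_exception.LangDetectException:
        lang = None
    # lang is typically 'de' here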
+ # References and relations. Datacite includes many relation types in
+ # "attributes.relatedIdentifiers[].relationType", e.g.
+ # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
+ # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
+ # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
+ # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
+ # "IsDerivedFrom", "IsSourceOf".
+ #
+ # For the moment, we only care about References.
+ refs, ref_index = [], 0
+
+ for rel in attributes.get('relatedIdentifiers', []):
+ if rel.get('relationType') != 'References':
+ continue
+ ref_extra = dict()
+ if rel.get('relatedIdentifierType') == 'DOI':
+ ref_extra['doi'] = rel.get('relatedIdentifier')
+ if not ref_extra:
+ ref_extra = None
+ refs.append(fatcat_openapi_client.ReleaseRef(
+ index=ref_index,
+ extra=ref_extra,
+ ))
+ ref_index += 1
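For example, only the first of these two relations would yield a reference (the DOI is a generic sample value):

    rels = [
        {"relationType": "References", "relatedIdentifierType": "DOI",
         "relatedIdentifier": "10.1000/182"},
        {"relationType": "IsPartOf", "relatedIdentifierType": "ISSN",
         "relatedIdentifier": "14337851"},
    ]

    refs = [r for r in rels if r.get('relationType') == 'References']
    assert len(refs) == 1 and refs[0]['relatedIdentifier'] == '10.1000/182'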
+
+ # Start with clear stages, e.g. published. TODO(martin): we could
+ # probably infer a bit more from the relations, e.g.
+ # "IsPreviousVersionOf" or "IsNewVersionOf".
+ release_stage = None
+ if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+ release_stage = 'published'
+
+ # Extra information.
+ extra_datacite = dict()
- # > Extra information: license, subjects, ...
- extra, extra_datacite = dict(), dict()
if license_extra:
- extra_datacite = {
- 'license': license_extra,
- }
+ extra_datacite['license'] = license_extra
if attributes.get('subjects'):
- extra_datacite['subjects'] = attributes.get('subjects', [])
+ extra_datacite['subjects'] = attributes['subjects']
+ if attributes.get('url'):
+ extra_datacite['url'] = attributes['url']
+
+ extra = dict()
if extra_datacite:
extra['datacite'] = extra_datacite
- # https://guide.fatcat.wiki/entity_release.html
+ # Assemble release.
re = fatcat_openapi_client.ReleaseEntity(
work_id=None,
container_id=container_id,
release_type=release_type,
- release_stage=None,
- title=title, # attributes.titles, various titleType
+ release_stage=release_stage,
+ title=title,
subtitle=subtitle,
- original_title=title, # AlternativeTitle?
- release_year=release_year, # publicationYear
- release_date=release_date, # date issues/available?
- publisher=publisher, # attributes.publisher
+ original_title=title,
+ release_year=release_year,
+ release_date=release_date,
+ publisher=publisher,
ext_ids=fatcat_openapi_client.ReleaseExtIds(
- doi=attributes.get('doi'), # attributes.doi,
- # Can we add handle.net link?
+ doi=attributes.get('doi'),
),
contribs=contribs,
- volume=None,
- issue=None,
- pages=None,
+ volume=volume,
+ issue=issue,
+ pages=pages,
language=language,
- abstracts=None,
- refs=None,
+ abstracts=abstracts,
+ refs=refs,
extra=extra,
license_slug=license_slug,
)
return re
def try_update(self, re, debug=True):
+ """
+ When debug is true, write the RE to stdout.
+ """
if debug is True:
- # print(type(re))
- print(json.dumps(re.to_dict(), default=extended_encoder))
- return
- return False
+ print(json.dumps(re.to_dict(), default=extended_json_encoder))
+ return False
- def insert_batch(self, batch):
- # Debugging.
- for item in batch:
- print(item)
- return
+ # lookup existing DOI (don't need to try other ext idents for datacite)
+ existing = None
+ try:
+ existing = self.api.lookup_release(doi=re.ext_ids.doi)
+ except fatcat_openapi_client.rest.ApiException as err:
+ if err.status != 404:
+ raise err
+ # doesn't exist, so queue this release for insertion
+ return True
- # Orig.
+ # eventually we'll want to support "updates", but for now just skip if
+ # entity already exists
+ if existing:
+ self.counts['exists'] += 1
+ return False
+
+ return True
+
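For context, a hedged sketch of the driver loop implied by this importer contract (names like push_record and batch_size are illustrative; the real control flow lives in EntityImporter):

    def push_record(importer, raw, batch, batch_size=50):
        # parse, then let try_update decide: True means queue for insertion,
        # False means skip (entity already exists, or debug mode printed it)
        re = importer.parse_record(raw)
        if re is None:
            return
        if importer.try_update(re, debug=False):
            batch.append(re)
        if len(batch) >= batch_size:
            importer.insert_batch(batch)
            batch.clear()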
+ def insert_batch(self, batch):
self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
editgroup=fatcat_openapi_client.Editgroup(
description=self.editgroup_description,
extra=self.editgroup_extra),
entity_list=batch))
-def extended_encoder(value):
+def extended_json_encoder(value):
"""
- Can be used with json.dumps(value, default=extended_encoder) to serialize
+ Can be used with json.dumps(value, default=extended_json_encoder) to serialize
values not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
"""
if isinstance(value, (datetime.datetime, datetime.date)):
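Usage sketch, assuming the elided branch returns value.isoformat():

    import datetime
    import json

    doc = {'release_date': datetime.date(2011, 11, 18)}
    print(json.dumps(doc, default=extended_json_encoder))
    # {"release_date": "2011-11-18"}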
@@ -585,7 +494,7 @@ def extended_encoder(value):
def lookup_license_slug(raw):
"""
- TODO(martin): reuse from crossref, maybe.
+ TODO(martin): reuse from or combine with crossref, maybe.
"""
if not raw:
return None
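The rest of the function is elided from this diff; a hypothetical sketch of such a rightsUri-to-slug normalization (the table and slug names here are illustrative, not the actual fatcat mapping):

    LICENSE_SLUG_MAP = {
        'creativecommons.org/licenses/by/': 'CC-BY',
        'creativecommons.org/publicdomain/zero/': 'CC-0',
    }

    def lookup_license_slug_sketch(raw):
        if not raw:
            return None
        for prefix, slug in LICENSE_SLUG_MAP.items():
            if prefix in raw:
                return slug
        return None

    assert lookup_license_slug_sketch('https://creativecommons.org/licenses/by/4.0/') == 'CC-BY'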