From 76d6d4d2de6580ae147e40c43c18f04cc48b62ec Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Tue, 17 Dec 2019 17:38:45 +0100
Subject: datacite: add missing mappings and notes

---
 python/fatcat_tools/importers/datacite.py | 441 ++++++++++++------------------
 1 file changed, 175 insertions(+), 266 deletions(-)

diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index e486ba90..4e117dde 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,5 +1,5 @@
 """
-WIP: Importer for datacite.org data.
+Prototype importer for datacite.org data.
 
 Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 """
@@ -8,9 +8,11 @@ from .common import EntityImporter
 import dateparser
 import langcodes
 import datetime
+import langdetect
 import fatcat_openapi_client
 import json
 import sys
+import hashlib
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -20,40 +22,8 @@ CONTAINER_TYPE_MAP = {
 }
 
 # The docs/guide should be the canonical home for these mappings; update there
-# first.
-#
-# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
-#
-#   count   | release_type
-# ----------+-------------------
-#  95030004 | article-journal
-#  13477878 | chapter
-#   5926811 | paper-conference
-#   2169642 | article
-#   1806415 | dataset
-#   1548614 | book
-#   1390304 |
-#    818351 | report
-#    815684 | entry
-#    307998 | standard
-#    297769 | thesis
-#    261426 | letter
-#    148093 | post
-#    122736 | editorial
-#     99225 | stub
-#     96219 | review-book
-#     22854 | peer_review
-#     19078 | interview
-#     16278 | article-newspaper
-#      3973 | speech
-#      3536 | legal_case
-#      2264 | abstract
-#      1626 | legislation
-#      1053 | retraction
-#        85 | component
-# (25 rows)
-#
-# Map various datacite type types to CSL-ish types. None means TODO or remove.
+# first. Map various datacite types to CSL-ish types. None means TODO or
+# remove.
 DATACITE_TYPE_MAP = {
     'ris': {
         'THES': 'thesis',
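A note on using the mapping above: DATACITE_TYPE_MAP is keyed by type system
first and by value second, so a lookup needs two levels of get(). A minimal,
runnable sketch of that lookup, mirroring the loop in parse_record further
below (the helper name and the abbreviated map entries are illustrative only):

    DATACITE_TYPE_MAP = {
        'ris': {'THES': 'thesis'},
        'citeproc': {'dataset': 'dataset'},
    }

    def guess_release_type(types):
        # Try type systems in order of preference; the first mapped value wins.
        for type_type in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
                          'bibtex', 'ris'):
            value = types.get(type_type)
            release_type = DATACITE_TYPE_MAP.get(type_type, {}).get(value)
            if release_type is not None:
                return release_type
        return None

    assert guess_release_type({'ris': 'THES'}) == 'thesis'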
""" - if 'attributes' not in obj: return None attributes = obj['attributes'] - # > Contributors - # - # "attributes.creators[].contributorType": [ - # "author" - # ], - # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ - # "LCNA", - # "GND", - # "email", - # "NAF", - # "OSF", - # "RRID", - # "ORCID", - # "SCOPUS", - # "NRCPID", - # "schema.org", - # "GRID", - # "MGDS", - # "VIAF", - # "JACoW-ID" - # ], - # - # "https://orcid.org/0000-0002-9902-738X", - # "http://jacow.org/JACoW-00001280", - # "Wiebe_Peter", - # "https://osf.io/https://osf.io/kjfuy/", - # "http://www.viaf.org176549220", - # "2239", - # "Jeffries_Martin", - # "https://orcid.org/0000-0002-1493-6630", - # "0000-0002-6233-612X", - # - # "creators": [ - # { - # "name": "Bögli, Hans", - # "nameType": "Personal", - # "givenName": "Hans", - # "familyName": "Bögli", - # "affiliation": [] - # } - # ], - + # Contributors. Many nameIdentifierSchemes, we do not use yet: + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ + # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS", + # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ], contribs = [] for i, c in enumerate(attributes['creators']): @@ -304,15 +200,8 @@ class DataciteImporter(EntityImporter): surname=c.get('familyName'), )) - # > Title - # - # "attributes.titles[].titleType": [ - # "AlternativeTitle", - # "Other", - # "Subtitle", - # null, - # "TranslatedTitle" - # ], + # Title, may come with "attributes.titles[].titleType", like + # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" title, subtitle = None, None for entry in attributes.get('titles', []): @@ -321,22 +210,13 @@ class DataciteImporter(EntityImporter): if entry.get('titleType') == 'Subtitle': subtitle = entry.get('title').strip() - # > Dates - # - # "attributes.dates[].dateType": [ - # "Accepted", - # "Available" - # "Collected", - # "Copyrighted", - # "Created", - # "Issued", - # "Submitted", - # "Updated", - # "Valid", - # ], - # - # Different documents have different dates defined. Choose the topmost - # available from prio list. + # Dates. A few internal dates (registered, created, updated) and + # published (0..2554). We try to work with typed date list, in + # "attributes.dates[].dateType", values: "Accepted", "Available" + # "Collected", "Copyrighted", "Created", "Issued", "Submitted", + # "Updated", "Valid". + release_year, release_date = None, None + date_type_prio = ( 'Valid', 'Issued', @@ -348,14 +228,16 @@ class DataciteImporter(EntityImporter): 'Created', 'Updated', ) - - release_year, release_date = None, None for prio in date_type_prio: dates = attributes.get('dates', []) or [] # Never be None. for item in dates: if not item.get('dateType') == prio: continue - result = dateparser.parse(item.get('date')) + try: + result = dateparser.parse(item.get('date')) + except TypeError as err: + print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr) + continue if result is None: # Unparsable date. continue @@ -369,56 +251,23 @@ class DataciteImporter(EntityImporter): continue break - # > Publisher - # - # A few NA values. A few bogus values. - # + # Publisher. A few NA values. A few bogus values. publisher = attributes.get('publisher') - if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'): + if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'): publisher = None if publisher is not None and len(publisher) > 80: - # Arbitrary magic value, TODO(martin): better heuristic. 
- # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, - # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt, - # Nutzungsrechte müssen durch den Nutzer abgeklärt werden", - # TODO(martin): log misses. + # Arbitrary magic value max length. TODO(martin): better heuristic, + # but factored out; first we have to log misses. Example: + # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, + # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / + # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt + # werden" publisher = None - # > Container - # - # For the moment, only ISSN as container. - # - # "container": { - # "type": "Journal", - # "issue": "8", - # "title": "Angewandte Chemie International Edition", - # "volume": "57", - # "lastPage": "2080", - # "firstPage": "2077", - # "identifier": "14337851", - # "identifierType": "ISSN" - # }, - # - # "attributes.container.type": [ - # "DataRepository", - # "Journal", - # "Series", - # "Book Series" - # ], - # - # "attributes.container.identifierType": [ - # "Handle", - # "ISBN", - # "LISSN", - # "DOI", - # "EISSN", - # "URL", - # "ISSN" - # ], - # - + # Container. For the moment, only ISSN as container. container_id = None + container = attributes.get('container', {}) or {} if container.get('type') in CONTAINER_TYPE_MAP.keys(): container_type = CONTAINER_TYPE_MAP.get(container['type']) @@ -440,142 +289,202 @@ class DataciteImporter(EntityImporter): container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id - # > License - # - # attributes.rightsList[].rightsUri - # attributes.rightsList[].rights - # attributes.rightsList[].lang - # + # Volume and issue. + volume = container.get('volume') + issue = container.get('issue') + + # Pages. + pages = None + + first_page = container.get('firstPage') + last_page = container.get('lastPage') + + if first_page and last_page: + try: + int(first_page) < int(last_page) + pages = '{}-{}'.format(first_page, last_page) + except ValueError as err: + print(err, file=sys.stderr) + pass + + if not pages and first_page: + pages = first_page + # License. license_slug = None license_extra = [] + for l in attributes.get('rightsList', []): slug = lookup_license_slug(l.get('rightsUri')) if slug: license_slug = slug license_extra.append(l) - # > Release type. - # - # Datacite has some fine granular typing (e.g. "Supplementary - # Collection of Datasets", "Taxonomic treatment", "blog_entry", ... - # - # Additional, coarse: resourceTypeGeneral - # - # "attributes.types.resourceTypeGeneral": [ - # "Image", - # "Dataset", - # "PhysicalObject", - # "Collection", - # "Text", - # "Sound", - # "InteractiveResource", - # "Event", - # "Software", - # "Other", - # "Workflow", - # "Audiovisual" - # ], - # "attributes.types.citeproc": [ - # "dataset", - # "chapter", - # "article-journal", - # "song", - # "article", - # "report", - # "graphic", - # "thesis", - # "book" - # ], - # - # There is RIS, also. - - # attributes.types.resourceType contains too many things for now. + # Release type. Try to determine the release type from a variety of + # types supplied in datacite. The "attributes.types.resourceType" + # contains too many (176 in sample) things for now; citeproc may be the + # closest, but not always supplied. 
@@ -585,7 +494,7 @@ def lookup_license_slug(raw):
     """
-    TODO(martin): reuse from crossref, maybe.
+    TODO(martin): reuse from or combine with crossref, maybe.
     """
     if not raw:
         return None
 
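On try_update above: the decision boils down to "insert on 404, skip on hit".
A compact restatement of just that check (the helper name is illustrative; api
stands for any configured fatcat_openapi_client API instance):

    import fatcat_openapi_client

    def needs_insert(api, doi):
        # True if no release with this DOI exists yet; re-raise anything that
        # is not a plain 404.
        try:
            api.lookup_release(doi=doi)
        except fatcat_openapi_client.rest.ApiException as err:
            if err.status != 404:
                raise
            return True
        return False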
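On extended_json_encoder above: it exists because ReleaseEntity dicts may
carry datetime.date values, which the stdlib json module refuses to serialize.
A standalone demonstration (the trailing raise is an assumption added for
strictness; the patch only shows the isinstance branch):

    import datetime
    import json

    def extended_json_encoder(value):
        if isinstance(value, (datetime.datetime, datetime.date)):
            return value.isoformat()
        raise TypeError('value not serializable: {!r}'.format(value))

    print(json.dumps({'release_date': datetime.date(2011, 11, 18)},
                     default=extended_json_encoder))
    # {"release_date": "2011-11-18"}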