""" Prototype importer for datacite.org data. Example input document: https://api.datacite.org/dois/10.7916/d8-f93n-rk51 Datacite being an aggregator, the data is heterogeneous and exposes a couple of problems in content and structure. A few fields have their own parsing functions (parse_datacite_...), which may help testing. """ import collections import datetime import re import json import sqlite3 import sys import dateparser import fatcat_openapi_client import langdetect import pycountry from fatcat_tools.normal import clean_doi from fatcat_tools.transforms import entity_to_dict from .common import EntityImporter, clean # Cutoff length for abstracts. MAX_ABSTRACT_LENGTH = 2048 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { 'Journal': 'journal', 'Series': 'journal', 'Book Series': 'book-series', } # The docs/guide should be the canonical home for these mappings; update there # first. Map various datacite type types to CSL-ish types. None means TODO or # remove. DATACITE_TYPE_MAP = { 'ris': { 'THES': 'thesis', 'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report) 'CHAP': 'chapter', 'FIGURE': 'figure', 'RPRT': 'report', 'JOUR': 'article-journal', 'MPCT': 'motion_picture', 'GEN': 'article-journal', # GEN consist of 99% article and report, post-weblog, misc - and one dataset 'BOOK': 'book', 'DATA': 'dataset', 'COMP': 'software', }, 'schemaOrg': { 'Dataset': 'dataset', 'Book': 'book', 'ScholarlyArticle': 'article-journal', 'ImageObject': 'graphic', 'Collection': None, 'MediaObject': None, 'Event': None, 'SoftwareSourceCode': 'software', 'Chapter': 'chapter', 'CreativeWork': None, # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score. 'PublicationIssue': 'article', 'AudioObject': None, 'Thesis': 'thesis', }, 'citeproc': { 'article': 'article', 'article-journal': 'article-journal', 'article-magazine': 'article-magazine', 'article-newspaper': 'article-newspaper', 'bill': 'bill', 'book': 'book', 'broadcast': 'broadcast', 'chapter': 'chapter', 'dataset': 'dataset', 'entry-dictionary': 'entry-dictionary', 'entry-encyclopedia': 'entry-encyclopedia', 'entry': 'entry', 'figure': 'figure', 'graphic': 'graphic', 'interview': 'interview', 'legal_case': 'legal_case', 'legislation': 'legislation', 'manuscript': 'manuscript', 'map': 'map', 'motion_picture': 'motion_picture', 'musical_score': 'musical_score', 'pamphlet': 'pamphlet', 'paper-conference': 'paper-conference', 'patent': 'patent', 'personal_communication': 'personal_communication', 'post': 'post', 'post-weblog': 'post-weblog', 'report': 'report', 'review-book': 'review-book', 'review': 'review', 'song': 'song', 'speech': 'speech', 'thesis': 'thesis', 'treaty': 'treaty', 'webpage': 'webpage', }, # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types 'bibtex': { 'phdthesis': 'thesis', 'inbook': 'chapter', 'misc': None, 'article': 'article-journal', 'book': 'book', }, 'resourceTypeGeneral': { 'Image': 'graphic', 'Dataset': 'dataset', 'PhysicalObject': None, 'Collection': None, 'Text': None, # "Greyliterature, labnotes, accompanyingmaterials" 'Sound': None, 'InteractiveResource': None, 'Event': None, 'Software': 'software', 'Other': None, 'Workflow': None, 'Audiovisual': None, } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32 } # DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43. DATACITE_UNKNOWN_MARKERS = ( '(:unac)', # temporarily inaccessible '(:unal)', # unallowed, suppressed intentionally '(:unap)', # not applicable, makes no sense '(:unas)', # value unassigned (e.g., Untitled) '(:unav)', # value unavailable, possibly unknown '(:unkn)', # known to be unknown (e.g., Anonymous, Inconnue) '(:none)', # never had a value, never will '(:null)', # explicitly and meaningfully empty '(:tba)', # to be assigned or announced later '(:etal)', # too numerous to list (et alia) ) # UNKNOWN_MARKERS joins official datacite markers with a generic tokens marking # unknown values. UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set(( 'NA', 'NN', 'n.a.', '[s.n.]', 'Unknown', ))) # UNKNOWN_MARKERS_LOWER are lowercase version of UNKNOWN blocklist. UNKNOWN_MARKERS_LOWER = set((v.lower() for v in UNKNOWN_MARKERS)) # Any "min" number of "tokens" will signal "spam", https://fatcat.wiki/release/rzcpjwukobd4pj36ipla22cnoi DATACITE_TITLE_SPAM_WORDGROUPS = [ { "tokens": ('full', 'movies', 'movie', 'watch', 'streaming', 'online', 'free', 'hd', 'download', 'english', 'subtitle', 'bluray'), "min": 4, } ] # TODO(martin): merge this with other maps and lookup functions, eventually. LICENSE_SLUG_MAP = { "//archaeologydataservice.ac.uk/advice/termsofuseandaccess.xhtml/": "ADS-UK", "//archaeologydataservice.ac.uk/advice/termsofuseandaccess/": "ADS-UK", "//arxiv.org/licenses/nonexclusive-distrib/1.0/": "ARXIV-1.0", "//doi.wiley.com/10.1002/tdm_license_1.1/": "WILEY-TDM-1.1", "//homepage.data-planet.com/terms-use/": "SAGE-DATA-PLANET", "//onlinelibrary.wiley.com/termsandconditions/": "WILEY", "//publikationen.bibliothek.kit.edu/kitopen-lizenz/": "KIT-OPEN", "//pubs.acs.org/page/policy/authorchoice_ccby_termsofuse.html/": "CC-BY", "//pubs.acs.org/page/policy/authorchoice_termsofuse.html/": "ACS-CHOICE", "//www.ametsoc.org/PUBSReuseLicenses/": "AMETSOC", "//www.apa.org/pubs/journals/resources/open-access.aspx/": "APA", "//www.biologists.com/user-licence-1-1/": "BIOLOGISTS-USER", "//www.elsevier.com/open-access/userlicense/1.0/": "ELSEVIER-USER-1.0", "//www.elsevier.com/tdm/userlicense/1.0/": "ELSEVIER-USER-1.0", "//www.gnu.org/licenses/gpl-3.0.en.html/": "GPLv3", "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html/": "GPLv2", "//www.karger.com/Services/SiteLicenses/": "KARGER", "//www.springer.com/tdm/": "SPRINGER-TDM", "//journals.sagepub.com/page/policies/text-and-data-mining-license/": "SAGE-TDM", "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", "//creativecommons.org/publicdomain/mark/1.0": "CC-0", "//creativecommons.org/publicdomain/mark/1.0": "CC-0", "//creativecommons.org/publicdomain/mark/1.0/": "CC-0", "//creativecommons.org/publicdomain/mark/1.0/deed.de": "CC-0", "//creativecommons.org/share-your-work/public-domain/cc0/": "CC-0", "//spdx.org/licenses/CC0-1.0.json": "CC-0", "//spdx.org/licenses/CC-BY-1.0.json": "CC-BY", "//spdx.org/licenses/CC-BY-4.0.json": "CC-BY", "//spdx.org/licenses/CC-BY-NC-4.0.json": "CC-BY-NC", "//spdx.org/licenses/CC-BY-SA-3.0.json": "CC-BY-SA", "//spdx.org/licenses/CC-BY-SA-4.0.json": "CC-BY-SA", "//spdx.org/licenses/MIT.json": "MIT", "//spdx.org/licenses/OGL-Canada-2.0.json": "OGL-Canada", } class DataciteImporter(EntityImporter): """ Importer for datacite records. """ def __init__(self, api, issn_map_file, debug=False, insert_log_file=None, **kwargs): eg_desc = kwargs.get( 'editgroup_description', "Automated import of Datacite DOI metadata, harvested from REST API" ) eg_extra = kwargs.get('editgroup_extra', dict()) eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter') super().__init__(api, issn_map_file=issn_map_file, editgroup_description=eg_desc, editgroup_extra=eg_extra, **kwargs) self.create_containers = kwargs.get('create_containers', True) extid_map_file = kwargs.get('extid_map_file') self.extid_map_db = None if extid_map_file: db_uri = "file:{}?mode=ro".format(extid_map_file) print("Using external ID map: {}".format(db_uri), file=sys.stderr) self.extid_map_db = sqlite3.connect(db_uri, uri=True) else: print("Not using external ID map", file=sys.stderr) self.read_issn_map_file(issn_map_file) self.debug = debug self.insert_log_file = insert_log_file self.this_year = datetime.datetime.now().year print('datacite with debug={}'.format(self.debug), file=sys.stderr) def lookup_ext_ids(self, doi): """ Return dictionary of identifiers referring to the same things as the given DOI. """ if self.extid_map_db is None: return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = self.extid_map_db.execute( "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", [doi.lower()]).fetchone() if row is None: return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) row = [str(cell or '') or None for cell in row] return dict( core_id=row[0], pmid=row[1], pmcid=row[2], wikidata_qid=row[3], # TODO: arxiv_id=None, jstor_id=None, ) def parse_record(self, obj): """ Mapping datacite JSON to ReleaseEntity. """ if not obj or not isinstance(obj, dict): return None if 'attributes' not in obj: return None attributes = obj['attributes'] doi = clean_doi(attributes.get('doi', '').lower()) if not doi: print('skipping record without a DOI', file=sys.stderr) return if not str.isascii(doi): print('[{}] skipping non-ascii doi for now'.format(doi)) return None creators = attributes.get('creators', []) or [] contributors = attributes.get('contributors', []) or [] # Much fewer than creators. contribs = self.parse_datacite_creators(creators, doi=doi) # Beside creators, we have contributors in datacite. Sample: # ContactPerson, DataCollector, DataCurator, DataManager, Distributor, # Editor, Funder, HostingInstitution, Other, Producer, ProjectLeader, # ProjectMember, RelatedPerson, ResearchGroup, Researcher, # RightsHolder, Sponsor, Supervisor # # Datacite schema: # https://schema.datacite.org/meta/kernel-4.3/doc/DataCite-MetadataKernel_v4.3.pdf#page=32 # -- could be used as a form of controlled vocab? # # Currently (07/2020) in release_contrib: # # select count(*), role from release_contrib group by role; # count | role # -----------+------------ # 500269665 | author # 4386563 | editor # 17871 | translator # 10870584 | # (4 rows) # # Related: https://guide.fatcat.wiki/entity_release.html -- role # (string, of a set): the type of contribution, from a controlled # vocabulary. TODO: vocabulary needs review. contribs_extra_contributors = self.parse_datacite_creators(contributors, set_index=False, doi=doi) # Unfortunately, creators and contributors might overlap, refs GH59. for cc in contribs_extra_contributors: if contributor_list_contains_contributor(contribs, cc): continue contribs.append(cc) # Title, may come with "attributes.titles[].titleType", like # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" titles = attributes.get('titles', []) or [] title, original_language_title, subtitle = parse_datacite_titles( titles) if title is None: print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) return False title = clean(title) if not title: print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) return False # check for blocklisted "spam", e.g. "FULL MOVIE" for rule in DATACITE_TITLE_SPAM_WORDGROUPS: seen = set() for token in rule.get("tokens", []): if token in title.lower(): seen.add(token) if len(seen) >= rule.get("min"): print("[{}] skipping spammy title: {}".format(doi, obj), file=sys.stderr) return False if not subtitle: subtitle = None else: subtitle = clean(subtitle) # Dates. A few internal dates (registered, created, updated) and # published (0..2554). We try to work with typed date list, in # "attributes.dates[].dateType", values: "Accepted", "Available" # "Collected", "Copyrighted", "Created", "Issued", "Submitted", # "Updated", "Valid". release_date, release_month, release_year = parse_datacite_dates( attributes.get('dates', [])) # block bogus far-future years/dates if release_year is not None and (release_year > (self.this_year + 5) or release_year < 1000): release_date = None release_month = None release_year = None # Some records do not use the "dates" field (e.g. micropub), but: # "attributes.published" or "attributes.publicationYear" if not any((release_date, release_month, release_year)): release_date, release_month, release_year = parse_single_date(attributes.get('publicationYear')) if not any((release_date, release_month, release_year)): release_date, release_month, release_year = parse_single_date(attributes.get('published')) if not any((release_date, release_month, release_year)): print('[{}] record w/o date: {}'.format(doi, obj), file=sys.stderr) # Start with clear stages, e.g. published. TODO(martin): we could # probably infer a bit more from the relations, e.g. # "IsPreviousVersionOf" or "IsNewVersionOf". release_stage = 'published' # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, # we might want something else than 'published'. See also: # https://support.datacite.org/docs/doi-states. # Publisher. A few NA values. A few bogus values. publisher = attributes.get('publisher') if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): publisher = None release_stage = None if publisher is not None and len(publisher) > 80: # Arbitrary magic value max length. TODO(martin): better heuristic, # but factored out; first we have to log misses. Example: # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt # werden" publisher = None if publisher: publisher = clean(publisher) # Container. For the moment, only ISSN as container. container_id = None container_name = None container = attributes.get('container', {}) or {} if container.get('type') in CONTAINER_TYPE_MAP.keys(): container_type = CONTAINER_TYPE_MAP.get(container['type']) if container.get('identifier') and container.get( 'identifierType') == 'ISSN': issn = container.get('identifier') if len(issn) == 8: issn = issn[:4] + "-" + issn[4:] issnl = self.issn2issnl(issn) if issnl is not None: container_id = self.lookup_issnl(issnl) if container_id is None and container.get('title'): container_name = container.get('title') if isinstance(container_name, list): if len(container_name) > 0: print('[{}] too many container titles: {}'.format(doi, len(container_name))) container_name = container_name[0] assert isinstance(container_name, str) ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, container_type=container_type, name=container_name, ) ce_edit = self.create_container(ce) container_id = ce_edit.ident self._issnl_id_map[issnl] = container_id else: # TODO(martin): factor this out into a testable function. # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 container_name = container.get('title') if isinstance(container_name, list): if len(container_name) > 0: print('[{}] too many container titles: {}'.format(doi, len(container_name))) container_name = container_name[0] # Exception: https://www.micropublication.org/, see: !MR24. if container_id is None and container_name is None: if publisher and publisher.lower().startswith('micropublication'): container_name = publisher # Volume and issue. volume = container.get('volume') issue = container.get('issue') if volume: volume = clean(volume) if issue: issue = clean(issue) # Pages. pages = None first_page = container.get('firstPage') last_page = container.get('lastPage') if first_page and last_page: try: _ = int(first_page) < int(last_page) pages = '{}-{}'.format(first_page, last_page) except ValueError as err: # noqa: F841 # TODO(martin): This is more debug than info. # print('[{}] {}'.format(doi, err), file=sys.stderr) pass if not pages and first_page: pages = first_page # License. license_slug = None license_extra = [] for lic in attributes.get('rightsList', []): slug = lookup_license_slug(lic.get('rightsUri')) if slug: license_slug = slug license_extra.append(lic) release_type = self.datacite_release_type(doi, attributes) # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). Try to crush it with langcodes: "It may sound to you # like langcodes solves a pretty boring problem. At one level, that's # right. Sometimes you have a boring problem, and it's great when a # library solves it for you." -- TODO(martin): We need more of these. language = None value = attributes.get('language', '') or '' try: language = pycountry.languages.lookup(value).alpha_2 except (LookupError, AttributeError) as err: # noqa: F841 pass # TODO(martin): Print this on debug level, only. # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr) # Abstracts appear in "attributes.descriptions[].descriptionType", some # of the observed values: "Methods", "TechnicalInfo", # "SeriesInformation", "Other", "TableOfContents", "Abstract". The # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] descs = attributes.get('descriptions', []) or [] for desc in descs: if not desc.get('descriptionType') == 'Abstract': continue # Description maybe a string, int or list. text = desc.get('description', '') if not text: continue if isinstance(text, int): text = '{}'.format(text) if isinstance(text, list): try: text = "\n".join(text) except TypeError: continue # Bail out, if it is not a list of strings. # Limit length. if len(text) < 10: continue if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" # Detect language. This is fuzzy and may be removed, if too unreliable. lang = None try: lang = langdetect.detect(text) except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) abstracts.append( fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", content=clean(text), lang=lang, )) # References and relations. Datacite include many relation types in # "attributes.relatedIdentifiers[].relationType", e.g. # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf", # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion", # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart", # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf", # "IsDerivedFrom", "IsSourceOf". # # For the moment, we only care about References. refs, ref_index = [], 0 relIds = attributes.get('relatedIdentifiers', []) or [] for rel in relIds: if not rel.get('relationType', '') in ('References', 'Cites'): continue ref_extra = dict() if rel.get('relatedIdentifierType', '') == 'DOI': ref_extra['doi'] = rel.get('relatedIdentifier') if not ref_extra: ref_extra = None refs.append( fatcat_openapi_client.ReleaseRef( index=ref_index, extra=ref_extra, )) ref_index += 1 # More specific release_type via 'Reviews' relationsship. for rel in relIds: if rel.get('relatedIdentifierType', '') != 'Reviews': continue release_type = 'review' # Extra information. extra_datacite = dict() if license_extra: extra_datacite['license'] = license_extra if attributes.get('subjects'): extra_datacite['subjects'] = attributes['subjects'] # Include version information. metadata_version = attributes.get('metadataVersion') or '' if metadata_version: extra_datacite['metadataVersion'] = metadata_version # Include resource types. types = attributes.get('types', {}) or {} resource_type = types.get('resourceType', '') or '' resource_type_general = types.get('resourceTypeGeneral', '') or '' if resource_type and resource_type.lower() not in UNKNOWN_MARKERS_LOWER: extra_datacite['resourceType'] = resource_type if resource_type_general and resource_type_general.lower() not in UNKNOWN_MARKERS_LOWER: extra_datacite['resourceTypeGeneral'] = resource_type_general # Include certain relations from relatedIdentifiers. Keeping the # original structure of data here, which is a list of dicts, with # relation type, identifier and identifier type (mostly). relations = [] for rel in relIds: if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', 'IsVariantFormOf', 'IsSupplementTo', 'HasVersion', 'IsMetadataFor', 'IsNewVersionOf', 'IsIdenticalTo', 'IsVersionOf', 'IsDerivedFrom', 'IsSourceOf'): relations.append(rel) if relations: extra_datacite['relations'] = relations extra = dict() # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0", # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555", # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", # "10161", "10010691", "10780", # "Presentación" version = attributes.get('version') or None # top-level extra keys if not container_id and container_name: extra['container_name'] = container_name # Always include datacite key, even if value is empty (dict). extra['datacite'] = extra_datacite # Preparation for a schema update. if release_month: extra['release_month'] = release_month extids = self.lookup_ext_ids(doi=doi) # Assemble release. re = fatcat_openapi_client.ReleaseEntity( work_id=None, container_id=container_id, release_type=release_type, release_stage=release_stage, title=title, subtitle=subtitle, original_title=original_language_title, release_year=release_year, release_date=release_date, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( doi=doi, pmid=extids['pmid'], pmcid=extids['pmcid'], wikidata_qid=extids['wikidata_qid'], core=extids['core_id'], arxiv=extids['arxiv_id'], jstor=extids['jstor_id'], ), contribs=contribs, volume=volume, issue=issue, pages=pages, language=language, abstracts=abstracts, refs=refs, extra=extra, license_slug=license_slug, version=version, ) re = self.biblio_hacks(re) return re @staticmethod def datacite_release_type(doi, attributes): """ Release type. Try to determine the release type from a variety of types supplied in datacite. The "attributes.types.resourceType" is uncontrolled (170000+ unique values, from "null", "Dataset" to "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP flows in 2009") citeproc may be the closest, but not always supplied. Order lookup roughly by completeness of mapping. """ release_type = None if not attributes.get('types'): return None types = attributes['types'] for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): value = types.get(typeType) release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break # special case: figshare "collections" which group other entities if doi.startswith('10.6084/') or doi.startswith('10.25384'): if types.get('resourceType') == "Collection": release_type = "stub" if release_type is None: print("[{}] no mapped type: {}".format(doi, types), file=sys.stderr) return release_type @staticmethod def biblio_hacks(re): """ This function handles known special cases. For example, publisher-specific or platform-specific workarounds. """ # only runs on datacite entities with a DOI assert re.ext_ids.doi # release_type exception: Global Biodiversity Information Facility # publishes highly interesting datasets, but titles are mostly the same # ("GBIF Occurrence Download" or "Occurrence Download"); set # release_type to "stub" (CSL/FC). if re.title == 'GBIF Occurrence Download' and re.ext_ids.doi.startswith('10.15468/dl.'): re.release_type = 'stub' # release_type exception: lots of "Experimental Crystal Structure Determination" # publisher: "Cambridge Crystallographic Data Centre" if re.ext_ids.doi.startswith('10.5517/'): re.release_type = 'entry' # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire." if re.title.lower().startswith('additional file') and re.release_type in ('article', 'article-journal'): re.release_type = 'component' # figshare if re.ext_ids.doi.startswith('10.6084/') or re.ext_ids.doi.startswith('10.25384'): # set version if DOI ends with versioned suffix doi_suffix = re.ext_ids.doi.split('.')[-1] if doi_suffix and doi_suffix.startswith('v') and doi_suffix[1:].isdigit(): re.version = doi_suffix # "Figure 123 from " -> component # "Table S1. ;Figure S1;Figure S2. ;Figure S3. ;Figure S4. from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" if " from " in re.title and re.release_type not in ('stub', 'graphic'): if re.title.startswith("Figure "): re.release_type = "component" elif re.title.startswith("Table "): re.release_type = "component" # figshare.com if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.extra.get('container_name') is None: re.extra['container_name'] = "figshare.com" return re def try_update(self, re): """ When debug is true, write the RE to stdout, not to the database. Might hide schema mismatch bugs. """ if self.debug is True: print(json.dumps(entity_to_dict(re, api_client=None))) return False # lookup existing DOI (don't need to try other ext idents for crossref) existing = None try: existing = self.api.lookup_release(doi=re.ext_ids.doi) except fatcat_openapi_client.rest.ApiException as err: if err.status != 404: raise err # eventually we'll want to support "updates", but for now just skip if # entity already exists if existing: self.counts['exists'] += 1 return False return True def insert_batch(self, batch): print('inserting batch ({})'.format(len(batch)), file=sys.stderr) if self.insert_log_file: with open(self.insert_log_file, 'a') as f: for doc in batch: json.dump(entity_to_dict(doc, api_client=None), f) f.write('\n') self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( description=self.editgroup_description, extra=self.editgroup_extra), entity_list=batch)) def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): """ Parses a list of creators into a list of ReleaseContrib objects. Set set_index to False, if the index contrib field should be left blank. The doi parameter is only used for debugging. """ # Contributors. Many nameIdentifierSchemes, we do not use (yet): # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. contribs = [] # Names, that should be ignored right away. name_blocklist = set(('Occdownload Gbif.Org',)) i = 0 for c in creators: if not set_index: i = None nameType = c.get('nameType', '') or '' if nameType in ('', 'Personal'): creator_id = None for nid in c.get('nameIdentifiers', []) or []: if not isinstance(nid, dict): # see: fatcat-workers/issues/44035/ print('unexpected nameIdentifiers, expected list of dicts, got: {}'.format(nid), file=sys.stderr) continue name_scheme = nid.get('nameIdentifierScheme', '') or '' if not name_scheme.lower() == "orcid": continue orcid = nid.get('nameIdentifier') or '' orcid = orcid.replace('https://orcid.org/', '') if not orcid: continue creator_id = self.lookup_orcid(orcid) # TODO(martin): If creator_id is None, should we create creators? # If there are multiple affiliation strings, use the first one. affiliations = c.get('affiliation', []) or [] raw_affiliation = None if len(affiliations) == 0: raw_affiliation = None else: raw_affiliation = clean(affiliations[0]) name = c.get('name') given_name = c.get('givenName') surname = c.get('familyName') if name: name = clean(name) if not any((name, given_name, surname)): continue if not name: name = "{} {}".format(given_name or '', surname or '').strip() if name in name_blocklist: continue if name.lower() in UNKNOWN_MARKERS_LOWER: continue # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. if name: name = index_form_to_display_name(name) if given_name: given_name = clean(given_name) if surname: surname = clean(surname) # Perform a final assertion that name does not reduce to zero # (e.g. whitespace only name). if name: name = name.strip() if not name: continue if raw_affiliation == '': continue extra = None # "DataManager", "DataCurator", "ContactPerson", "Distributor", # "RegistrationAgency", "Sponsor", "Researcher", # "RelatedPerson", "ProjectLeader", "Editor", "Other", # "ProjectMember", "Funder", "RightsHolder", "DataCollector", # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" contributorType = c.get('contributorType', '') or '' if contributorType: extra = {'type': contributorType} rc = fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, raw_name=name, given_name=given_name, surname=surname, role=role, raw_affiliation=raw_affiliation, extra=extra, ) # Filter out duplicates early. if not contributor_list_contains_contributor(contribs, rc): contribs.append(rc) if i is not None: i += 1 elif nameType == 'Organizational': name = c.get('name', '') or '' if name in UNKNOWN_MARKERS: continue if len(name) < 3: continue extra = {'organization': name} contribs.append(fatcat_openapi_client.ReleaseContrib( index=i, extra=extra)) if i is not None: i += 1 else: print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) return contribs def contributor_list_contains_contributor(contributor_list, contributor): """ Given a list of contributors, determine, whether contrib is in that list. """ for cc in contributor_list: if cc.raw_name != contributor.raw_name: continue cc_role = cc.role or 'author' contributor_role = contributor.role or 'author' if cc_role != contributor_role: continue return True return False def lookup_license_slug(raw): """ Resolve a variety of strings into a some pseudo-canonical form, e.g. CC-BY-ND, CC-0, MIT and so on. TODO(martin): reuse from or combine with crossref, maybe. """ if not raw: return None if 'creativecommons.org/publicdomain/zero' in raw: return 'CC-0' if raw.lower().endswith('/cc0'): return 'CC-0' if 'creativecommons' in raw: # https://creativecommons.org/publicdomain/mark/1.0/deed.de if 'creativecommons.org/publicdomain' in raw: return 'CC-PUBLICDOMAIN' if 'creativecommons.org/share-your-work/public-domain/cc0' in raw: return 'CC-0' # https://creativecommons.org/licenses/by/4.0/deed.es_ES raw = raw.lower() match = re.search(r'creativecommons.org/licen[sc]es/(?P<name>[a-z-]+)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None name = match.groupdict().get('name') if not name: return None if not name.startswith('cc'): name = 'cc-{}'.format(name) return name.upper() if 'opensource.org' in raw: # https://opensource.org/licenses/alphabetical, e.g. opensource.org/licenses/EUPL-1.2 match = re.search(r'opensource.org/licenses/(?P<name>[^/]+)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None name = match.groupdict().get('name') if not name: return None if len(name) > 11: return None return name.upper() if 'gnu.org' in raw: # http://www.gnu.org/copyleft/gpl, https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html match = re.search(r'/(?P<name>fdl(-[0-9.]*[0-9]+)?|gpl(-[0-9.]*[0-9]+)?|lgpl(-[0-9.]*[0-9]+)|aglp(-[0-9.]*[0-9]+)?)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None name = match.groupdict().get('name') if not name: return None if len(name) > 8: return None return name.upper() if 'spdx.org' in raw: if 'spdx.org/licenses/CC0' in raw: return 'CC-0' # https://spdx.org/licenses/CC-BY-NC-ND-4.0.html match = re.search(r'spdx.org/licenses/(?P<name>[a-z0-9-]+)', raw, re.IGNORECASE) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None name = match.groupdict().get('name') if not name: return None if len(name) > 36: return None # cleanup version and extensions name = re.sub('(-[0-9])?[.]?[0-9]?(.json|.html)?', '', name.lower()) return name.upper() if 'rightsstatements.org' in raw: # http://rightsstatements.org/vocab/InC/1.0/ match = re.search(r'rightsstatements.org/(vocab|page)/(?P<name>[^/]*)', raw) if not match: print('missed potential license: {}'.format(raw), file=sys.stderr) return None name = match.groupdict().get('name') if not name: return None if len(name) > 9: return None return 'RS-{}'.format(name.upper()) # Fallback to mapped values. raw = raw.lower() raw = raw.strip().replace('http://', '//').replace('https://', '//') if not raw.endswith('/'): raw = raw + '/' return LICENSE_SLUG_MAP.get(raw) def find_original_language_title(item, min_length=4, max_questionmarks=3): """ Perform a few checks before returning a potential original language title. Example input: {'title': 'Some title', 'original_language_title': 'Some title'} """ if not 'original_language_title' in item: return None title = item.get('title') if not title: return None original_language_title = item.get('original_language_title') if isinstance(original_language_title, str) and title != original_language_title: if len(original_language_title) < min_length: return None if original_language_title.count('?') > max_questionmarks: return None return original_language_title if isinstance(original_language_title, dict): content = original_language_title.get('__content__', '') or '' if content and content != title and not content.count( '?') > max_questionmarks: return content return None def parse_datacite_titles(titles): """ Given a list of title items from datacite, return 3-tuple (title, original_language_title, subtitle). Example input: [{"title": "Meeting Heterogeneity in Consumer Demand"}] """ title, original_language_title, subtitle = None, None, None if titles is None: return title, original_language_title, subtitle if len(titles) == 0: return title, original_language_title, subtitle elif len(titles) == 1: original_language_title = find_original_language_title(titles[0]) title = titles[0].get('title', '') or '' title = title.strip() if not title: title = None return title, original_language_title, subtitle else: for entry in titles: if not title and ('titleType' not in entry or not entry.get('titleType')): title = entry.get('title').strip() if not subtitle and entry.get('titleType') == 'Subtitle': subtitle = entry.get('title', '').strip() if not original_language_title: original_language_title = find_original_language_title(entry) return title, original_language_title, subtitle def parse_single_date(value): """ Given a single string containing a date in arbitrary format, try to return tuple (date: datetime.date, month: int, year: int). """ if not value: return None, None, None if isinstance(value, int): value = str(value) parser = dateparser.DateDataParser() try: # Results in a dict with keys: date_obj, period, locale. parse_result = parser.get_date_data(value) # A datetime object, later we need a date, only. result = parse_result['date_obj'] if result is not None: if parse_result['period'] == 'year': return None, None, result.year elif parse_result['period'] == 'month': return None, result.month, result.year else: return result.date(), result.month, result.year except TypeError as err: print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) return None, None, None def parse_datacite_dates(dates): """ Given a list of date fields (under .dates), return tuple, (release_date, release_year). """ release_date, release_month, release_year = None, None, None if not dates: return release_date, release_month, release_year if not isinstance(dates, list): raise ValueError('expected a list of date items') # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", # "Collected", "Updated", "Copyrighted", "Created" # Ignored for now: "Collected", "Issued" date_type_prio = ( 'Valid', 'Available', 'Accepted', 'Submitted', 'Copyrighted', 'Created', 'Updated', ) # We need to note the granularity, since a string like "2019" would be # parsed into "2019-01-01", even though the month is unknown. Use 3 # granularity types: 'y', 'm', 'd'. Pattern = collections.namedtuple('Pattern', 'layout granularity') # Before using (expensive) dateparser, try a few common patterns. common_patterns = ( Pattern('%Y-%m-%d', 'd'), Pattern('%Y-%m', 'm'), Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), Pattern('%Y-%m-%dT%H:%M:%S', 'd'), Pattern('%Y', 'y'), ) def parse_item(item): result, value, year_only = None, str(item.get('date', '')) or '', False release_date, release_month, release_year = None, None, None for layout, granularity in common_patterns: try: result = datetime.datetime.strptime(value, layout) except ValueError: continue else: if granularity == 'y': year_only = True break if result is None: print('fallback for {}'.format(value), file=sys.stderr) release_date, release_month, release_year = parse_single_date(value) if result is None: # Unparsable date. return release_date, release_month, release_year if granularity != 'y': release_date = result.date() release_year = result.year if granularity in ('m', 'd'): release_month = result.month return release_date, release_month, release_year today = datetime.date.today() for prio in date_type_prio: for item in dates: if not item.get('dateType') == prio: continue release_date, release_month, release_year = parse_item(item) if release_date is None and release_year is None: continue if release_year < 1000 or release_year > today.year + 5: # Skip possibly bogus dates. release_year = None continue break else: continue break if release_date is None and release_year is None: for item in dates: release_date, release_month, release_year = parse_item(item) if release_year or release_date: break return release_date, release_month, release_year def index_form_to_display_name(s): """ Try to convert an index form name, like 'Razis, Panos A' into display_name, e.g. 'Panos A Razis'. """ if ',' not in s: return s skip_on_chars = ['(', ')', '*'] for char in skip_on_chars: if char in s: return s if s.count(',') > 1: # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" return s # Not names, but sprinkled in fields where authors live. stopwords = [s.lower() for s in ( 'Archive', 'Collection', 'Coordinator', 'Department', 'Germany', 'International', 'National', 'Netherlands', 'Office', 'Organisation', 'Organization', 'Service', 'Services', 'United States', 'University', 'Verein', 'Volkshochschule', )] lower = s.lower() for stop in stopwords: if stop in lower: return s a, b = s.split(',') return '{} {}'.format(b.strip(), a.strip())