author     Martin Czygan <martin.czygan@gmail.com>   2020-01-08 23:31:40 +0100
committer  Martin Czygan <martin.czygan@gmail.com>   2020-01-08 23:31:40 +0100
commit     081746837a55bf5f34c96f12f1abb5a00d5b478c (patch)
tree       88af1ade558ad6695918d36648b3ed4a5bea6954 /python/fatcat_tools/importers
parent     27723a61bde5591bae8115d801d0d09b7ef01b03 (diff)
parent     277bd183d7139bb1a8857bc2a48c0aa92012455d (diff)
download   fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.tar.gz
           fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.zip
Merge branch 'martin-datacite-import'
Pipfile.lock is broken.
* martin-datacite-import: (68 commits)
datacite: pass in doi into factored out method
datacite: reformat test cases and use jq . --sort-keys
datacite: factor out contributor handling
datacite: catch type mismatch in language detection
datacite: adjust tests for release_month
datacite: name extra.month, extra.release_month
datacite: mark additional files as stub
datacite: CCDC are entries, mostly
datacite: use more specific release_type, if possible
datacite: ignore certain names
datacite: over 3% records have the same title: stub
datacite: fill a few more release_type gaps
datacite: adding datacite-specific extra metadata
datacite: apply pylint suggestions
datacite: fix typos
datacite: set release_stage to published by default
datacite: month field should be top-level
datacite: include month in extra
datacite: indicate mismatched file in test
datacite: clean abstracts, use unknown value tokens
...
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--   python/fatcat_tools/importers/__init__.py      1
-rw-r--r--   python/fatcat_tools/importers/datacite.py   1023
2 files changed, 1024 insertions, 0 deletions
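This commit exports DataciteImporter from the importers package; as the package docstring in the hunk below puts it, an import run combines an importer with one of the pusher classes. A minimal usage sketch, assuming an authenticated API handle; the file paths here are illustrative placeholders, not part of this commit:

    from fatcat_tools.importers import DataciteImporter, JsonLinePusher

    api = ...  # an authenticated fatcat_openapi_client API handle (placeholder)
    importer = DataciteImporter(
        api,
        issn_map_file='ISSN-to-ISSN-L.txt',       # hypothetical path
        debug=False,                              # True prints entities instead of inserting
        insert_log_file='datacite-inserts.json',  # hypothetical path; optional insert log
    )
    # Reads one datacite JSON document per line and feeds each to parse_record().
    with open('datacite-dump.json') as f:
        JsonLinePusher(importer, f).run()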
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index bb9c5b17..d936605f 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -14,6 +14,7 @@ To run an import you combine two classes; one each of:
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
+from .datacite import DataciteImporter
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
new file mode 100644
index 00000000..b1862b44
--- /dev/null
+++ b/python/fatcat_tools/importers/datacite.py
@@ -0,0 +1,1023 @@
+"""
+Prototype importer for datacite.org data.
+
+Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+
+Datacite being an aggregator, the data is varied and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which can be tested more easily.
+"""
+
+import collections
+import datetime
+import hashlib
+import json
+import sqlite3
+import sys
+
+import dateparser
+import fatcat_openapi_client
+import langdetect
+import pycountry
+
+from fatcat_tools.normal import clean_doi
+from fatcat_tools.transforms import entity_to_dict
+
+from .common import EntityImporter, clean
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
+# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
+CONTAINER_TYPE_MAP = {
+    'Journal': 'journal',
+    'Series': 'journal',
+    'Book Series': 'book-series',
+}
+
+# The docs/guide should be the canonical home for these mappings; update there
+# first. Map various datacite type vocabularies to CSL-ish types. None means
+# TODO or remove.
+DATACITE_TYPE_MAP = {
+    'ris': {
+        'THES': 'thesis',
+        'SOUND': 'song',  # 99.9% maps to citeproc song, so use that (exception: report)
+        'CHAP': 'chapter',
+        'FIGURE': 'figure',
+        'RPRT': 'report',
+        'JOUR': 'article-journal',
+        'MPCT': 'motion_picture',
+        'GEN': 'article-journal',  # GEN consists of 99% article and report, post-weblog, misc - and one dataset
+        'BOOK': 'book',
+        'DATA': 'dataset',
+        'COMP': 'software',
+    },
+    'schemaOrg': {
+        'Dataset': 'dataset',
+        'Book': 'book',
+        'ScholarlyArticle': 'article-journal',
+        'ImageObject': 'graphic',
+        'Collection': None,
+        'MediaObject': None,
+        'Event': None,
+        'SoftwareSourceCode': 'software',
+        'Chapter': 'chapter',
+        'CreativeWork': None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+        'PublicationIssue': 'article',
+        'AudioObject': None,
+        'Thesis': 'thesis',
+    },
+    'citeproc': {
+        'article': 'article',
+        'article-journal': 'article-journal',
+        'article-magazine': 'article-magazine',
+        'article-newspaper': 'article-newspaper',
+        'bill': 'bill',
+        'book': 'book',
+        'broadcast': 'broadcast',
+        'chapter': 'chapter',
+        'dataset': 'dataset',
+        'entry-dictionary': 'entry-dictionary',
+        'entry-encyclopedia': 'entry-encyclopedia',
+        'entry': 'entry',
+        'figure': 'figure',
+        'graphic': 'graphic',
+        'interview': 'interview',
+        'legal_case': 'legal_case',
+        'legislation': 'legislation',
+        'manuscript': 'manuscript',
+        'map': 'map',
+        'motion_picture': 'motion_picture',
+        'musical_score': 'musical_score',
+        'pamphlet': 'pamphlet',
+        'paper-conference': 'paper-conference',
+        'patent': 'patent',
+        'personal_communication': 'personal_communication',
+        'post': 'post',
+        'post-weblog': 'post-weblog',
+        'report': 'report',
+        'review-book': 'review-book',
+        'review': 'review',
+        'song': 'song',
+        'speech': 'speech',
+        'thesis': 'thesis',
+        'treaty': 'treaty',
+        'webpage': 'webpage',
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
+    'bibtex': {
+        'phdthesis': 'thesis',
+        'inbook': 'chapter',
+        'misc': None,
+        'article': 'article-journal',
+        'book': 'book',
+    },
+    'resourceTypeGeneral': {
+        'Image': 'graphic',
+        'Dataset': 'dataset',
+        'PhysicalObject': None,
+        'Collection': None,
+        'Text': None,  # "Greyliterature, labnotes, accompanyingmaterials"
+        'Sound': None,
+        'InteractiveResource': None,
+        'Event': None,
+        'Software': 'software',
+        'Other': None,
+        'Workflow': None,
+        'Audiovisual': None,
+    }  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+}
+
+# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
+DATACITE_UNKNOWN_MARKERS = (
+    '(:unac)',  # temporarily inaccessible
+    '(:unal)',  # unallowed, suppressed intentionally
+    '(:unap)',  # not applicable, makes no sense
+    '(:unas)',  # value unassigned (e.g., Untitled)
+    '(:unav)',  # value unavailable, possibly unknown
+    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue)
+    '(:none)',  # never had a value, never will
+    '(:null)',  # explicitly and meaningfully empty
+    '(:tba)',   # to be assigned or announced later
+    '(:etal)',  # too numerous to list (et alia)
+)
+
+# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking
+# unknown values.
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
+    'NA',
+    'NN',
+    'n.a.',
+    '[s.n.]',
+)))
+
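# --- Editor's note, not part of this diff: the type maps above are consulted
# --- in a fixed priority order at import time. This standalone sketch mirrors
# --- the lookup loop that appears further down in parse_record(); the sample
# --- attributes dict is invented for illustration.
attributes = {'types': {'ris': 'GEN', 'resourceTypeGeneral': 'Text'}}
release_type = None
for type_type in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
    value = attributes.get('types', {}).get(type_type)
    release_type = DATACITE_TYPE_MAP.get(type_type, {}).get(value)
    if release_type is not None:
        break
# 'citeproc' is absent and 'ris' maps 'GEN' to 'article-journal', so that wins.
assert release_type == 'article-journal'
# --- End editor's note; the TODO below refers to the license map that follows.

+# TODO(martin): merge this with other maps, maybe.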
+LICENSE_SLUG_MAP = {
+    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
+    "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//opensource.org/licenses/MIT": "MIT",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
+    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
+    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1",  # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/MIT": "MIT",
+    # "http://royalsocietypublishing.org/licence": "",  # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
+    # "http://rsc.li/journals-terms-of-use": "RSC",
+    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "",  # 53 UrhG.
+ # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "", + # "http://www.springer.com/tdm": "", + # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html + # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "", + # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "", + # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "", + # "https://rightsstatements.org/page/InC/1.0?language=en": "", + # "https://services.ceda.ac.uk/cedasite/register/info": "", + # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404 + # "https://www.cambridge.org/core/terms": "", + # "https://www.elsevier.com/tdm/userlicense/1.0", + # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights + # "info:eu-repo/semantics/embargoedAccess": "", + # "info:eu-repo/semantics/openAccess": "", + # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice). +} + +# TODO(martin): drop this after 3.7 upgrade +try: + isascii = str.isascii # new in 3.7, https://docs.python.org/3/library/stdtypes.html#str.isascii +except AttributeError: + isascii = lambda s: len(s) == len(s.encode()) + + +class DataciteImporter(EntityImporter): + """ + Importer for datacite records. + """ + def __init__(self, + api, + issn_map_file, + debug=False, + insert_log_file=None, + **kwargs): + + eg_desc = kwargs.get( + 'editgroup_description', + "Automated import of Datacite DOI metadata, harvested from REST API" + ) + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', + 'fatcat_tools.DataciteImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.create_containers = kwargs.get('create_containers', True) + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri), file=sys.stderr) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map", file=sys.stderr) + + self.read_issn_map_file(issn_map_file) + self.debug = debug + self.insert_log_file = insert_log_file + + print('datacite with debug={}'.format(self.debug), file=sys.stderr) + + def lookup_ext_ids(self, doi): + """ + Return dictionary of identifiers refering to the same things as the given DOI. + """ + if self.extid_map_db is None: + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) + + def parse_record(self, obj): + """ + Mapping datacite JSON to ReleaseEntity. 
+ """ + if not obj or not isinstance(obj, dict): + return None + if 'attributes' not in obj: + return None + + attributes = obj['attributes'] + doi = clean_doi(attributes.get('doi', '').lower()) + + if not isascii(doi): + print('[{}] skipping non-ascii doi for now'.format(doi)) + return None + + + creators = attributes.get('creators', []) or [] + contributors = attributes.get('contributors', []) or [] # Much fewer than creators. + + contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + + # Title, may come with "attributes.titles[].titleType", like + # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" + titles = attributes.get('titles', []) or [] + title, original_language_title, subtitle = parse_datacite_titles( + titles) + + if title is None: + print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + return False + + title = clean(title) + if not title: + print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + return False + + if not subtitle: + subtitle = None + else: + subtitle = clean(subtitle) + + # Dates. A few internal dates (registered, created, updated) and + # published (0..2554). We try to work with typed date list, in + # "attributes.dates[].dateType", values: "Accepted", "Available" + # "Collected", "Copyrighted", "Created", "Issued", "Submitted", + # "Updated", "Valid". + release_date, release_month, release_year = parse_datacite_dates( + attributes.get('dates', [])) + + # Start with clear stages, e.g. published. TODO(martin): we could + # probably infer a bit more from the relations, e.g. + # "IsPreviousVersionOf" or "IsNewVersionOf". + release_stage = 'published' + + # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, + # we might want something else than 'published'. See also: + # https://support.datacite.org/docs/doi-states. + + # Publisher. A few NA values. A few bogus values. + publisher = attributes.get('publisher') + + if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): + publisher = None + release_stage = None + if publisher is not None and len(publisher) > 80: + # Arbitrary magic value max length. TODO(martin): better heuristic, + # but factored out; first we have to log misses. Example: + # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, + # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / + # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt + # werden" + publisher = None + + if publisher: + publisher = clean(publisher) + + # Container. For the moment, only ISSN as container. 
+ container_id = None + container_name = None + + container = attributes.get('container', {}) or {} + if container.get('type') in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container['type']) + if container.get('identifier') and container.get( + 'identifierType') == 'ISSN': + issn = container.get('identifier') + if len(issn) == 8: + issn = issn[:4] + "-" + issn[4:] + issnl = self.issn2issnl(issn) + if issnl is not None: + container_id = self.lookup_issnl(issnl) + + if container_id is None and container.get('title'): + container_name = container.get('title') + if isinstance(container_name, list): + if len(container_name) > 0: + print('[{}] too many container titles: {}'.format(doi, + len(container_name))) + container_name = container_name[0] + assert isinstance(container_name, str) + ce = fatcat_openapi_client.ContainerEntity( + issnl=issnl, + container_type=container_type, + name=container_name, + ) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + self._issnl_id_map[issnl] = container_id + else: + # TODO(martin): factor this out into a testable function. + # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 + container_name = container.get('title') + if isinstance(container_name, list): + if len(container_name) > 0: + print('[{}] too many container titles: {}'.format(doi, + len(container_name))) + container_name = container_name[0] + + # Volume and issue. + volume = container.get('volume') + issue = container.get('issue') + + if volume: + volume = clean(volume) + + if issue: + issue = clean(issue) + + # Pages. + pages = None + + first_page = container.get('firstPage') + last_page = container.get('lastPage') + + if first_page and last_page: + try: + _ = int(first_page) < int(last_page) + pages = '{}-{}'.format(first_page, last_page) + except ValueError as err: + # TODO(martin): This is more debug than info. + # print('[{}] {}'.format(doi, err), file=sys.stderr) + pass + + if not pages and first_page: + pages = first_page + + # License. + license_slug = None + license_extra = [] + + for l in attributes.get('rightsList', []): + slug = lookup_license_slug(l.get('rightsUri')) + if slug: + license_slug = slug + license_extra.append(l) + + # Release type. Try to determine the release type from a variety of + # types supplied in datacite. The "attributes.types.resourceType" is + # uncontrolled (170000+ unique values, from "null", "Dataset" to + # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP + # flows in 2009") citeproc may be the closest, but not always supplied. + # Order lookup roughly by completeness of mapping. + for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + value = attributes.get('types', {}).get(typeType) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) + if release_type is not None: + break + + if release_type is None: + print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) + + # release_type exception: Global Biodiversity Information Facility + # publishes highly interesting datasets, but titles are mostly the same + # ("GBIF Occurrence Download" or "Occurrence Download"); set + # release_type to "stub" (CSL/FC). + if publisher == 'The Global Biodiversity Information Facility': + release_type = 'stub' + + # release_type exception: lots of "Experimental Crystal Structure Determination" + if publisher == 'Cambridge Crystallographic Data Centre': + release_type = 'entry' + + # Supplement files, e.g. 
"Additional file 1: ASE constructs in questionnaire." + if title.lower().startswith('additional file'): + release_type = 'stub' + + # Language values are varied ("ger", "es", "English", "ENG", "en-us", + # "other", ...). Try to crush it with langcodes: "It may sound to you + # like langcodes solves a pretty boring problem. At one level, that's + # right. Sometimes you have a boring problem, and it's great when a + # library solves it for you." -- TODO(martin): We need more of these. + language = None + + value = attributes.get('language', '') or '' + try: + language = pycountry.languages.lookup(value).alpha_2 + except (LookupError, AttributeError) as err: + pass + # TODO(martin): Print this on debug level, only. + # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr) + + # Abstracts appear in "attributes.descriptions[].descriptionType", some + # of the observed values: "Methods", "TechnicalInfo", + # "SeriesInformation", "Other", "TableOfContents", "Abstract". The + # "Other" fields might contain references or related articles (with + # DOI). TODO(martin): maybe try to parse out some of those refs. + abstracts = [] + descs = attributes.get('descriptions', []) or [] + for desc in descs: + if not desc.get('descriptionType') == 'Abstract': + continue + if len(desc.get('description', '') or '') < 10: + continue + text = desc.get('description', '') + if len(text) > MAX_ABSTRACT_LENGTH: + text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + lang = None + try: + lang = langdetect.detect(text) + except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: + print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="text/plain", + content=clean(text), + lang=lang, + )) + + # References and relations. Datacite include many relation types in + # "attributes.relatedIdentifiers[].relationType", e.g. + # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf", + # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion", + # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart", + # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf", + # "IsDerivedFrom", "IsSourceOf". + # + # For the moment, we only care about References. + refs, ref_index = [], 0 + + relIds = attributes.get('relatedIdentifiers', []) or [] + for rel in relIds: + if not rel.get('relationType', '') in ('References', 'Cites'): + continue + ref_extra = dict() + if rel.get('relatedIdentifierType', '') == 'DOI': + ref_extra['doi'] = rel.get('relatedIdentifier') + if not ref_extra: + ref_extra = None + refs.append( + fatcat_openapi_client.ReleaseRef( + index=ref_index, + extra=ref_extra, + )) + ref_index += 1 + + # More specific release_type via 'Reviews' relationsship. + for rel in relIds: + if rel.get('relatedIdentifierType', '') != 'Reviews': + continue + release_type = 'review' + + # Extra information. + extra_datacite = dict() + + if license_extra: + extra_datacite['license'] = license_extra + if attributes.get('subjects'): + extra_datacite['subjects'] = attributes['subjects'] + + # Include version information. + metadata_version = attributes.get('metadataVersion') or '' + schema_version = attributes.get('schemaVersion') or '' + + if metadata_version: + extra_datacite['metadataVersion'] = metadata_version + if schema_version: + extra_datacite['schemaVersion'] = schema_version + + # Include resource types. 
+ types = attributes.get('types', {}) or {} + resource_type = types.get('resourceType', '') or '' + resource_type_general = types.get('resourceTypeGeneral', '') or '' + + if resource_type: + extra_datacite['resourceType'] = resource_type + if resource_type_general: + extra_datacite['resourceTypeGeneral'] = resource_type_general + + # Include certain relations from relatedIdentifiers. Keeping the + # original structure of data here, which is a list of dicts, with + # relation type, identifer and identifier type (mostly). + relations = [] + for rel in relIds: + if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', + 'IsVariantFormOf', 'IsSupplementTo', + 'HasVersion', 'IsMetadataFor', + 'IsNewVersionOf', 'IsIdenticalTo', + 'IsVersionOf', 'IsDerivedFrom', + 'IsSourceOf'): + relations.append(rel) + + if relations: + extra_datacite['relations'] = relations + + extra = dict() + + # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0", + # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555", + # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st + # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, + # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", + # "10161", "10010691", "10780", # "Presentación" + version = attributes.get('version') + + # top-level extra keys + if not container_id and container_name: + extra['container_name'] = container_name + + # Always include datacite key, even if value is empty (dict). + extra['datacite'] = extra_datacite + + # Preparation for a schema update. + if release_month: + extra['release_month'] = release_month + + extids = self.lookup_ext_ids(doi=doi) + + # Assemble release. + re = fatcat_openapi_client.ReleaseEntity( + work_id=None, + container_id=container_id, + release_type=release_type, + release_stage=release_stage, + title=title, + subtitle=subtitle, + original_title=original_language_title, + release_year=release_year, + release_date=release_date, + publisher=publisher, + ext_ids=fatcat_openapi_client.ReleaseExtIds( + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], + ), + contribs=contribs, + volume=volume, + issue=issue, + pages=pages, + language=language, + abstracts=abstracts, + refs=refs, + extra=extra, + license_slug=license_slug, + version=version, + ) + return re + + def try_update(self, re): + """ + When debug is true, write the RE to stdout, not to the database. Might + hide schema mismatch bugs. 
+ """ + if self.debug is True: + print(json.dumps(entity_to_dict(re, api_client=None))) + return False + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.ext_ids.doi) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + if self.insert_log_file: + with open(self.insert_log_file, 'a') as f: + for doc in batch: + json.dump(entity_to_dict(doc, api_client=None), f) + f.write('\n') + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): + """ + Parses a list of creators into a list of ReleaseContrib objects. Set + set_index to False, if the index contrib field should be left blank. + The doi parameter is only used for debugging. + """ + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. + contribs = [] + + # Names, that should be ignored right away. + name_blacklist = set(('Occdownload Gbif.Org',)) + + for i, c in enumerate(creators): + if not set_index: + i = None + nameType = c.get('nameType', '') or '' + if nameType in ('', 'Personal'): + creator_id = None + for nid in c.get('nameIdentifiers', []): + name_scheme = nid.get('nameIdentifierScheme', '') or '' + if not name_scheme.lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # TODO(martin): If creator_id is None, should we create creators? + + # If there are multiple affiliation strings, use the first one. + affiliations = c.get('affiliation', []) or [] + raw_affiliation = None + if len(affiliations) == 0: + raw_affiliation = None + else: + raw_affiliation = clean(affiliations[0]) + + name = c.get('name') + given_name = c.get('givenName') + surname = c.get('familyName') + + if name: + name = clean(name) + if not name: + continue + if name in name_blacklist: + continue + if name.lower() in UNKNOWN_MARKERS: + continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. 
+ if name: + name = index_form_to_display_name(name) + + if given_name: + given_name = clean(given_name) + if surname: + surname = clean(surname) + if raw_affiliation == '': + continue + + extra = None + + # "DataManager", "DataCurator", "ContactPerson", "Distributor", + # "RegistrationAgency", "Sponsor", "Researcher", + # "RelatedPerson", "ProjectLeader", "Editor", "Other", + # "ProjectMember", "Funder", "RightsHolder", "DataCollector", + # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" + contributorType = c.get('contributorType', '') or '' + + if contributorType: + extra = {'type': contributorType} + + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + )) + elif nameType == 'Organizational': + name = c.get('name', '') or '' + if name in UNKNOWN_MARKERS: + continue + if len(name) < 3: + continue + extra = {'organization': name} + contribs.append(fatcat_openapi_client.ReleaseContrib( + index=i, extra=extra)) + else: + print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + + return contribs + + +def lookup_license_slug(raw): + """ + TODO(martin): reuse from or combine with crossref, maybe. + """ + if not raw: + return None + raw = raw.strip().replace('http://', '//').replace('https://', '//') + if 'creativecommons.org' in raw.lower(): + raw = raw.lower() + raw = raw.replace('/legalcode', '/').replace('/uk', '') + if not raw.endswith('/'): + raw = raw + '/' + return LICENSE_SLUG_MAP.get(raw) + + +def find_original_language_title(item, min_length=4, max_questionmarks=3): + """ + Perform a few checks before returning a potential original language title. + + Example input: {'title': 'Some title', 'original_language_title': 'Some title'} + """ + if not 'original_language_title' in item: + return None + title = item.get('title') + if not title: + return None + original_language_title = item.get('original_language_title') + if isinstance(original_language_title, + str) and title != original_language_title: + if len(original_language_title) < min_length: + return None + if original_language_title.count('?') > max_questionmarks: + return None + return original_language_title + if isinstance(original_language_title, dict): + content = original_language_title.get('__content__', '') or '' + if content and content != title and not content.count( + '?') > max_questionmarks: + return content + return None + + +def parse_datacite_titles(titles): + """ + Given a list of title items from datacite, return 3-tuple (title, + original_language_title, subtitle). 
+ + Example input: [{"title": "Meeting Heterogeneity in Consumer Demand"}] + """ + title, original_language_title, subtitle = None, None, None + + if titles is None: + return title, original_language_title, subtitle + if len(titles) == 0: + return title, original_language_title, subtitle + elif len(titles) == 1: + original_language_title = find_original_language_title(titles[0]) + title = titles[0].get('title', '') or '' + title = title.strip() + if not title: + title = None + return title, original_language_title, subtitle + else: + for entry in titles: + if not title and ('titleType' not in entry + or not entry.get('titleType')): + title = entry.get('title').strip() + if not subtitle and entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + if not original_language_title: + original_language_title = find_original_language_title(entry) + + return title, original_language_title, subtitle + + +def parse_datacite_dates(dates): + """ + Given a list of date fields (under .dates), return tuple, (release_date, + release_year). + """ + release_date, release_month, release_year = None, None, None + + if not dates: + return release_date, release_month, release_year + + if not isinstance(dates, list): + raise ValueError('expected a list of date items') + + # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", + # "Collected", "Updated", "Copyrighted", "Created" + # Ignored for now: "Collected", "Issued" + date_type_prio = ( + 'Valid', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Created', + 'Updated', + ) + + # We need to note the granularity, since a string like "2019" would be + # parsed into "2019-01-01", even though the month is unknown. Use 3 + # granularity types: 'y', 'm', 'd'. + Pattern = collections.namedtuple('Pattern', 'layout granularity') + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ( + Pattern('%Y-%m-%d', 'd'), + Pattern('%Y-%m', 'm'), + Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), + Pattern('%Y-%m-%dT%H:%M:%S', 'd'), + Pattern('%Y', 'y'), + ) + + def parse_item(item): + result, value, year_only = None, item.get('date', ''), False + release_date, release_month, release_year = None, None, None + + for layout, granularity in common_patterns: + try: + result = datetime.datetime.strptime(value, layout) + except ValueError: + continue + else: + if granularity == 'y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), + file=sys.stderr) + + if result is None: + # Unparsable date. 
+ return release_date, release_month, release_year + + if granularity != 'y': + release_date = result.date() + release_year = result.year + if granularity in ('m', 'd'): + release_month = result.month + + return release_date, release_month, release_year + + today = datetime.date.today() + + for prio in date_type_prio: + for item in dates: + if not item.get('dateType') == prio: + continue + + release_date, release_month, release_year = parse_item(item) + if release_date is None and release_year is None: + continue + + if release_year < 1000 or release_year > today.year + 5: + # Skip possibly bogus dates. + release_year = None + continue + break + else: + continue + break + + if release_date is None and release_year is None: + for item in dates: + release_date, release_month, release_year = parse_item(item) + if release_year or release_date: + break + + return release_date, release_month, release_year + +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" + return s + + # Not names, but sprinkled in fields where authors live. + stopwords = [s.lower() for s in ( + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + )] + lower = s.lower() + for stop in stopwords: + if stop in lower: + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) |
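The module docstring notes that fields with their own parse_datacite_* functions can be tested more easily. A short, illustrative check of a few of the helpers defined above; the example inputs are invented for this sketch:

    from fatcat_tools.importers.datacite import (
        index_form_to_display_name,
        parse_datacite_dates,
        parse_datacite_titles,
    )

    # Index-form personal names are flipped into display form; strings that
    # contain organization-like stopwords are passed through unchanged.
    assert index_form_to_display_name('Razis, Panos A') == 'Panos A Razis'
    assert index_form_to_display_name('Jena, University of') == 'Jena, University of'

    # 'Issued' is not in date_type_prio, so the fallback loop handles it; a
    # bare year string like '2019' would yield (None, None, 2019) instead,
    # because of the granularity handling.
    date, month, year = parse_datacite_dates([{'date': '2019-12-01', 'dateType': 'Issued'}])
    assert (str(date), month, year) == ('2019-12-01', 12, 2019)

    # A single untyped title is used as-is; no original-language title, no subtitle.
    assert parse_datacite_titles([{'title': 'Meeting Heterogeneity in Consumer Demand'}]) == \
        ('Meeting Heterogeneity in Consumer Demand', None, None)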