Diffstat (limited to 'python/fatcat_tools/importers')

 python/fatcat_tools/importers/__init__.py |    1 +
 python/fatcat_tools/importers/datacite.py | 1023 ++++++++++++++++++++++++++++
 2 files changed, 1024 insertions, 0 deletions
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index bb9c5b17..d936605f 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -14,6 +14,7 @@ To run an import you combine two classes; one each of:
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
+from .datacite import DataciteImporter
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
new file mode 100644
index 00000000..b1862b44
--- /dev/null
+++ b/python/fatcat_tools/importers/datacite.py
@@ -0,0 +1,1023 @@
+"""
+Prototype importer for datacite.org data.
+
+Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+
+Datacite being an aggregator, the data is varied and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which can be tested more easily.
+"""
+
+import collections
+import datetime
+import hashlib
+import json
+import sqlite3
+import sys
+
+import dateparser
+import fatcat_openapi_client
+import langdetect
+import pycountry
+
+from fatcat_tools.normal import clean_doi
+from fatcat_tools.transforms import entity_to_dict
+
+from .common import EntityImporter, clean
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
+# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
+CONTAINER_TYPE_MAP = {
+    'Journal': 'journal',
+    'Series': 'journal',
+    'Book Series': 'book-series',
+}
+
+# The docs/guide should be the canonical home for these mappings; update there
+# first. Map various datacite types to CSL-ish types. None means TODO or
+# remove.
+DATACITE_TYPE_MAP = {
+    'ris': {
+        'THES': 'thesis',
+        'SOUND': 'song', # 99.9% maps to citeproc song, so use that (exception: report)
+        'CHAP': 'chapter',
+        'FIGURE': 'figure',
+        'RPRT': 'report',
+        'JOUR': 'article-journal',
+        'MPCT': 'motion_picture',
+        'GEN': 'article-journal', # GEN consists almost entirely of articles and reports, plus a few post-weblog, misc - and one dataset
+        'BOOK': 'book',
+        'DATA': 'dataset',
+        'COMP': 'software',
+    },
+    'schemaOrg': {
+        'Dataset': 'dataset',
+        'Book': 'book',
+        'ScholarlyArticle': 'article-journal',
+        'ImageObject': 'graphic',
+        'Collection': None,
+        'MediaObject': None,
+        'Event': None,
+        'SoftwareSourceCode': 'software',
+        'Chapter': 'chapter',
+        'CreativeWork': None, # Seems to be a catch-all resourceType, ranging from PGRFA Material and Pamphlet to music score.
+        'PublicationIssue': 'article',
+        'AudioObject': None,
+        'Thesis': 'thesis',
+    },
+    'citeproc': {
+        'article': 'article',
+        'article-journal': 'article-journal',
+        'article-magazine': 'article-magazine',
+        'article-newspaper': 'article-newspaper',
+        'bill': 'bill',
+        'book': 'book',
+        'broadcast': 'broadcast',
+        'chapter': 'chapter',
+        'dataset': 'dataset',
+        'entry-dictionary': 'entry-dictionary',
+        'entry-encyclopedia': 'entry-encyclopedia',
+        'entry': 'entry',
+        'figure': 'figure',
+        'graphic': 'graphic',
+        'interview': 'interview',
+        'legal_case': 'legal_case',
+        'legislation': 'legislation',
+        'manuscript': 'manuscript',
+        'map': 'map',
+        'motion_picture': 'motion_picture',
+        'musical_score': 'musical_score',
+        'pamphlet': 'pamphlet',
+        'paper-conference': 'paper-conference',
+        'patent': 'patent',
+        'personal_communication': 'personal_communication',
+        'post': 'post',
+        'post-weblog': 'post-weblog',
+        'report': 'report',
+        'review-book': 'review-book',
+        'review': 'review',
+        'song': 'song',
+        'speech': 'speech',
+        'thesis': 'thesis',
+        'treaty': 'treaty',
+        'webpage': 'webpage',
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
+    'bibtex': {
+        'phdthesis': 'thesis',
+        'inbook': 'chapter',
+        'misc': None,
+        'article': 'article-journal',
+        'book': 'book',
+    },
+    'resourceTypeGeneral': {
+        'Image': 'graphic',
+        'Dataset': 'dataset',
+        'PhysicalObject': None,
+        'Collection': None,
+        'Text': None, # "Grey literature, lab notes, accompanying materials"
+        'Sound': None,
+        'InteractiveResource': None,
+        'Event': None,
+        'Software': 'software',
+        'Other': None,
+        'Workflow': None,
+        'Audiovisual': None,
+    } # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+}
+
+# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
+DATACITE_UNKNOWN_MARKERS = (
+    '(:unac)',  # temporarily inaccessible
+    '(:unal)',  # unallowed, suppressed intentionally
+    '(:unap)',  # not applicable, makes no sense
+    '(:unas)',  # value unassigned (e.g., Untitled)
+    '(:unav)',  # value unavailable, possibly unknown
+    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue)
+    '(:none)',  # never had a value, never will
+    '(:null)',  # explicitly and meaningfully empty
+    '(:tba)',  # to be assigned or announced later
+    '(:etal)',  # too numerous to list (et alia)
+)
+
+# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking
+# unknown values.
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
+    'NA',
+    'NN',
+    'n.a.',
+    '[s.n.]',
+)))
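+
+# A quick doctest-style sketch of how the values above combine; the actual
+# lookup order lives in DataciteImporter.parse_record below:
+#
+#   >>> DATACITE_TYPE_MAP['ris']['THES']
+#   'thesis'
+#   >>> '(:unav)' in UNKNOWN_MARKERS
+#   True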
+
+# TODO(martin): merge this with other maps, maybe.
+LICENSE_SLUG_MAP = {
+    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
+    "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//opensource.org/licenses/MIT": "MIT",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
+    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
+    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1",  # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/MIT": "MIT",
+    # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
+    # "http://rsc.li/journals-terms-of-use": "RSC",
+    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "", # § 53 UrhG.
+    # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "", +    # "http://www.springer.com/tdm": "", +    # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html +    # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "", +    # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "", +    # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "", +    # "https://rightsstatements.org/page/InC/1.0?language=en": "", +    # "https://services.ceda.ac.uk/cedasite/register/info": "", +    # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404 +    # "https://www.cambridge.org/core/terms": "", +    # "https://www.elsevier.com/tdm/userlicense/1.0", +    # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights +    # "info:eu-repo/semantics/embargoedAccess": "", +    # "info:eu-repo/semantics/openAccess": "", +    # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice). +} + +# TODO(martin): drop this after 3.7 upgrade +try: +    isascii = str.isascii # new in 3.7, https://docs.python.org/3/library/stdtypes.html#str.isascii +except AttributeError: +    isascii = lambda s: len(s) == len(s.encode()) + + +class DataciteImporter(EntityImporter): +    """ +    Importer for datacite records. +    """ +    def __init__(self, +                 api, +                 issn_map_file, +                 debug=False, +                 insert_log_file=None, +                 **kwargs): + +        eg_desc = kwargs.get( +            'editgroup_description', +            "Automated import of Datacite DOI metadata, harvested from REST API" +        ) +        eg_extra = kwargs.get('editgroup_extra', dict()) +        eg_extra['agent'] = eg_extra.get('agent', +                                         'fatcat_tools.DataciteImporter') +        super().__init__(api, +                         issn_map_file=issn_map_file, +                         editgroup_description=eg_desc, +                         editgroup_extra=eg_extra, +                         **kwargs) + +        self.create_containers = kwargs.get('create_containers', True) +        extid_map_file = kwargs.get('extid_map_file') +        self.extid_map_db = None +        if extid_map_file: +            db_uri = "file:{}?mode=ro".format(extid_map_file) +            print("Using external ID map: {}".format(db_uri), file=sys.stderr) +            self.extid_map_db = sqlite3.connect(db_uri, uri=True) +        else: +            print("Not using external ID map", file=sys.stderr) + +        self.read_issn_map_file(issn_map_file) +        self.debug = debug +        self.insert_log_file = insert_log_file + +        print('datacite with debug={}'.format(self.debug), file=sys.stderr) + +    def lookup_ext_ids(self, doi): +        """ +        Return dictionary of identifiers refering to the same things as the given DOI. +        """ +        if self.extid_map_db is None: +            return dict(core_id=None, +                        pmid=None, +                        pmcid=None, +                        wikidata_qid=None, +                        arxiv_id=None, +                        jstor_id=None) +        row = self.extid_map_db.execute( +            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? 
LIMIT 1", +            [doi.lower()]).fetchone() +        if row is None: +            return dict(core_id=None, +                        pmid=None, +                        pmcid=None, +                        wikidata_qid=None, +                        arxiv_id=None, +                        jstor_id=None) +        row = [str(cell or '') or None for cell in row] +        return dict( +            core_id=row[0], +            pmid=row[1], +            pmcid=row[2], +            wikidata_qid=row[3], +            # TODO: +            arxiv_id=None, +            jstor_id=None, +        ) + +    def parse_record(self, obj): +        """ +        Mapping datacite JSON to ReleaseEntity. +        """ +        if not obj or not isinstance(obj, dict): +            return None +        if 'attributes' not in obj: +            return None + +        attributes = obj['attributes'] +        doi = clean_doi(attributes.get('doi', '').lower()) + +        if not isascii(doi): +            print('[{}] skipping non-ascii doi for now'.format(doi)) +            return None + + +        creators = attributes.get('creators', []) or [] +        contributors = attributes.get('contributors', []) or []  # Much fewer than creators. + +        contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + +        # Title, may come with "attributes.titles[].titleType", like +        # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" +        titles = attributes.get('titles', []) or [] +        title, original_language_title, subtitle = parse_datacite_titles( +            titles) + +        if title is None: +            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) +            return False + +        title = clean(title) +        if not title: +            print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) +            return False + +        if not subtitle: +            subtitle = None +        else: +            subtitle = clean(subtitle) + +        # Dates. A few internal dates (registered, created, updated) and +        # published (0..2554). We try to work with typed date list, in +        # "attributes.dates[].dateType", values: "Accepted", "Available" +        # "Collected", "Copyrighted", "Created", "Issued", "Submitted", +        # "Updated", "Valid". +        release_date, release_month, release_year = parse_datacite_dates( +            attributes.get('dates', [])) + +        # Start with clear stages, e.g. published. TODO(martin): we could +        # probably infer a bit more from the relations, e.g. +        # "IsPreviousVersionOf" or "IsNewVersionOf". +        release_stage = 'published' + +        # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, +        # we might want something else than 'published'. See also: +        # https://support.datacite.org/docs/doi-states. + +        # Publisher. A few NA values. A few bogus values. +        publisher = attributes.get('publisher') + +        if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): +            publisher = None +            release_stage = None +        if publisher is not None and len(publisher) > 80: +            # Arbitrary magic value max length. TODO(martin): better heuristic, +            # but factored out; first we have to log misses. 
+            # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
+            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
+            # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
+            # werden"
+            publisher = None
+
+        if publisher:
+            publisher = clean(publisher)
+
+        # Container. For the moment, only ISSN as container.
+        container_id = None
+        container_name = None
+
+        container = attributes.get('container', {}) or {}
+        if container.get('type') in CONTAINER_TYPE_MAP.keys():
+            container_type = CONTAINER_TYPE_MAP.get(container['type'])
+            if container.get('identifier') and container.get(
+                    'identifierType') == 'ISSN':
+                issn = container.get('identifier')
+                if len(issn) == 8:
+                    issn = issn[:4] + "-" + issn[4:]
+                issnl = self.issn2issnl(issn)
+                if issnl is not None:
+                    container_id = self.lookup_issnl(issnl)
+
+                    if container_id is None and container.get('title'):
+                        container_name = container.get('title')
+                        if isinstance(container_name, list) and container_name:
+                            if len(container_name) > 1:
+                                print('[{}] too many container titles: {}'.format(
+                                    doi, len(container_name)), file=sys.stderr)
+                            container_name = container_name[0]
+                        assert isinstance(container_name, str)
+                        ce = fatcat_openapi_client.ContainerEntity(
+                            issnl=issnl,
+                            container_type=container_type,
+                            name=container_name,
+                        )
+                        ce_edit = self.create_container(ce)
+                        container_id = ce_edit.ident
+                        self._issnl_id_map[issnl] = container_id
+                else:
+                    # TODO(martin): factor this out into a testable function.
+                    # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013
+                    container_name = container.get('title')
+                    if isinstance(container_name, list) and container_name:
+                        if len(container_name) > 1:
+                            print('[{}] too many container titles: {}'.format(
+                                doi, len(container_name)), file=sys.stderr)
+                        container_name = container_name[0]
+
+        # Volume and issue.
+        volume = container.get('volume')
+        issue = container.get('issue')
+
+        if volume:
+            volume = clean(volume)
+
+        if issue:
+            issue = clean(issue)
+
+        # Pages.
+        pages = None
+
+        first_page = container.get('firstPage')
+        last_page = container.get('lastPage')
+
+        if first_page and last_page:
+            try:
+                _ = int(first_page) < int(last_page)
+                pages = '{}-{}'.format(first_page, last_page)
+            except ValueError:
+                # TODO(martin): This is more debug than info.
+                # print('[{}] invalid page range: {}-{}'.format(doi, first_page, last_page), file=sys.stderr)
+                pass
+
+        if not pages and first_page:
+            pages = first_page
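+
+        # A quick example of the intent above (values hypothetical):
+        # firstPage='161' and lastPage='175' become pages='161-175'; a lone
+        # firstPage='161' is kept as-is.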
+
+        # License.
+        license_slug = None
+        license_extra = []
+
+        for rights in attributes.get('rightsList', []):
+            slug = lookup_license_slug(rights.get('rightsUri'))
+            if slug:
+                license_slug = slug
+            license_extra.append(rights)
+
+        # Release type. Try to determine the release type from the variety of
+        # types supplied by datacite. The "attributes.types.resourceType" is
+        # uncontrolled (170000+ unique values, from "null" and "Dataset" to
+        # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP
+        # flows in 2009"). "citeproc" may be the closest, but is not always
+        # supplied. Order the lookup roughly by completeness of the mapping.
+        release_type = None
+        for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
+            value = (attributes.get('types', {}) or {}).get(typeType)
+            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
+            if release_type is not None:
+                break
+
+        if release_type is None:
+            print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr)
+
+        # release_type exception: Global Biodiversity Information Facility
+        # publishes highly interesting datasets, but titles are mostly the same
+        # ("GBIF Occurrence Download" or "Occurrence Download"); set
+        # release_type to "stub" (CSL/FC).
+        if publisher == 'The Global Biodiversity Information Facility':
+            release_type = 'stub'
+
+        # release_type exception: lots of "Experimental Crystal Structure Determination"
+        if publisher == 'Cambridge Crystallographic Data Centre':
+            release_type = 'entry'
+
+        # Supplement files, e.g. "Additional file 1: ASE constructs in questionnaire."
+        if title.lower().startswith('additional file'):
+            release_type = 'stub'
+
+        # Language values are varied ("ger", "es", "English", "ENG", "en-us",
+        # "other", ...). Try to normalize them with pycountry, which accepts
+        # ISO codes as well as names. TODO(martin): We need more of these
+        # normalizations.
+        language = None
+
+        value = attributes.get('language', '') or ''
+        try:
+            language = pycountry.languages.lookup(value).alpha_2
+        except (LookupError, AttributeError):
+            # TODO(martin): Print this on debug level, only.
+            # print('[{}] language lookup miss for {}'.format(doi, value), file=sys.stderr)
+            pass
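+
+        # For example (sketch; assumes pycountry's bundled ISO 639 data):
+        #
+        #   pycountry.languages.lookup('ger').alpha_2 == 'de'
+        #   pycountry.languages.lookup('English').alpha_2 == 'en'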
+
+        # Abstracts appear in "attributes.descriptions[].descriptionType", some
+        # of the observed values: "Methods", "TechnicalInfo",
+        # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
+        # "Other" fields might contain references or related articles (with
+        # DOI). TODO(martin): maybe try to parse out some of those refs.
+        abstracts = []
+        descs = attributes.get('descriptions', []) or []
+        for desc in descs:
+            if desc.get('descriptionType') != 'Abstract':
+                continue
+            if len(desc.get('description', '') or '') < 10:
+                continue
+            text = desc.get('description', '')
+            if len(text) > MAX_ABSTRACT_LENGTH:
+                text = text[:MAX_ABSTRACT_LENGTH] + " [...]"
+            lang = None
+            try:
+                lang = langdetect.detect(text)
+            except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err:
+                print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr)
+            abstracts.append(
+                fatcat_openapi_client.ReleaseAbstract(
+                    mimetype="text/plain",
+                    content=clean(text),
+                    lang=lang,
+                ))
+
+        # References and relations. Datacite includes many relation types in
+        # "attributes.relatedIdentifiers[].relationType", e.g.
+        # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
+        # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
+        # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
+        # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
+        # "IsDerivedFrom", "IsSourceOf".
+        #
+        # For the moment, we only care about "References" and "Cites".
+        refs, ref_index = [], 0
+
+        relIds = attributes.get('relatedIdentifiers', []) or []
+        for rel in relIds:
+            if rel.get('relationType') not in ('References', 'Cites'):
+                continue
+            ref_extra = dict()
+            if rel.get('relatedIdentifierType', '') == 'DOI':
+                ref_extra['doi'] = rel.get('relatedIdentifier')
+            if not ref_extra:
+                ref_extra = None
+            refs.append(
+                fatcat_openapi_client.ReleaseRef(
+                    index=ref_index,
+                    extra=ref_extra,
+                ))
+            ref_index += 1
+
+        # More specific release_type via the 'Reviews' relationship.
+        for rel in relIds:
+            if rel.get('relationType', '') != 'Reviews':
+                continue
+            release_type = 'review'
+
+        # Extra information.
+        extra_datacite = dict()
+
+        if license_extra:
+            extra_datacite['license'] = license_extra
+        if attributes.get('subjects'):
+            extra_datacite['subjects'] = attributes['subjects']
+
+        # Include version information.
+        metadata_version = attributes.get('metadataVersion') or ''
+        schema_version = attributes.get('schemaVersion') or ''
+
+        if metadata_version:
+            extra_datacite['metadataVersion'] = metadata_version
+        if schema_version:
+            extra_datacite['schemaVersion'] = schema_version
+
+        # Include resource types.
+        types = attributes.get('types', {}) or {}
+        resource_type = types.get('resourceType', '') or ''
+        resource_type_general = types.get('resourceTypeGeneral', '') or ''
+
+        if resource_type:
+            extra_datacite['resourceType'] = resource_type
+        if resource_type_general:
+            extra_datacite['resourceTypeGeneral'] = resource_type_general
+
+        # Include certain relations from relatedIdentifiers. Keeping the
+        # original structure of the data here, which is a list of dicts, with
+        # relation type, identifier and identifier type (mostly).
+        relations = []
+        for rel in relIds:
+            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
+                                           'IsVariantFormOf', 'IsSupplementTo',
+                                           'HasVersion', 'IsMetadataFor',
+                                           'IsNewVersionOf', 'IsIdenticalTo',
+                                           'IsVersionOf', 'IsDerivedFrom',
+                                           'IsSourceOf'):
+                relations.append(rel)
+
+        if relations:
+            extra_datacite['relations'] = relations
+
+        extra = dict()
+
+        # Observed version values: "1.0.0", "v1.305.2019", "Final", "v1.0.0",
+        # "v0.3.0", "1", "0.19.0", "3.1", "v1.1", "{version}", "4.0", "10329",
+        # "11672", "11555", "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124",
+        # "v1.0-beta", "1st Edition", "20191024", "v2.0.0", "v0.9.3", "10149",
+        # "2.0", null, "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018",
+        # "v0.3.1", "v1.0", "10161", "10010691", "10780", "Presentación"
+        version = attributes.get('version')
+
+        # Top-level extra keys.
+        if not container_id and container_name:
+            extra['container_name'] = container_name
+
+        # Always include the datacite key, even if the value is an empty dict.
+        extra['datacite'] = extra_datacite
+
+        # Preparation for a schema update.
+        if release_month:
+            extra['release_month'] = release_month
+
+        extids = self.lookup_ext_ids(doi=doi)
+
+        # Assemble release.
+        re = fatcat_openapi_client.ReleaseEntity(
+            work_id=None,
+            container_id=container_id,
+            release_type=release_type,
+            release_stage=release_stage,
+            title=title,
+            subtitle=subtitle,
+            original_title=original_language_title,
+            release_year=release_year,
+            release_date=release_date,
+            publisher=publisher,
+            ext_ids=fatcat_openapi_client.ReleaseExtIds(
+                doi=doi,
+                pmid=extids['pmid'],
+                pmcid=extids['pmcid'],
+                wikidata_qid=extids['wikidata_qid'],
+                core=extids['core_id'],
+                arxiv=extids['arxiv_id'],
+                jstor=extids['jstor_id'],
+            ),
+            contribs=contribs,
+            volume=volume,
+            issue=issue,
+            pages=pages,
+            language=language,
+            abstracts=abstracts,
+            refs=refs,
+            extra=extra,
+            license_slug=license_slug,
+            version=version,
+        )
+        return re
+
+    def try_update(self, re):
+        """
+        When debug is true, write the RE to stdout instead of the database.
+        Note that this might hide schema mismatch bugs.
+        """ +        if self.debug is True: +            print(json.dumps(entity_to_dict(re, api_client=None))) +            return False + +        # lookup existing DOI (don't need to try other ext idents for crossref) +        existing = None +        try: +            existing = self.api.lookup_release(doi=re.ext_ids.doi) +        except fatcat_openapi_client.rest.ApiException as err: +            if err.status != 404: +                raise err +            # doesn't exist, need to update +            return True + +        # eventually we'll want to support "updates", but for now just skip if +        # entity already exists +        if existing: +            self.counts['exists'] += 1 +            return False + +        return True + +    def insert_batch(self, batch): +        print('inserting batch ({})'.format(len(batch)), file=sys.stderr) +        if self.insert_log_file: +            with open(self.insert_log_file, 'a') as f: +                for doc in batch: +                    json.dump(entity_to_dict(doc, api_client=None), f) +                    f.write('\n') +        self.api.create_release_auto_batch( +            fatcat_openapi_client.ReleaseAutoBatch( +                editgroup=fatcat_openapi_client.Editgroup( +                    description=self.editgroup_description, +                    extra=self.editgroup_extra), +                entity_list=batch)) + +    def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): +        """ +        Parses a list of creators into a list of ReleaseContrib objects. Set +        set_index to False, if the index contrib field should be left blank. +        The doi parameter is only used for debugging. +        """ +        # Contributors. Many nameIdentifierSchemes, we do not use (yet): +        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": +        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", +        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. +        contribs = [] + +        # Names, that should be ignored right away. +        name_blacklist = set(('Occdownload Gbif.Org',)) + +        for i, c in enumerate(creators): +            if not set_index: +                i = None +            nameType = c.get('nameType', '') or '' +            if nameType in ('', 'Personal'): +                creator_id = None +                for nid in c.get('nameIdentifiers', []): +                    name_scheme = nid.get('nameIdentifierScheme', '') or '' +                    if not name_scheme.lower() == "orcid": +                        continue +                    orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') +                    if not orcid: +                        continue +                    creator_id = self.lookup_orcid(orcid) +                    # TODO(martin): If creator_id is None, should we create creators? + +                # If there are multiple affiliation strings, use the first one. 
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = None
+                if affiliations:
+                    raw_affiliation = clean(affiliations[0])
+
+                name = c.get('name')
+                given_name = c.get('givenName')
+                surname = c.get('familyName')
+
+                if name:
+                    name = clean(name)
+                if not name:
+                    continue
+                if name in name_blacklist:
+                    continue
+                if name.lower() in UNKNOWN_MARKERS:
+                    continue
+                # Unpack the name if we have an index form (e.g. 'Razis, Panos A')
+                # into a display form ('Panos A Razis').
+                if name:
+                    name = index_form_to_display_name(name)
+
+                if given_name:
+                    given_name = clean(given_name)
+                if surname:
+                    surname = clean(surname)
+                if raw_affiliation == '':
+                    raw_affiliation = None
+
+                extra = None
+
+                # "DataManager", "DataCurator", "ContactPerson", "Distributor",
+                # "RegistrationAgency", "Sponsor", "Researcher",
+                # "RelatedPerson", "ProjectLeader", "Editor", "Other",
+                # "ProjectMember", "Funder", "RightsHolder", "DataCollector",
+                # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup"
+                contributorType = c.get('contributorType', '') or ''
+
+                if contributorType:
+                    extra = {'type': contributorType}
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=name,
+                        given_name=given_name,
+                        surname=surname,
+                        role=role,
+                        raw_affiliation=raw_affiliation,
+                        extra=extra,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name in UNKNOWN_MARKERS:
+                    continue
+                if len(name) < 3:
+                    continue
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr)
+
+        return contribs
+
+
+def lookup_license_slug(raw):
+    """
+    TODO(martin): reuse from or combine with crossref, maybe.
+    """
+    if not raw:
+        return None
+    raw = raw.strip().replace('http://', '//').replace('https://', '//')
+    if 'creativecommons.org' in raw.lower():
+        raw = raw.lower()
+        raw = raw.replace('/legalcode', '/').replace('/uk', '')
+        if not raw.endswith('/'):
+            raw = raw + '/'
+    return LICENSE_SLUG_MAP.get(raw)
+
+
+def find_original_language_title(item, min_length=4, max_questionmarks=3):
+    """
+    Perform a few checks before returning a potential original language title.
+
+    Example input: {'title': 'Some title', 'original_language_title': 'Some title'}
+    """
+    if 'original_language_title' not in item:
+        return None
+    title = item.get('title')
+    if not title:
+        return None
+    original_language_title = item.get('original_language_title')
+    if isinstance(original_language_title,
+                  str) and title != original_language_title:
+        if len(original_language_title) < min_length:
+            return None
+        if original_language_title.count('?') > max_questionmarks:
+            return None
+        return original_language_title
+    if isinstance(original_language_title, dict):
+        content = original_language_title.get('__content__', '') or ''
+        if content and content != title and not content.count(
+                '?') > max_questionmarks:
+            return content
+    return None
+
+
+def parse_datacite_titles(titles):
+    """
+    Given a list of title items from datacite, return a 3-tuple: (title,
+    original_language_title, subtitle).
+
+    Example input: [{"title": "Meeting Heterogeneity in Consumer Demand"}]
+    """
+    title, original_language_title, subtitle = None, None, None
+
+    if titles is None:
+        return title, original_language_title, subtitle
+    if len(titles) == 0:
+        return title, original_language_title, subtitle
+    elif len(titles) == 1:
+        original_language_title = find_original_language_title(titles[0])
+        title = titles[0].get('title', '') or ''
+        title = title.strip()
+        if not title:
+            title = None
+        return title, original_language_title, subtitle
+    else:
+        for entry in titles:
+            if not title and ('titleType' not in entry
+                              or not entry.get('titleType')):
+                title = (entry.get('title') or '').strip()
+            if not subtitle and entry.get('titleType') == 'Subtitle':
+                subtitle = (entry.get('title') or '').strip()
+            if not original_language_title:
+                original_language_title = find_original_language_title(entry)
+
+    return title, original_language_title, subtitle
+
+
+def parse_datacite_dates(dates):
+    """
+    Given a list of date fields (under .dates), return a 3-tuple:
+    (release_date, release_month, release_year).
+    """
+    release_date, release_month, release_year = None, None, None
+
+    if not dates:
+        return release_date, release_month, release_year
+
+    if not isinstance(dates, list):
+        raise ValueError('expected a list of date items')
+
+    # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted",
+    # "Collected", "Updated", "Copyrighted", "Created"
+    # Ignored for now: "Collected", "Issued"
+    date_type_prio = (
+        'Valid',
+        'Available',
+        'Accepted',
+        'Submitted',
+        'Copyrighted',
+        'Created',
+        'Updated',
+    )
+
+    # We need to note the granularity, since a string like "2019" would be
+    # parsed into "2019-01-01", even though the month is unknown. Use 3
+    # granularity types: 'y', 'm', 'd'.
+    Pattern = collections.namedtuple('Pattern', 'layout granularity')
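+
+    # For example (sketch): an item like {'date': '2019', 'dateType':
+    # 'Submitted'} should yield (None, None, 2019), while a '2019-05-02'
+    # value yields a full datetime.date plus month and year.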
+
+    # Before using (expensive) dateparser, try a few common patterns.
+    common_patterns = (
+        Pattern('%Y-%m-%d', 'd'),
+        Pattern('%Y-%m', 'm'),
+        Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'),
+        Pattern('%Y-%m-%dT%H:%M:%S', 'd'),
+        Pattern('%Y', 'y'),
+    )
+
+    def parse_item(item):
+        result, value, year_only = None, item.get('date', ''), False
+        release_date, release_month, release_year = None, None, None
+
+        for layout, granularity in common_patterns:
+            try:
+                result = datetime.datetime.strptime(value, layout)
+            except ValueError:
+                continue
+            else:
+                if granularity == 'y':
+                    year_only = True
+                break
+
+        if result is None:
+            print('fallback for {}'.format(value), file=sys.stderr)
+            parser = dateparser.DateDataParser()
+            try:
+                # Results in a dict with keys: date_obj, period, locale.
+                parse_result = parser.get_date_data(value)
+
+                # A datetime object; later we only need a date.
+                result = parse_result['date_obj']
+                if result is not None:
+                    if parse_result['period'] == 'year':
+                        return None, None, result.year
+                    elif parse_result['period'] == 'month':
+                        return None, result.month, result.year
+                    else:
+                        return result.date(), result.month, result.year
+            except TypeError as err:
+                print("{} date parsing failed with: {}".format(value, err),
+                      file=sys.stderr)
+
+        if result is None:
+            # Unparsable date.
+            return release_date, release_month, release_year
+
+        if granularity != 'y':
+            release_date = result.date()
+        release_year = result.year
+        if granularity in ('m', 'd'):
+            release_month = result.month
+
+        return release_date, release_month, release_year
+
+    today = datetime.date.today()
+
+    for prio in date_type_prio:
+        for item in dates:
+            if item.get('dateType') != prio:
+                continue
+
+            release_date, release_month, release_year = parse_item(item)
+            if release_date is None and release_year is None:
+                continue
+
+            if release_year < 1000 or release_year > today.year + 5:
+                # Skip possibly bogus dates.
+                release_year = None
+                continue
+            break
+        else:
+            continue
+        break
+
+    if release_date is None and release_year is None:
+        for item in dates:
+            release_date, release_month, release_year = parse_item(item)
+            if release_year or release_date:
+                break
+
+    return release_date, release_month, release_year
+
+
+def index_form_to_display_name(s):
+    """
+    Try to convert an index form name, like 'Razis, Panos A', into a display
+    name, e.g. 'Panos A Razis'.
+    """
+    if ',' not in s:
+        return s
+    skip_on_chars = ['(', ')', '*']
+    for char in skip_on_chars:
+        if char in s:
+            return s
+    if s.count(',') > 1:
+        # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan"
+        return s
+
+    # Not names, but tokens sprinkled into fields where authors live.
+    stopwords = [token.lower() for token in (
+        'Archive',
+        'Collection',
+        'Coordinator',
+        'Department',
+        'Germany',
+        'International',
+        'National',
+        'Netherlands',
+        'Office',
+        'Organisation',
+        'Organization',
+        'Service',
+        'Services',
+        'United States',
+        'University',
+        'Verein',
+        'Volkshochschule',
+    )]
+    lower = s.lower()
+    for stop in stopwords:
+        if stop in lower:
+            return s
+
+    a, b = s.split(',')
+    return '{} {}'.format(b.strip(), a.strip())
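+
+
+# A minimal doctest-style usage sketch for the helpers above; the expected
+# values follow directly from the code in this module:
+#
+#   >>> index_form_to_display_name('Razis, Panos A')
+#   'Panos A Razis'
+#   >>> parse_datacite_titles([{"title": "Meeting Heterogeneity in Consumer Demand"}])
+#   ('Meeting Heterogeneity in Consumer Demand', None, None)
+#   >>> parse_datacite_dates([{'date': '2019', 'dateType': 'Submitted'}])
+#   (None, None, 2019)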
