author     Martin Czygan <martin.czygan@gmail.com>   2020-01-08 23:31:40 +0100
committer  Martin Czygan <martin.czygan@gmail.com>   2020-01-08 23:31:40 +0100
commit     081746837a55bf5f34c96f12f1abb5a00d5b478c (patch)
tree       88af1ade558ad6695918d36648b3ed4a5bea6954 /python/fatcat_tools/importers
parent     27723a61bde5591bae8115d801d0d09b7ef01b03 (diff)
parent     277bd183d7139bb1a8857bc2a48c0aa92012455d (diff)
download   fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.tar.gz
           fatcat-081746837a55bf5f34c96f12f1abb5a00d5b478c.zip
Merge branch 'martin-datacite-import'
Pipfile.lock is broken.
* martin-datacite-import: (68 commits)
datacite: pass in doi into factored out method
datacite: reformat test cases and use jq . --sort-keys
datacite: factor out contributor handling
datacite: catch type mismatch in language detection
datacite: adjust tests for release_month
datacite: name extra.month, extra.release_month
datacite: mark additional files as stub
datacite: CCDC are entries, mostly
datacite: use more specific release_type, if possible
datacite: ignore certain names
datacite: over 3% records have the same title: stub
datacite: fill a few more release_type gaps
datacite: adding datacite-specific extra metadata
datacite: apply pylint suggestions
datacite: fix typos
datacite: set release_stage to published by default
datacite: month field should be top-level
datacite: include month in extra
datacite: indicate mismatched file in test
datacite: clean abstracts, use unknown value tokens
...
Diffstat (limited to 'python/fatcat_tools/importers')
-rw-r--r--   python/fatcat_tools/importers/__init__.py      1
-rw-r--r--   python/fatcat_tools/importers/datacite.py   1023
2 files changed, 1024 insertions, 0 deletions
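This commit exports DataciteImporter from the importers package; as the package docstring in the hunk below puts it, an import run combines an importer with one of the pusher classes. A minimal usage sketch, assuming an authenticated API handle; the file paths here are illustrative placeholders, not part of this commit:

    from fatcat_tools.importers import DataciteImporter, JsonLinePusher

    api = ...  # an authenticated fatcat_openapi_client API handle (placeholder)
    importer = DataciteImporter(
        api,
        issn_map_file='ISSN-to-ISSN-L.txt',       # hypothetical path
        debug=False,                              # True prints entities instead of inserting
        insert_log_file='datacite-inserts.json',  # hypothetical path; optional insert log
    )
    # Reads one datacite JSON document per line and feeds each to parse_record().
    with open('datacite-dump.json') as f:
        JsonLinePusher(importer, f).run()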
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index bb9c5b17..d936605f 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -14,6 +14,7 @@ To run an import you combine two classes; one each of:
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
+from .datacite import DataciteImporter
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
new file mode 100644
index 00000000..b1862b44
--- /dev/null
+++ b/python/fatcat_tools/importers/datacite.py
@@ -0,0 +1,1023 @@
+"""
+Prototype importer for datacite.org data.
+
+Example input document at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8.
+
+Datacite being an aggregator, the data is varied and exposes a couple of
+problems in content and structure. A few fields have their own parsing
+functions (parse_datacite_...), which can be tested more easily.
+"""
+
+import collections
+import datetime
+import hashlib
+import json
+import sqlite3
+import sys
+
+import dateparser
+import fatcat_openapi_client
+import langdetect
+import pycountry
+
+from fatcat_tools.normal import clean_doi
+from fatcat_tools.transforms import entity_to_dict
+
+from .common import EntityImporter, clean
+
+# Cutoff length for abstracts.
+MAX_ABSTRACT_LENGTH = 2048
+
+# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
+CONTAINER_TYPE_MAP = {
+    'Journal': 'journal',
+    'Series': 'journal',
+    'Book Series': 'book-series',
+}
+
+# The docs/guide should be the canonical home for these mappings; update there
+# first. Map various datacite type vocabularies to CSL-ish types. None means
+# TODO or remove.
+DATACITE_TYPE_MAP = {
+    'ris': {
+        'THES': 'thesis',
+        'SOUND': 'song',  # 99.9% maps to citeproc song, so use that (exception: report)
+        'CHAP': 'chapter',
+        'FIGURE': 'figure',
+        'RPRT': 'report',
+        'JOUR': 'article-journal',
+        'MPCT': 'motion_picture',
+        'GEN': 'article-journal',  # GEN consists of 99% article and report, post-weblog, misc - and one dataset
+        'BOOK': 'book',
+        'DATA': 'dataset',
+        'COMP': 'software',
+    },
+    'schemaOrg': {
+        'Dataset': 'dataset',
+        'Book': 'book',
+        'ScholarlyArticle': 'article-journal',
+        'ImageObject': 'graphic',
+        'Collection': None,
+        'MediaObject': None,
+        'Event': None,
+        'SoftwareSourceCode': 'software',
+        'Chapter': 'chapter',
+        'CreativeWork': None,  # Seems to be a catch-all resourceType, from PGRFA Material, Pamphlet, to music score.
+        'PublicationIssue': 'article',
+        'AudioObject': None,
+        'Thesis': 'thesis',
+    },
+    'citeproc': {
+        'article': 'article',
+        'article-journal': 'article-journal',
+        'article-magazine': 'article-magazine',
+        'article-newspaper': 'article-newspaper',
+        'bill': 'bill',
+        'book': 'book',
+        'broadcast': 'broadcast',
+        'chapter': 'chapter',
+        'dataset': 'dataset',
+        'entry-dictionary': 'entry-dictionary',
+        'entry-encyclopedia': 'entry-encyclopedia',
+        'entry': 'entry',
+        'figure': 'figure',
+        'graphic': 'graphic',
+        'interview': 'interview',
+        'legal_case': 'legal_case',
+        'legislation': 'legislation',
+        'manuscript': 'manuscript',
+        'map': 'map',
+        'motion_picture': 'motion_picture',
+        'musical_score': 'musical_score',
+        'pamphlet': 'pamphlet',
+        'paper-conference': 'paper-conference',
+        'patent': 'patent',
+        'personal_communication': 'personal_communication',
+        'post': 'post',
+        'post-weblog': 'post-weblog',
+        'report': 'report',
+        'review-book': 'review-book',
+        'review': 'review',
+        'song': 'song',
+        'speech': 'speech',
+        'thesis': 'thesis',
+        'treaty': 'treaty',
+        'webpage': 'webpage',
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
+    'bibtex': {
+        'phdthesis': 'thesis',
+        'inbook': 'chapter',
+        'misc': None,
+        'article': 'article-journal',
+        'book': 'book',
+    },
+    'resourceTypeGeneral': {
+        'Image': 'graphic',
+        'Dataset': 'dataset',
+        'PhysicalObject': None,
+        'Collection': None,
+        'Text': None,  # "Greyliterature, labnotes, accompanyingmaterials"
+        'Sound': None,
+        'InteractiveResource': None,
+        'Event': None,
+        'Software': 'software',
+        'Other': None,
+        'Workflow': None,
+        'Audiovisual': None,
+    }  # https://schema.datacite.org/meta/kernel-4.0/doc/DataCite-MetadataKernel_v4.0.pdf#page=32
+}
+
+# DATACITE_UNKNOWN_MARKERS via https://support.datacite.org/docs/schema-values-unknown-information-v43.
+DATACITE_UNKNOWN_MARKERS = (
+    '(:unac)',  # temporarily inaccessible
+    '(:unal)',  # unallowed, suppressed intentionally
+    '(:unap)',  # not applicable, makes no sense
+    '(:unas)',  # value unassigned (e.g., Untitled)
+    '(:unav)',  # value unavailable, possibly unknown
+    '(:unkn)',  # known to be unknown (e.g., Anonymous, Inconnue)
+    '(:none)',  # never had a value, never will
+    '(:null)',  # explicitly and meaningfully empty
+    '(:tba)',   # to be assigned or announced later
+    '(:etal)',  # too numerous to list (et alia)
+)
+
+# UNKNOWN_MARKERS joins official datacite markers with generic tokens marking
+# unknown values.
+UNKNOWN_MARKERS = set(DATACITE_UNKNOWN_MARKERS).union(set((
+    'NA',
+    'NN',
+    'n.a.',
+    '[s.n.]',
+)))
+
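# --- Editor's note, not part of this diff: the type maps above are consulted
# --- in a fixed priority order at import time. This standalone sketch mirrors
# --- the lookup loop that appears further down in parse_record(); the sample
# --- attributes dict is invented for illustration.
attributes = {'types': {'ris': 'GEN', 'resourceTypeGeneral': 'Text'}}
release_type = None
for type_type in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
    value = attributes.get('types', {}).get(type_type)
    release_type = DATACITE_TYPE_MAP.get(type_type, {}).get(value)
    if release_type is not None:
        break
# 'citeproc' is absent and 'ris' maps 'GEN' to 'article-journal', so that wins.
assert release_type == 'article-journal'
# --- End editor's note; the TODO below refers to the license map that follows.

+# TODO(martin): merge this with other maps, maybe.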
+LICENSE_SLUG_MAP = {
+    "//creativecommons.org/licenses/by/2.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US/": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode/": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0/": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nd/4.0/": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
+    "//creativecommons.org/licenses/CC-BY/4.0/": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//opensource.org/licenses/MIT": "MIT",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
+    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
+    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1",  # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/MIT": "MIT",
+    # "http://royalsocietypublishing.org/licence": "",  # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
+    # "http://rsc.li/journals-terms-of-use": "RSC",
+    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "",  # 53 UrhG.
+ # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "", + # "http://www.springer.com/tdm": "", + # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html + # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "", + # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "", + # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "", + # "https://rightsstatements.org/page/InC/1.0?language=en": "", + # "https://services.ceda.ac.uk/cedasite/register/info": "", + # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404 + # "https://www.cambridge.org/core/terms": "", + # "https://www.elsevier.com/tdm/userlicense/1.0", + # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights + # "info:eu-repo/semantics/embargoedAccess": "", + # "info:eu-repo/semantics/openAccess": "", + # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice). +} + +# TODO(martin): drop this after 3.7 upgrade +try: + isascii = str.isascii # new in 3.7, https://docs.python.org/3/library/stdtypes.html#str.isascii +except AttributeError: + isascii = lambda s: len(s) == len(s.encode()) + + +class DataciteImporter(EntityImporter): + """ + Importer for datacite records. + """ + def __init__(self, + api, + issn_map_file, + debug=False, + insert_log_file=None, + **kwargs): + + eg_desc = kwargs.get( + 'editgroup_description', + "Automated import of Datacite DOI metadata, harvested from REST API" + ) + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', + 'fatcat_tools.DataciteImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.create_containers = kwargs.get('create_containers', True) + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri), file=sys.stderr) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map", file=sys.stderr) + + self.read_issn_map_file(issn_map_file) + self.debug = debug + self.insert_log_file = insert_log_file + + print('datacite with debug={}'.format(self.debug), file=sys.stderr) + + def lookup_ext_ids(self, doi): + """ + Return dictionary of identifiers refering to the same things as the given DOI. + """ + if self.extid_map_db is None: + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = self.extid_map_db.execute( + "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, + pmid=None, + pmcid=None, + wikidata_qid=None, + arxiv_id=None, + jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) + + def parse_record(self, obj): + """ + Mapping datacite JSON to ReleaseEntity. 
+ """ + if not obj or not isinstance(obj, dict): + return None + if 'attributes' not in obj: + return None + + attributes = obj['attributes'] + doi = clean_doi(attributes.get('doi', '').lower()) + + if not isascii(doi): + print('[{}] skipping non-ascii doi for now'.format(doi)) + return None + + + creators = attributes.get('creators', []) or [] + contributors = attributes.get('contributors', []) or [] # Much fewer than creators. + + contribs = self.parse_datacite_creators(creators, doi=doi) + self.parse_datacite_creators(contributors, role=None, set_index=False, doi=doi) + + # Title, may come with "attributes.titles[].titleType", like + # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" + titles = attributes.get('titles', []) or [] + title, original_language_title, subtitle = parse_datacite_titles( + titles) + + if title is None: + print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + return False + + title = clean(title) + if not title: + print('[{}] skipping record w/o title: {}'.format(doi, obj), file=sys.stderr) + return False + + if not subtitle: + subtitle = None + else: + subtitle = clean(subtitle) + + # Dates. A few internal dates (registered, created, updated) and + # published (0..2554). We try to work with typed date list, in + # "attributes.dates[].dateType", values: "Accepted", "Available" + # "Collected", "Copyrighted", "Created", "Issued", "Submitted", + # "Updated", "Valid". + release_date, release_month, release_year = parse_datacite_dates( + attributes.get('dates', [])) + + # Start with clear stages, e.g. published. TODO(martin): we could + # probably infer a bit more from the relations, e.g. + # "IsPreviousVersionOf" or "IsNewVersionOf". + release_stage = 'published' + + # TODO(martin): If 'state' is not 'findable' or 'isActive' is not true, + # we might want something else than 'published'. See also: + # https://support.datacite.org/docs/doi-states. + + # Publisher. A few NA values. A few bogus values. + publisher = attributes.get('publisher') + + if publisher in UNKNOWN_MARKERS | set(('Unpublished', 'Unknown')): + publisher = None + release_stage = None + if publisher is not None and len(publisher) > 80: + # Arbitrary magic value max length. TODO(martin): better heuristic, + # but factored out; first we have to log misses. Example: + # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, + # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / + # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt + # werden" + publisher = None + + if publisher: + publisher = clean(publisher) + + # Container. For the moment, only ISSN as container. 
+ container_id = None + container_name = None + + container = attributes.get('container', {}) or {} + if container.get('type') in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container['type']) + if container.get('identifier') and container.get( + 'identifierType') == 'ISSN': + issn = container.get('identifier') + if len(issn) == 8: + issn = issn[:4] + "-" + issn[4:] + issnl = self.issn2issnl(issn) + if issnl is not None: + container_id = self.lookup_issnl(issnl) + + if container_id is None and container.get('title'): + container_name = container.get('title') + if isinstance(container_name, list): + if len(container_name) > 0: + print('[{}] too many container titles: {}'.format(doi, + len(container_name))) + container_name = container_name[0] + assert isinstance(container_name, str) + ce = fatcat_openapi_client.ContainerEntity( + issnl=issnl, + container_type=container_type, + name=container_name, + ) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + self._issnl_id_map[issnl] = container_id + else: + # TODO(martin): factor this out into a testable function. + # TODO(martin): "container_name": "№1(1) (2018)" / 10.26087/inasan.2018.1.1.013 + container_name = container.get('title') + if isinstance(container_name, list): + if len(container_name) > 0: + print('[{}] too many container titles: {}'.format(doi, + len(container_name))) + container_name = container_name[0] + + # Volume and issue. + volume = container.get('volume') + issue = container.get('issue') + + if volume: + volume = clean(volume) + + if issue: + issue = clean(issue) + + # Pages. + pages = None + + first_page = container.get('firstPage') + last_page = container.get('lastPage') + + if first_page and last_page: + try: + _ = int(first_page) < int(last_page) + pages = '{}-{}'.format(first_page, last_page) + except ValueError as err: + # TODO(martin): This is more debug than info. + # print('[{}] {}'.format(doi, err), file=sys.stderr) + pass + + if not pages and first_page: + pages = first_page + + # License. + license_slug = None + license_extra = [] + + for l in attributes.get('rightsList', []): + slug = lookup_license_slug(l.get('rightsUri')) + if slug: + license_slug = slug + license_extra.append(l) + + # Release type. Try to determine the release type from a variety of + # types supplied in datacite. The "attributes.types.resourceType" is + # uncontrolled (170000+ unique values, from "null", "Dataset" to + # "Jupyter Notebook" and "Macroseismic Data Points" or "2 days of IP + # flows in 2009") citeproc may be the closest, but not always supplied. + # Order lookup roughly by completeness of mapping. + for typeType in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'): + value = attributes.get('types', {}).get(typeType) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) + if release_type is not None: + break + + if release_type is None: + print("[{}] no mapped type: {}".format(doi, value), file=sys.stderr) + + # release_type exception: Global Biodiversity Information Facility + # publishes highly interesting datasets, but titles are mostly the same + # ("GBIF Occurrence Download" or "Occurrence Download"); set + # release_type to "stub" (CSL/FC). + if publisher == 'The Global Biodiversity Information Facility': + release_type = 'stub' + + # release_type exception: lots of "Experimental Crystal Structure Determination" + if publisher == 'Cambridge Crystallographic Data Centre': + release_type = 'entry' + + # Supplement files, e.g. 
"Additional file 1: ASE constructs in questionnaire." + if title.lower().startswith('additional file'): + release_type = 'stub' + + # Language values are varied ("ger", "es", "English", "ENG", "en-us", + # "other", ...). Try to crush it with langcodes: "It may sound to you + # like langcodes solves a pretty boring problem. At one level, that's + # right. Sometimes you have a boring problem, and it's great when a + # library solves it for you." -- TODO(martin): We need more of these. + language = None + + value = attributes.get('language', '') or '' + try: + language = pycountry.languages.lookup(value).alpha_2 + except (LookupError, AttributeError) as err: + pass + # TODO(martin): Print this on debug level, only. + # print('[{}] language lookup miss for {}: {}'.format(doi, value, err), file=sys.stderr) + + # Abstracts appear in "attributes.descriptions[].descriptionType", some + # of the observed values: "Methods", "TechnicalInfo", + # "SeriesInformation", "Other", "TableOfContents", "Abstract". The + # "Other" fields might contain references or related articles (with + # DOI). TODO(martin): maybe try to parse out some of those refs. + abstracts = [] + descs = attributes.get('descriptions', []) or [] + for desc in descs: + if not desc.get('descriptionType') == 'Abstract': + continue + if len(desc.get('description', '') or '') < 10: + continue + text = desc.get('description', '') + if len(text) > MAX_ABSTRACT_LENGTH: + text = text[:MAX_ABSTRACT_LENGTH] + " [...]" + lang = None + try: + lang = langdetect.detect(text) + except (langdetect.lang_detect_exception.LangDetectException, TypeError) as err: + print('[{}] language detection failed with {} on {}'.format(doi, err, text), file=sys.stderr) + abstracts.append( + fatcat_openapi_client.ReleaseAbstract( + mimetype="text/plain", + content=clean(text), + lang=lang, + )) + + # References and relations. Datacite include many relation types in + # "attributes.relatedIdentifiers[].relationType", e.g. + # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf", + # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion", + # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart", + # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf", + # "IsDerivedFrom", "IsSourceOf". + # + # For the moment, we only care about References. + refs, ref_index = [], 0 + + relIds = attributes.get('relatedIdentifiers', []) or [] + for rel in relIds: + if not rel.get('relationType', '') in ('References', 'Cites'): + continue + ref_extra = dict() + if rel.get('relatedIdentifierType', '') == 'DOI': + ref_extra['doi'] = rel.get('relatedIdentifier') + if not ref_extra: + ref_extra = None + refs.append( + fatcat_openapi_client.ReleaseRef( + index=ref_index, + extra=ref_extra, + )) + ref_index += 1 + + # More specific release_type via 'Reviews' relationsship. + for rel in relIds: + if rel.get('relatedIdentifierType', '') != 'Reviews': + continue + release_type = 'review' + + # Extra information. + extra_datacite = dict() + + if license_extra: + extra_datacite['license'] = license_extra + if attributes.get('subjects'): + extra_datacite['subjects'] = attributes['subjects'] + + # Include version information. + metadata_version = attributes.get('metadataVersion') or '' + schema_version = attributes.get('schemaVersion') or '' + + if metadata_version: + extra_datacite['metadataVersion'] = metadata_version + if schema_version: + extra_datacite['schemaVersion'] = schema_version + + # Include resource types. 
+ types = attributes.get('types', {}) or {} + resource_type = types.get('resourceType', '') or '' + resource_type_general = types.get('resourceTypeGeneral', '') or '' + + if resource_type: + extra_datacite['resourceType'] = resource_type + if resource_type_general: + extra_datacite['resourceTypeGeneral'] = resource_type_general + + # Include certain relations from relatedIdentifiers. Keeping the + # original structure of data here, which is a list of dicts, with + # relation type, identifer and identifier type (mostly). + relations = [] + for rel in relIds: + if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues', + 'IsVariantFormOf', 'IsSupplementTo', + 'HasVersion', 'IsMetadataFor', + 'IsNewVersionOf', 'IsIdenticalTo', + 'IsVersionOf', 'IsDerivedFrom', + 'IsSourceOf'): + relations.append(rel) + + if relations: + extra_datacite['relations'] = relations + + extra = dict() + + # "1.0.0", "v1.305.2019", "Final", "v1.0.0", "v0.3.0", "1", "0.19.0", + # "3.1", "v1.1", "{version}", "4.0", "10329", "11672", "11555", + # "v1.4.5", "2", "V1", "v3.0", "v0", "v0.6", "11124", "v1.0-beta", "1st + # Edition", "20191024", "v2.0.0", "v0.9.3", "10149", "2.0", null, + # "v0.1.1", "3.0", "1.0", "3", "v1.12.2", "20191018", "v0.3.1", "v1.0", + # "10161", "10010691", "10780", # "Presentación" + version = attributes.get('version') + + # top-level extra keys + if not container_id and container_name: + extra['container_name'] = container_name + + # Always include datacite key, even if value is empty (dict). + extra['datacite'] = extra_datacite + + # Preparation for a schema update. + if release_month: + extra['release_month'] = release_month + + extids = self.lookup_ext_ids(doi=doi) + + # Assemble release. + re = fatcat_openapi_client.ReleaseEntity( + work_id=None, + container_id=container_id, + release_type=release_type, + release_stage=release_stage, + title=title, + subtitle=subtitle, + original_title=original_language_title, + release_year=release_year, + release_date=release_date, + publisher=publisher, + ext_ids=fatcat_openapi_client.ReleaseExtIds( + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], + ), + contribs=contribs, + volume=volume, + issue=issue, + pages=pages, + language=language, + abstracts=abstracts, + refs=refs, + extra=extra, + license_slug=license_slug, + version=version, + ) + return re + + def try_update(self, re): + """ + When debug is true, write the RE to stdout, not to the database. Might + hide schema mismatch bugs. 
+ """ + if self.debug is True: + print(json.dumps(entity_to_dict(re, api_client=None))) + return False + + # lookup existing DOI (don't need to try other ext idents for crossref) + existing = None + try: + existing = self.api.lookup_release(doi=re.ext_ids.doi) + except fatcat_openapi_client.rest.ApiException as err: + if err.status != 404: + raise err + # doesn't exist, need to update + return True + + # eventually we'll want to support "updates", but for now just skip if + # entity already exists + if existing: + self.counts['exists'] += 1 + return False + + return True + + def insert_batch(self, batch): + print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + if self.insert_log_file: + with open(self.insert_log_file, 'a') as f: + for doc in batch: + json.dump(entity_to_dict(doc, api_client=None), f) + f.write('\n') + self.api.create_release_auto_batch( + fatcat_openapi_client.ReleaseAutoBatch( + editgroup=fatcat_openapi_client.Editgroup( + description=self.editgroup_description, + extra=self.editgroup_extra), + entity_list=batch)) + + def parse_datacite_creators(self, creators, role='author', set_index=True, doi=None): + """ + Parses a list of creators into a list of ReleaseContrib objects. Set + set_index to False, if the index contrib field should be left blank. + The doi parameter is only used for debugging. + """ + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. + contribs = [] + + # Names, that should be ignored right away. + name_blacklist = set(('Occdownload Gbif.Org',)) + + for i, c in enumerate(creators): + if not set_index: + i = None + nameType = c.get('nameType', '') or '' + if nameType in ('', 'Personal'): + creator_id = None + for nid in c.get('nameIdentifiers', []): + name_scheme = nid.get('nameIdentifierScheme', '') or '' + if not name_scheme.lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # TODO(martin): If creator_id is None, should we create creators? + + # If there are multiple affiliation strings, use the first one. + affiliations = c.get('affiliation', []) or [] + raw_affiliation = None + if len(affiliations) == 0: + raw_affiliation = None + else: + raw_affiliation = clean(affiliations[0]) + + name = c.get('name') + given_name = c.get('givenName') + surname = c.get('familyName') + + if name: + name = clean(name) + if not name: + continue + if name in name_blacklist: + continue + if name.lower() in UNKNOWN_MARKERS: + continue + # Unpack name, if we have an index form (e.g. 'Razis, Panos A') into 'Panos A razis'. 
+ if name: + name = index_form_to_display_name(name) + + if given_name: + given_name = clean(given_name) + if surname: + surname = clean(surname) + if raw_affiliation == '': + continue + + extra = None + + # "DataManager", "DataCurator", "ContactPerson", "Distributor", + # "RegistrationAgency", "Sponsor", "Researcher", + # "RelatedPerson", "ProjectLeader", "Editor", "Other", + # "ProjectMember", "Funder", "RightsHolder", "DataCollector", + # "Supervisor", "Producer", "HostingInstitution", "ResearchGroup" + contributorType = c.get('contributorType', '') or '' + + if contributorType: + extra = {'type': contributorType} + + contribs.append( + fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=name, + given_name=given_name, + surname=surname, + role=role, + raw_affiliation=raw_affiliation, + extra=extra, + )) + elif nameType == 'Organizational': + name = c.get('name', '') or '' + if name in UNKNOWN_MARKERS: + continue + if len(name) < 3: + continue + extra = {'organization': name} + contribs.append(fatcat_openapi_client.ReleaseContrib( + index=i, extra=extra)) + else: + print('[{}] unknown name type: {}'.format(doi, nameType), file=sys.stderr) + + return contribs + + +def lookup_license_slug(raw): + """ + TODO(martin): reuse from or combine with crossref, maybe. + """ + if not raw: + return None + raw = raw.strip().replace('http://', '//').replace('https://', '//') + if 'creativecommons.org' in raw.lower(): + raw = raw.lower() + raw = raw.replace('/legalcode', '/').replace('/uk', '') + if not raw.endswith('/'): + raw = raw + '/' + return LICENSE_SLUG_MAP.get(raw) + + +def find_original_language_title(item, min_length=4, max_questionmarks=3): + """ + Perform a few checks before returning a potential original language title. + + Example input: {'title': 'Some title', 'original_language_title': 'Some title'} + """ + if not 'original_language_title' in item: + return None + title = item.get('title') + if not title: + return None + original_language_title = item.get('original_language_title') + if isinstance(original_language_title, + str) and title != original_language_title: + if len(original_language_title) < min_length: + return None + if original_language_title.count('?') > max_questionmarks: + return None + return original_language_title + if isinstance(original_language_title, dict): + content = original_language_title.get('__content__', '') or '' + if content and content != title and not content.count( + '?') > max_questionmarks: + return content + return None + + +def parse_datacite_titles(titles): + """ + Given a list of title items from datacite, return 3-tuple (title, + original_language_title, subtitle). 
+ + Example input: [{"title": "Meeting Heterogeneity in Consumer Demand"}] + """ + title, original_language_title, subtitle = None, None, None + + if titles is None: + return title, original_language_title, subtitle + if len(titles) == 0: + return title, original_language_title, subtitle + elif len(titles) == 1: + original_language_title = find_original_language_title(titles[0]) + title = titles[0].get('title', '') or '' + title = title.strip() + if not title: + title = None + return title, original_language_title, subtitle + else: + for entry in titles: + if not title and ('titleType' not in entry + or not entry.get('titleType')): + title = entry.get('title').strip() + if not subtitle and entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + if not original_language_title: + original_language_title = find_original_language_title(entry) + + return title, original_language_title, subtitle + + +def parse_datacite_dates(dates): + """ + Given a list of date fields (under .dates), return tuple, (release_date, + release_year). + """ + release_date, release_month, release_year = None, None, None + + if not dates: + return release_date, release_month, release_year + + if not isinstance(dates, list): + raise ValueError('expected a list of date items') + + # Observed values: "Available", "Submitted", "Valid", "Issued", "Accepted", + # "Collected", "Updated", "Copyrighted", "Created" + # Ignored for now: "Collected", "Issued" + date_type_prio = ( + 'Valid', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Created', + 'Updated', + ) + + # We need to note the granularity, since a string like "2019" would be + # parsed into "2019-01-01", even though the month is unknown. Use 3 + # granularity types: 'y', 'm', 'd'. + Pattern = collections.namedtuple('Pattern', 'layout granularity') + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ( + Pattern('%Y-%m-%d', 'd'), + Pattern('%Y-%m', 'm'), + Pattern('%Y-%m-%dT%H:%M:%SZ', 'd'), + Pattern('%Y-%m-%dT%H:%M:%S', 'd'), + Pattern('%Y', 'y'), + ) + + def parse_item(item): + result, value, year_only = None, item.get('date', ''), False + release_date, release_month, release_year = None, None, None + + for layout, granularity in common_patterns: + try: + result = datetime.datetime.strptime(value, layout) + except ValueError: + continue + else: + if granularity == 'y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + parser = dateparser.DateDataParser() + try: + # Results in a dict with keys: date_obj, period, locale. + parse_result = parser.get_date_data(value) + + # A datetime object, later we need a date, only. + result = parse_result['date_obj'] + if result is not None: + if parse_result['period'] == 'year': + return None, None, result.year + elif parse_result['period'] == 'month': + return None, result.month, result.year + else: + return result.date(), result.month, result.year + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), + file=sys.stderr) + + if result is None: + # Unparsable date. 
+ return release_date, release_month, release_year + + if granularity != 'y': + release_date = result.date() + release_year = result.year + if granularity in ('m', 'd'): + release_month = result.month + + return release_date, release_month, release_year + + today = datetime.date.today() + + for prio in date_type_prio: + for item in dates: + if not item.get('dateType') == prio: + continue + + release_date, release_month, release_year = parse_item(item) + if release_date is None and release_year is None: + continue + + if release_year < 1000 or release_year > today.year + 5: + # Skip possibly bogus dates. + release_year = None + continue + break + else: + continue + break + + if release_date is None and release_year is None: + for item in dates: + release_date, release_month, release_year = parse_item(item) + if release_year or release_date: + break + + return release_date, release_month, release_year + +def index_form_to_display_name(s): + """ + Try to convert an index form name, like 'Razis, Panos A' into display_name, + e.g. 'Panos A Razis'. + """ + if ',' not in s: + return s + skip_on_chars = ['(', ')', '*'] + for char in skip_on_chars: + if char in s: + return s + if s.count(',') > 1: + # "Dr. Hina, Dr. Muhammad Usman Shahid, Dr. Muhammad Zeeshan Khan" + return s + + # Not names, but sprinkled in fields where authors live. + stopwords = [s.lower() for s in ( + 'Archive', + 'Collection', + 'Coordinator', + 'Department', + 'Germany', + 'International', + 'National', + 'Netherlands', + 'Office', + 'Organisation', + 'Organization', + 'Service', + 'Services', + 'United States', + 'University', + 'Verein', + 'Volkshochschule', + )] + lower = s.lower() + for stop in stopwords: + if stop in lower: + return s + + a, b = s.split(',') + return '{} {}'.format(b.strip(), a.strip()) |
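The module docstring notes that fields with their own parse_datacite_* functions can be tested more easily. A short, illustrative check of a few of the helpers defined above; the example inputs are invented for this sketch:

    from fatcat_tools.importers.datacite import (
        index_form_to_display_name,
        parse_datacite_dates,
        parse_datacite_titles,
    )

    # Index-form personal names are flipped into display form; strings that
    # contain organization-like stopwords are passed through unchanged.
    assert index_form_to_display_name('Razis, Panos A') == 'Panos A Razis'
    assert index_form_to_display_name('Jena, University of') == 'Jena, University of'

    # 'Issued' is not in date_type_prio, so the fallback loop handles it; a
    # bare year string like '2019' would yield (None, None, 2019) instead,
    # because of the granularity handling.
    date, month, year = parse_datacite_dates([{'date': '2019-12-01', 'dateType': 'Issued'}])
    assert (str(date), month, year) == ('2019-12-01', 12, 2019)

    # A single untyped title is used as-is; no original-language title, no subtitle.
    assert parse_datacite_titles([{'title': 'Meeting Heterogeneity in Consumer Demand'}]) == \
        ('Meeting Heterogeneity in Consumer Demand', None, None)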