| author | Martin Czygan <martin.czygan@gmail.com> | 2019-12-09 01:03:43 +0100 |
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:31 +0100 |
| commit | 4a82a0763bf927248f22e47ab5187af4beff83ee (patch) | |
| tree | af86801bfb77a40bc8b409fa736b40c581fe970c | |
| parent | 54a2c83c0a5e8ccd4eec7c18eac715bdbb3eb62e (diff) | |
| download | fatcat-4a82a0763bf927248f22e47ab5187af4beff83ee.tar.gz, fatcat-4a82a0763bf927248f22e47ab5187af4beff83ee.zip | |
datacite: importer skeleton
* contributors, title, date, publisher, container, license
Field and value analysis via https://github.com/miku/indigo.
| mode | path | insertions |
|---|---|---|
| -rwxr-xr-x | python/fatcat_import.py | 30 |
| -rw-r--r-- | python/fatcat_tools/importers/__init__.py | 1 |
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 458 |
| -rw-r--r-- | python/tests/import_datacite.py | 25 |

4 files changed, 514 insertions, 0 deletions
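The importer follows the repo's usual two-part pattern: an `EntityImporter` subclass (`DataciteImporter`) fed records by a pusher (`JsonLinePusher` or `KafkaJsonPusher`). A minimal sketch of driving it programmatically, mirroring the new `run_datacite()` in the diff below — the endpoint and file names are placeholders, and the `public_api` helper from `fatcat_tools` is an assumption:

```python
# Hypothetical usage sketch, not part of this commit.
from fatcat_tools import public_api  # assumed helper returning an API client
from fatcat_tools.importers import DataciteImporter, JsonLinePusher

api = public_api('http://localhost:9411/v0')  # placeholder endpoint
dci = DataciteImporter(api, open('ISSN-to-ISSN-L.txt', 'r'), edit_batch_size=50)

# One datacite v2 API document per line.
JsonLinePusher(dci, open('datacite.ndjson', 'r')).run()
```

Given the argparse wiring added below, the equivalent CLI invocation would be `./fatcat_import.py datacite datacite.ndjson ISSN-to-ISSN-L.txt` (file names illustrative).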
```diff
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 8d82dab3..d7651792 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -166,6 +166,17 @@ def run_cdl_dash_dat(args):
     print("fileset id: {}".format(fs.ident))
     print("link: https://fatcat.wiki/fileset/{}".format(fs.ident))
 
+def run_datacite(args):
+    dci = DataciteImporter(args.api,
+        args.issn_map_file,
+        edit_batch_size=args.batch_size,
+        bezerk_mode=args.bezerk_mode)
+    if args.kafka_mode:
+        KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite",
+            "fatcat-import", consume_batch_size=args.batch_size).run()
+    else:
+        JsonLinePusher(dci, args.json_file).run()
+
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -439,6 +450,25 @@ def main():
         type=str,
         help="use existing editgroup (instead of creating a new one)")
 
+    sub_datacite = subparsers.add_parser('datacite',
+        help="import datacite.org metadata")
+    sub_datacite.add_argument('json_file',
+        help="File with jsonlines from datacite.org v2 API to import from",
+        default=sys.stdin, type=argparse.FileType('r'))
+    sub_datacite.add_argument('issn_map_file',
+        help="ISSN to ISSN-L mapping file",
+        default=None, type=argparse.FileType('r'))
+    sub_datacite.add_argument('--kafka-mode',
+        action='store_true',
+        help="consume from kafka topic (not stdin)")
+    sub_datacite.add_argument('--bezerk-mode',
+        action='store_true',
+        help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+    sub_datacite.set_defaults(
+        func=run_datacite,
+        auth_var="FATCAT_API_AUTH_TOKEN",
+    )
+
     args = parser.parse_args()
     if not args.__dict__.get("func"):
         print("tell me what to do!")
diff --git a/python/fatcat_tools/importers/__init__.py b/python/fatcat_tools/importers/__init__.py
index bb9c5b17..d936605f 100644
--- a/python/fatcat_tools/importers/__init__.py
+++ b/python/fatcat_tools/importers/__init__.py
@@ -14,6 +14,7 @@ To run an import you combine two classes; one each of:
 
 from .common import EntityImporter, JsonLinePusher, LinePusher, CsvPusher, SqlitePusher, Bs4XmlFilePusher, Bs4XmlLargeFilePusher, Bs4XmlLinesPusher, Bs4XmlFileListPusher, KafkaJsonPusher, make_kafka_consumer, clean, is_cjk, LANG_MAP_MARC
 from .crossref import CrossrefImporter, CROSSREF_TYPE_MAP, lookup_license_slug
+from .datacite import DataciteImporter
 from .jalc import JalcImporter
 from .jstor import JstorImporter
 from .arxiv import ArxivRawImporter
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
new file mode 100644
index 00000000..faa8e2be
--- /dev/null
+++ b/python/fatcat_tools/importers/datacite.py
@@ -0,0 +1,458 @@
+"""
+WIP: Importer for datacite.org data.
+
+Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
+"""
+
+import datetime
+import json
+import sys
+
+import dateparser
+import fatcat_openapi_client
+
+from .common import EntityImporter
+
+# https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
+CONTAINER_TYPE_MAP = {
+    'Journal': 'journal',
+    'Series': 'journal',
+    'Book Series': 'book-series',
+}
+
+# TODO(martin): merge this with other maps, maybe.
+LICENSE_SLUG_MAP = {
+    "//creativecommons.org/licenses/by/2.0": "CC-BY",
+    "//creativecommons.org/licenses/by/2.0/uk/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0": "CC-BY",
+    "//creativecommons.org/licenses/by/3.0/us": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.de": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/deed.en_US": "CC-BY",
+    "//creativecommons.org/licenses/by/4.0/legalcode": "CC-BY",
+    "//creativecommons.org/licenses/by-nc/2.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/3.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc/4.0/legalcode": "CC-BY-NC",
+    "//creativecommons.org/licenses/by-nc-nd/3.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/3.0/gr": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-nd/4.0/legalcode": "CC-BY-NC-ND",
+    "//creativecommons.org/licenses/by-nc-sa/4.0": "CC-BY-NC-SA",
+    "//creativecommons.org/licenses/by-nd/4.0": "CC-BY-ND",
+    "//creativecommons.org/licenses/by-sa/3.0/de": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/3.0/gr": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0": "CC-BY-SA",
+    "//creativecommons.org/licenses/by-sa/4.0/legalcode": "CC-BY-SA",
+    "//creativecommons.org/licenses/CC-BY/4.0": "CC-BY",
+    "//creativecommons.org/licenses/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0": "CC-0",
+    "//creativecommons.org/publicdomain/zero/1.0/legalcode": "CC-0",
+    "//opensource.org/licenses/MIT": "MIT",
+    "//www.elsevier.com/open-access/userlicense/1.0": "ELSEVIER-USER-1.0",
+    "//www.gnu.org/licenses/gpl-3.0.en.html": "GPLv3",
+    "//www.gnu.org/licenses/old-licenses/gpl-2.0.en.html": "GPLv2",
+    "//www.karger.com/Services/SiteLicenses": "KARGER",
+    "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
+    "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
+    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1",  # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/MIT": "MIT",
+    # "http://royalsocietypublishing.org/licence": "",  # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
+    # "http://rsc.li/journals-terms-of-use": "RSC",
+    # "http://www.fu-berlin.de/sites/refubium/rechtliches/Nutzungsbedingungen": "",  # 53 UrhG.
+ # "http://www.nrcresearchpress.com/page/about/CorporateTextAndDataMining": "", + # "http://www.springer.com/tdm": "", + # "https://cds.unistra.fr/vizier-org/licences_vizier.html": "", # Maybe try to "SPN" those: https://web.archive.org/web/*/https://cds.unistra.fr/vizier-org/licences_vizier.html + # "https://link.aps.org/licenses/aps-default-accepted-manuscript-license": "", + # "https://oparu.uni-ulm.de/xmlui/license_opod_v1": "", + # "https://publikationen.bibliothek.kit.edu/kitopen-lizenz": "", + # "https://rightsstatements.org/page/InC/1.0?language=en": "", + # "https://services.ceda.ac.uk/cedasite/register/info": "", + # "https://wdc.dlr.de/ndmc/userfiles/file/NDMC-Data_Sharing_Principles.pdf": "", # 404 + # "https://www.cambridge.org/core/terms": "", + # "https://www.elsevier.com/tdm/userlicense/1.0", + # "info:eu-repo/semantics/closedAccess": "", # https://wiki.surfnet.nl/display/standards/info-eu-repo/#info-eu-repo-AccessRights + # "info:eu-repo/semantics/embargoedAccess": "", + # "info:eu-repo/semantics/openAccess": "", +} + +class DataciteImporter(EntityImporter): + """ + Importer for datacite records. TODO(martin): Do we need issn_map_file? + """ + + def __init__(self, api, issn_map_file, **kwargs): + + eg_desc = kwargs.get('editgroup_description', + "Automated import of Datacite DOI metadata, harvested from REST API") + eg_extra = kwargs.get('editgroup_extra', dict()) + eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter') + super().__init__(api, + issn_map_file=issn_map_file, + editgroup_description=eg_desc, + editgroup_extra=eg_extra, + **kwargs) + + self.create_containers = kwargs.get('create_containers', True) + self.read_issn_map_file(issn_map_file) + + def parse_record(self, obj): + """ + TODO(martin): Map datacite to RE. + + WIP, notes: + + * Many subjects, should they end up in extra? + * attributes.creators and attributes.contributors + + $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr + 3963663 "Personal" + 289795 null + 8892 "Organizational" + + Shall we use issued, available? + + { + "date": "2011-11-18", + "dateType": "Accepted" + }, + { + "date": "2011-11-18", + "dateType": "Available" + }, + { + "date": "2011-11-07", + "dateType": "Copyrighted" + }, + { + "date": "2011-11-18", + "dateType": "Issued" + }, + { + "date": "2011-11-07", + "dateType": "Issued" + } + + TODO(martin): Quick analysis of dates and stages. 
+ """ + + if 'attributes' not in obj: + return None + + attributes = obj['attributes'] + + # > Contributors + # + # "attributes.creators[].contributorType": [ + # "author" + # ], + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ + # "LCNA", + # "GND", + # "email", + # "NAF", + # "OSF", + # "RRID", + # "ORCID", + # "SCOPUS", + # "NRCPID", + # "schema.org", + # "GRID", + # "MGDS", + # "VIAF", + # "JACoW-ID" + # ], + # + # "https://orcid.org/0000-0002-9902-738X", + # "http://jacow.org/JACoW-00001280", + # "Wiebe_Peter", + # "https://osf.io/https://osf.io/kjfuy/", + # "http://www.viaf.org176549220", + # "2239", + # "Jeffries_Martin", + # "https://orcid.org/0000-0002-1493-6630", + # "0000-0002-6233-612X", + # + # "creators": [ + # { + # "name": "Bögli, Hans", + # "nameType": "Personal", + # "givenName": "Hans", + # "familyName": "Bögli", + # "affiliation": [] + # } + # ], + + contribs = [] + + for i, c in enumerate(attributes['creators']): + if not c.get('nameType') == 'Personal': + continue + creator_id = None + for nid in c.get('nameIdentifiers', []): + if not nid.get('nameIdentifierScheme').lower() == "orcid": + continue + orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '') + if not orcid: + continue + creator_id = self.lookup_orcid(orcid) + # If creator_id is None, should we create creators? + contribs.append(fatcat_openapi_client.ReleaseContrib( + creator_id=creator_id, + index=i, + raw_name=c.get('name'), + given_name=c.get('givenName'), + surname=c.get('familyName'), + )) + + # > Title + # + # "attributes.titles[].titleType": [ + # "AlternativeTitle", + # "Other", + # "Subtitle", + # null, + # "TranslatedTitle" + # ], + title, subtitle = None, None + + for entry in attributes.get('titles', []): + if not title and 'titleType' not in entry: + title = entry.get('title').strip() + if entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title').strip() + + # > Dates + # + # "attributes.dates[].dateType": [ + # "Accepted", + # "Available" + # "Collected", + # "Copyrighted", + # "Created", + # "Issued", + # "Submitted", + # "Updated", + # "Valid", + # ], + # + # Different documents have different dates defined. Choose the topmost + # available from prio list. + date_type_prio = ( + 'Valid', + 'Issued', + 'Available', + 'Accepted', + 'Submitted', + 'Copyrighted', + 'Collected', + 'Created', + 'Updated', + ) + + release_year, release_date = None, None + for prio in date_type_prio: + dates = attributes.get('dates', []) or [] # Never be None. + for item in dates: + if not item.get('dateType') == prio: + continue + result = dateparser.parse(item.get('date')) + if result is None: + # Unparsable date. + continue + release_date = result + release_year = result.year + if 1000 < release_year < datetime.date.today().year + 5: + # Skip possibly bogus dates. + continue + break + else: + continue + break + + # > Publisher + # + # A few NA values. A few bogus values. + # + publisher = attributes.get('publisher') + + if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'): + publisher = None + if publisher is not None and len(publisher) > 80: + # Arbitrary magic value, TODO(martin): better heuristic. + # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller, + # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt, + # Nutzungsrechte müssen durch den Nutzer abgeklärt werden", + # TODO(martin): log misses. + publisher = None + + # > Container + # + # For the moment, only ISSN as container. 
+ # + # "container": { + # "type": "Journal", + # "issue": "8", + # "title": "Angewandte Chemie International Edition", + # "volume": "57", + # "lastPage": "2080", + # "firstPage": "2077", + # "identifier": "14337851", + # "identifierType": "ISSN" + # }, + # + # "attributes.container.type": [ + # "DataRepository", + # "Journal", + # "Series", + # "Book Series" + # ], + # + # "attributes.container.identifierType": [ + # "Handle", + # "ISBN", + # "LISSN", + # "DOI", + # "EISSN", + # "URL", + # "ISSN" + # ], + + container_id = None + container = attributes.get('container', {}) or {} + if container.get('type') in CONTAINER_TYPE_MAP.keys(): + container_type = CONTAINER_TYPE_MAP.get(container['type']) + if container.get('identifier') and container.get('identifierType') == 'ISSN': + issn = container.get('identifier') + if len(issn) == 8: + issn = issn[:4] + "-" + issn[4:] + issnl = self.issn2issnl(issn) + container_id = self.lookup_issnl(issnl) + + if container_id is None and container.get('title'): + ce = fatcat_openapi_client.ContainerEntity( + issnl=issnl, + container_type=container_type, + name=container.get('title'), + ) + ce_edit = self.create_container(ce) + container_id = ce_edit.ident + self._issnl_id_map[issnl] = container_id + + # > License + # + # attributes.rightsList[].rightsUri + # attributes.rightsList[].rights + # attributes.rightsList[].lang + # + + license_slug = None + license_extra = [] + for l in attributes.get('rightsList', []): + slug = lookup_license_slug(l.get('rightsUri')) + if slug: + license_slug = slug + license_extra.append(l) + + # > Release type. + # + # Datacite has some fine granular typing (e.g. "Supplementary + # Collection of Datasets", "Taxonomic treatment", "blog_entry", ... + # + # Additional, coarse: resourceTypeGeneral + # + # "attributes.types.resourceTypeGeneral": [ + # "Image", + # "Dataset", + # "PhysicalObject", + # "Collection", + # "Text", + # "Sound", + # "InteractiveResource", + # "Event", + # "Software", + # "Other", + # "Workflow", + # "Audiovisual" + # ], + + # > Extra information. + extra, extra_datacite = dict(), dict() + if license_extra: + extra_datacite['license'] = license_extra + + if extra_datacite: + extra['datacite'] = extra_datacite + + # https://guide.fatcat.wiki/entity_release.html + re = fatcat_openapi_client.ReleaseEntity( + work_id=None, + container_id=container_id, + release_type=None, + release_stage=None, + title=title, # attributes.titles, various titleType + subtitle=subtitle, + original_title=title, # AlternativeTitle? + release_year=release_year, # publicationYear + release_date=release_date, # date issues/available? + publisher=publisher, # attributes.publisher + ext_ids=fatcat_openapi_client.ReleaseExtIds( + doi=attributes.get('doi'), # attributes.doi, + # Can we add handle.net link? + ), + contribs=contribs, + volume=None, + issue=None, + pages=None, + language=None, + abstracts=None, + refs=None, + extra=extra, + license_slug=license_slug, + ) + return re + + def try_update(self, re, debug=True): + if debug is True: + # print(type(re)) + print(json.dumps(re.to_dict(), default=extended_encoder)) + return + return False + + def insert_batch(self, batch): + # Debugging. + for item in batch: + print(item) + return + + # Orig. 
+        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
+            editgroup=fatcat_openapi_client.Editgroup(
+                description=self.editgroup_description,
+                extra=self.editgroup_extra),
+            entity_list=batch))
+
+def extended_encoder(value):
+    """
+    Can be used with json.dumps(value, default=extended_encoder) to serialize
+    values not serializable by default.
+    https://docs.python.org/3/library/json.html#basic-usage
+    """
+    if isinstance(value, (datetime.datetime, datetime.date)):
+        return value.isoformat()
+    if isinstance(value, set):
+        return list(value)
+
+def lookup_license_slug(raw):
+    """
+    TODO(martin): reuse from crossref, maybe.
+    """
+    if not raw:
+        return None
+    raw = raw.strip().replace('http://', '//').replace('https://', '//')
+    if 'creativecommons.org' in raw.lower():
+        raw = raw.lower()
+        raw = raw.replace('/legalcode', '').replace('/uk', '')
+    # Map keys carry no trailing slash.
+    raw = raw.rstrip('/')
+    return LICENSE_SLUG_MAP.get(raw)
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
new file mode 100644
index 00000000..0bbaba2e
--- /dev/null
+++ b/python/tests/import_datacite.py
@@ -0,0 +1,25 @@
+"""
+Test datacite importer.
+
+Datacite is an aggregator, hence inputs are quite varied.
+
+Here is a small sample of ID types taken from a sample:
+
+    497344 "DOI"
+     65013 "URL"
+     22210 "CCDC"
+     17853 "GBIF"
+     17635 "Other"
+     11474 "uri"
+      9170 "Publisher ID"
+      7775 "URN"
+      6196 "DUCHAS"
+      5624 "Handle"
+      5056 "publisherId"
+
+A nice, not-yet-existing tool (maybe named indigo) would do the following:
+
+    $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md
+
+TODO(martin): Write tests.
+"""
```
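To make the two module-level helpers concrete, a small illustrative check — assuming `python/` is on the import path; the inputs are made up:

```python
# Illustrative only; exercises helpers defined in the new module above.
import datetime
import json

from fatcat_tools.importers.datacite import extended_encoder, lookup_license_slug

# lookup_license_slug drops the URL scheme and, for Creative Commons URLs,
# strips "/legalcode" and "/uk" suffixes before the map lookup.
assert lookup_license_slug('https://creativecommons.org/licenses/by/4.0/legalcode') == 'CC-BY'
assert lookup_license_slug('http://www.opensource.org/licenses/MIT') == 'MIT'
assert lookup_license_slug('info:eu-repo/semantics/openAccess') is None  # not mapped (yet)
assert lookup_license_slug(None) is None

# extended_encoder lets json.dumps serialize dates and sets, as used by
# try_update() for debug output.
print(json.dumps({'release_date': datetime.date(2019, 12, 9), 'tags': {'wip'}},
                 default=extended_encoder))
# -> {"release_date": "2019-12-09", "tags": ["wip"]}
```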