From 4a82a0763bf927248f22e47ab5187af4beff83ee Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 9 Dec 2019 01:03:43 +0100 Subject: datacite: importer skeleton * contributors, title, date, publisher, container, license Field and value analysis via https://github.com/miku/indigo. --- python/fatcat_import.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 8d82dab3..d7651792 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -166,6 +166,17 @@ def run_cdl_dash_dat(args): print("fileset id: {}".format(fs.ident)) print("link: https://fatcat.wiki/fileset/{}".format(fs.ident)) +def run_datacite(args): + dci = DataciteImporter(args.api, + args.issn_map_file, + edit_batch_size=args.batch_size, + bezerk_mode=args.bezerk_mode) + if args.kafka_mode: + KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite", + "fatcat-import", consume_batch_size=args.batch_size).run() + else: + JsonLinePusher(dci, args.json_file).run() + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -439,6 +450,25 @@ def main(): type=str, help="use existing editgroup (instead of creating a new one)") + sub_datacite = subparsers.add_parser('datacite', + help="import datacite.org metadata") + sub_datacite.add_argument('json_file', + help="File with jsonlines from datacite.org v2 API to import from", + default=sys.stdin, type=argparse.FileType('r')) + sub_datacite.add_argument('issn_map_file', + help="ISSN to ISSN-L mapping file", + default=None, type=argparse.FileType('r')) + sub_datacite.add_argument('--kafka-mode', + action='store_true', + help="consume from kafka topic (not stdin)") + sub_datacite.add_argument('--bezerk-mode', + action='store_true', + help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") + sub_datacite.set_defaults( + func=run_datacite, + auth_var="FATCAT_API_AUTH_TOKEN", + ) + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") -- cgit v1.2.3 From 403b1a2d4591d878145a021a7c1e15e2d60c47d8 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 18 Dec 2019 20:21:49 +0100 Subject: improve datacite field mapping and import The current version succeeded in importing a random sample of 100000 records (0.5%) from datacite. The --debug (write JSON to stdout) and --insert-log-file (log batch before committing to db) flags are temporarily added to help debugging. Add a few unit tests. Some edge cases: a) Existing keys without a value require a slightly awkward: ``` titles = attributes.get('titles', []) or [] ``` b) There can be 0, 1, or more titles (the first one wins). c) Date handling is probably not ideal. Datacite has a potentially fine-grained list of dates. The test case (tests/files/datacite_sample.jsonl) refers to https://ssl.fao.org/glis/doi/10.18730/8DYM9, which has date (main descriptor) 1986. The datacite record contains: 2017 (publicationYear, probably the year of record creation in the reference system), 1978-06-03 (collected, e.g. experimental sample), and 1986 ("Accepted"). The online version of the resource lists one more date (2019-06-05 10:14:43, by WIEWS update).
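Edge cases (a) and (c) can be reproduced in isolation. A minimal, runnable sketch (pick_date is an illustrative stand-in for the typed-date priority loop in parse_record, not the importer's exact code):

```python
# Edge case (a): a key that is present but holds null defeats dict.get's default.
attributes = {'titles': None}
assert attributes.get('titles', []) is None
assert (attributes.get('titles', []) or []) == []

# Edge case (c), simplified: choose one date from Datacite's typed date list
# by priority. For 10.18730/8DYM9 this prefers "Accepted" (1986) over
# "Collected" (1978-06-03) and "Issued" (2017).
def pick_date(dates, prio=('Valid', 'Available', 'Accepted', 'Submitted')):
    for date_type in prio:
        for item in dates:
            if item.get('dateType') == date_type:
                return item.get('date')
    return None

dates = [
    {'date': '1986', 'dateType': 'Accepted'},
    {'date': '1978-06-03', 'dateType': 'Collected'},
    {'date': '2017', 'dateType': 'Issued'},
]
assert pick_date(dates) == '1986'
```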
--- python/fatcat_import.py | 15 +- python/fatcat_tools/importers/datacite.py | 180 ++++++++++++++++++------ python/tests/files/datacite_1k_records.jsonl.gz | Bin 0 -> 684605 bytes python/tests/files/datacite_sample.jsonl | 1 + python/tests/import_datacite.py | 108 +++++++++++--- 5 files changed, 245 insertions(+), 59 deletions(-) create mode 100644 python/tests/files/datacite_1k_records.jsonl.gz create mode 100644 python/tests/files/datacite_sample.jsonl (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index d7651792..90bb01a1 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -170,7 +170,10 @@ def run_datacite(args): dci = DataciteImporter(args.api, args.issn_map_file, edit_batch_size=args.batch_size, - bezerk_mode=args.bezerk_mode) + bezerk_mode=args.bezerk_mode, + debug=args.debug, + lang_detect=args.lang_detect, + insert_log_file=args.insert_log_file) if args.kafka_mode: KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite", "fatcat-import", consume_batch_size=args.batch_size).run() @@ -464,6 +467,16 @@ def main(): sub_datacite.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)") + sub_datacite.add_argument('--debug', + action='store_true', + help="write converted JSON to stdout") + sub_datacite.add_argument('--lang-detect', + action='store_true', + help="try to detect language (slow)") + sub_datacite.add_argument('--insert-log-file', + default='', + type=str, + help="write inserted documents into file (for debugging)") sub_datacite.set_defaults( func=run_datacite, auth_var="FATCAT_API_AUTH_TOKEN", diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 4e117dde..9774e334 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8 from .common import EntityImporter import dateparser -import langcodes import datetime -import langdetect import fatcat_openapi_client +import hashlib import json +import langcodes +import langdetect +import sqlite3 import sys -import hashlib # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary CONTAINER_TYPE_MAP = { @@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = { class DataciteImporter(EntityImporter): """ - Importer for datacite records. TODO(martin): Do we need issn_map_file? + Importer for datacite records. 
""" - def __init__(self, api, issn_map_file, **kwargs): + def __init__(self, api, issn_map_file, debug=False, lang_detect=False, + insert_log_file=None, **kwargs): eg_desc = kwargs.get('editgroup_description', "Automated import of Datacite DOI metadata, harvested from REST API") @@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter): **kwargs) self.create_containers = kwargs.get('create_containers', True) + extid_map_file = kwargs.get('extid_map_file') + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri), file=sys.stderr) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map", file=sys.stderr) + self.read_issn_map_file(issn_map_file) + self.debug = debug + self.lang_detect = lang_detect + self.insert_log_file = insert_log_file + + print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr) + + def lookup_ext_ids(self, doi): + """ + Return dictionary of identifiers refering to the same things as the given DOI. + """ + if self.extid_map_db is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3], + # TODO: + arxiv_id=None, + jstor_id=None, + ) def parse_record(self, obj): """ @@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter): attributes = obj['attributes'] - # Contributors. Many nameIdentifierSchemes, we do not use yet: - # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [ - # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS", - # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ], + # Contributors. Many nameIdentifierSchemes, we do not use (yet): + # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": + # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", + # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"]. contribs = [] for i, c in enumerate(attributes['creators']): - if not c.get('nameType') == 'Personal': + if 'nameType' in c and not c.get('nameType') == 'Personal': continue creator_id = None for nid in c.get('nameIdentifiers', []): @@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter): if not orcid: continue creator_id = self.lookup_orcid(orcid) - # If creator_id is None, should we create creators? + # TODO(martin): If creator_id is None, should we create creators? contribs.append(fatcat_openapi_client.ReleaseContrib( creator_id=creator_id, index=i, @@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter): # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle" title, subtitle = None, None - for entry in attributes.get('titles', []): - if not title and 'titleType' not in entry: - title = entry.get('title').strip() - if entry.get('titleType') == 'Subtitle': - subtitle = entry.get('title').strip() + titles = attributes.get('titles', []) or [] + if len(titles) == 0: + print('skipping record w/o title: {}'.format(obj), file=sys.stderr) + return False + elif len(titles) == 1: + # We do not care about the type then. 
+ title = titles[0].get('title', '') or '' + title = title.strip() + else: + for entry in titles: + if not title and ('titleType' not in entry or not entry.get('titleType')): + title = entry.get('title').strip() + if entry.get('titleType') == 'Subtitle': + subtitle = entry.get('title', '').strip() + + if not title: + print('skipping record w/o title: {}'.format(obj), file=sys.stderr) + return False + + if not subtitle: + subtitle = None # Dates. A few internal dates (registered, created, updated) and # published (0..2554). We try to work with typed date list, in @@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter): # "Updated", "Valid". release_year, release_date = None, None + # Ignore: Collected, Issued. date_type_prio = ( 'Valid', - 'Issued', 'Available', 'Accepted', 'Submitted', 'Copyrighted', - 'Collected', 'Created', 'Updated', ) @@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter): for item in dates: if not item.get('dateType') == prio: continue - try: - result = dateparser.parse(item.get('date')) - except TypeError as err: - print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr) - continue + + # Parse out date, use common patterns first, fallback to dateparser. + result, value, year_only = None, item.get('date', ''), False + + # Before using (expensive) dateparser, try a few common patterns. + common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S') + + for pattern in common_patterns: + try: + result = datetime.datetime.strptime(value, pattern) + except ValueError: + continue + else: + if pattern == '%Y': + year_only = True + break + + if result is None: + print('fallback for {}'.format(value), file=sys.stderr) + try: + result = dateparser.parse(value) + except TypeError as err: + print("{} date parsing failed with: {}".format(value, err), file=sys.stderr) + continue + if result is None: # Unparsable date. continue - release_date = result + if not year_only: + release_date = result.date() release_year = result.year if 1000 < release_year < datetime.date.today().year + 5: # Skip possibly bogus dates. @@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter): container_id = self.lookup_issnl(issnl) if container_id is None and container.get('title'): + container_title = container.get('title') + if isinstance(container_title, list): + if len(container_title) > 0: + print('too many container titles: {}'.format(len(container_title))) + container_title = container_title[0] + assert isinstance(container_title, str) ce = fatcat_openapi_client.ContainerEntity( issnl=issnl, container_type=container_type, - name=container.get('title'), + name=container_title, ) ce_edit = self.create_container(ce) container_id = ce_edit.ident @@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter): # closest, but not always supplied. for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'): value = attributes.get('types', {}).get(typeType) - release_type = DATACITE_TYPE_MAP.get(value) + release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value) if release_type is not None: break if release_type is None: - print("datacite unmapped type: {}".format(release_type), file=sys.stderr) + print("no mapped type: {}".format(value), file=sys.stderr) # Language values are varied ("ger", "es", "English", "ENG", "en-us", # "other", ...). 
Try to crush it with langcodes: "It may sound to you @@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter): try: language = langcodes.get(value).language except langcodes.tag_parser.LanguageTagError: - print('could not determine language: {}'.format(value), file=sys.stderr) + pass # Abstracts appear in "attributes.descriptions[].descriptionType", some # of the observed values: "Methods", "TechnicalInfo", @@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter): # "Other" fields might contain references or related articles (with # DOI). TODO(martin): maybe try to parse out some of those refs. abstracts = [] - - for desc in attributes.get('descriptions', []): + descs = attributes.get('descriptions', []) or [] + for desc in descs: if not desc.get('descriptionType') == 'Abstract': continue if len(desc.get('description', '')) < 10: @@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter): text = desc.get('description') sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest() lang = None - try: - lang = langdetect.detect(text) - except langdetect.lang_detect_exception.LangDetectException: - pass + if self.lang_detect: + try: + lang = langdetect.detect(text) + except langdetect.lang_detect_exception.LangDetectException as err: + print('language detection failed: {}'.format(err), file=sys.stderr) abstracts.append(fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", content=text, @@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter): # For the moment, we only care about References. refs, ref_index = [], 0 - for rel in attributes.get('relatedIdentifiers', []): + relIds = attributes.get('relatedIdentifiers', []) or [] + for rel in relIds: if not rel.get('relationType') == 'References': continue ref_extra = dict() @@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter): if extra_datacite: extra['datacite'] = extra_datacite + doi = attributes.get('doi', '').lower() + extids = self.lookup_ext_ids(doi=doi) + # Assemble release. re = fatcat_openapi_client.ReleaseEntity( work_id=None, @@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter): release_date=release_date, publisher=publisher, ext_ids=fatcat_openapi_client.ReleaseExtIds( - doi=attributes.get('doi'), + doi=doi, + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + core=extids['core_id'], + arxiv=extids['arxiv_id'], + jstor=extids['jstor_id'], ), contribs=contribs, volume=volume, @@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter): ) return re - def try_update(self, re, debug=True): + def try_update(self, re): """ - When debug is true, write the RE to stdout. + When debug is true, write the RE to stdout, not to the database. Might + hide schema mismatch bugs. 
""" - if debug is True: + if self.debug is True: print(json.dumps(re.to_dict(), default=extended_json_encoder)) return False @@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter): return True def insert_batch(self, batch): + print('inserting batch ({})'.format(len(batch)), file=sys.stderr) + if self.insert_log_file: + with open(self.insert_log_file, 'a') as f: + for doc in batch: + json.dump(doc.to_dict(), f, default=extended_json_encoder) + f.write('\n') self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch( editgroup=fatcat_openapi_client.Editgroup( - description=self.editgroup_description, - extra=self.editgroup_extra), + description=self.editgroup_description, + extra=self.editgroup_extra), entity_list=batch)) def extended_json_encoder(value): @@ -491,6 +588,7 @@ def extended_json_encoder(value): return value.isoformat() if isinstance(value, set): return list(value) + raise TypeError('cannot encode type: {}'.format(type(value))) def lookup_license_slug(raw): """ diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz new file mode 100644 index 00000000..28ea6e37 Binary files /dev/null and b/python/tests/files/datacite_1k_records.jsonl.gz differ diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl new file mode 100644 index 00000000..dba3e267 --- /dev/null +++ b/python/tests/files/datacite_sample.jsonl @@ -0,0 +1 @@ +{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.
Taxonomy: Triticum turgidum L. subsp. durum (Desf.) Husn.\r\nCommon name(s): Wheat\r\nConserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon\r\nLocal sample unique identifier: 97090\r\nMethod of creation: Acquisition\r\nDate: 1986\r\nBiological status: Traditional cultivar/landrace\r\nOther identifiers: ICDW 20791\r\nMLS status: Included\r\nHistorical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}} diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py index 0bbaba2e..9c542fc6 100644 --- a/python/tests/import_datacite.py +++ b/python/tests/import_datacite.py @@ -1,25 +1,99 @@ """ Test datacite importer. +""" -Datacite is a aggregator, hence inputs are quite varied. +import datetime +import pytest +import gzip +from fatcat_tools.importers import DataciteImporter, JsonLinePusher +from fixtures import api +import json -Here is small sample of ID types taken from a sample: - 497344 "DOI" - 65013 "URL" - 22210 "CCDC" - 17853 "GBIF" - 17635 "Other" - 11474 "uri" - 9170 "Publisher ID" - 7775 "URN" - 6196 "DUCHAS" - 5624 "Handle" - 5056 "publisherId" +@pytest.fixture(scope="function") +def datacite_importer(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=True) -A nice tool, not yet existing tool (maybe named indigo) would do the following: +@pytest.fixture(scope="function") +def datacite_importer_existing(api): + with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file: + yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3', + bezerk_mode=False) - $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md -TODO(martin): Write tests.
-""" +@pytest.mark.skip(reason="larger datacite import slows tests down") +def test_datacite_importer_huge(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 998 + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident) + assert len(release.contribs) == 3 + + +def test_datacite_importer(datacite_importer): + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = True + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 1 + assert counts['exists'] == 0 + assert counts['skip'] == 0 + + # fetch most recent editgroup + change = datacite_importer.api.get_changelog_entry(index=last_index+1) + eg = change.editgroup + assert eg.description + assert "datacite" in eg.description.lower() + assert eg.extra['git_rev'] + assert "fatcat_tools.DataciteImporter" in eg.extra['agent'] + + last_index = datacite_importer.api.get_changelog(limit=1)[0].index + with open('tests/files/datacite_sample.jsonl', 'r') as f: + datacite_importer.bezerk_mode = False + datacite_importer.reset() + counts = JsonLinePusher(datacite_importer, f).run() + assert counts['insert'] == 0 + assert counts['exists'] == 1 + assert counts['skip'] == 0 + assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index + +def test_datacite_dict_parse(datacite_importer): + with open('tests/files/datacite_sample.jsonl', 'r') as f: + raw = json.load(f) + r = datacite_importer.parse_record(raw) + # ensure the API server is ok with format + JsonLinePusher(datacite_importer, [json.dumps(raw)]).run() + + print(r.extra) + assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090" + assert r.publisher == "International Centre for Agricultural Research in Dry Areas" + assert r.release_type == "article" + assert r.release_stage == "published" + assert r.license_slug == None + assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 
97090" + assert r.ext_ids.doi == "10.18730/8dym9" + assert r.ext_ids.isbn13 == None + assert r.language == "enc" + assert r.subtitle == None + assert r.release_date == None + assert r.release_year == 1986 + assert 'subtitle' not in r.extra + assert 'subtitle' not in r.extra['datacite'] + assert 'funder' not in r.extra + assert 'funder' not in r.extra['datacite'] + # matched by ISSN, so shouldn't be in there + #assert extra['container_name'] == "International Journal of Quantum Chemistry" + assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9' + assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}] + assert len(r.abstracts) == 1 + assert len(r.abstracts[0].content) == 421 + assert len(r.contribs) == 1 + assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA" + assert r.contribs[0].given_name == None + assert r.contribs[0].surname == None + assert len(r.refs) == 0 -- cgit v1.2.3 From 013d873c73f374f968559b6b70d9c2575b6dc47e Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 26 Dec 2019 15:36:04 +0100 Subject: datacite: add missing --extid-map-file flag --- python/fatcat_import.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 90bb01a1..c6c74bc2 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -173,6 +173,7 @@ def run_datacite(args): bezerk_mode=args.bezerk_mode, debug=args.debug, lang_detect=args.lang_detect, + extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file) if args.kafka_mode: KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite", @@ -461,6 +462,9 @@ def main(): sub_datacite.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r')) + sub_datacite.add_argument('--extid-map-file', + help="DOI-to-other-identifiers sqlite3 database", + default=None, type=str) sub_datacite.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)") -- cgit v1.2.3 From 91bd7b82608e5e27a10c649cf8205243b8ba96c6 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 26 Dec 2019 15:37:13 +0100 Subject: datacite: use specific auth var --- python/fatcat_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index c6c74bc2..a17029cc 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -483,7 +483,7 @@ def main(): help="write inserted documents into file (for debugging)") sub_datacite.set_defaults( func=run_datacite, - auth_var="FATCAT_API_AUTH_TOKEN", + auth_var="FATCAT_AUTH_WORKER_DATACITE", ) args = parser.parse_args() -- cgit v1.2.3 From 391565cbbc0ba17ffd8c4f5d88d4dfda8a8b323c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 3 Jan 2020 13:46:05 +0100 Subject: datacite: remove --lang-detect flag Estimated time for a single call is in the order of 50ms. 
--- python/fatcat_import.py | 4 ---- python/fatcat_tools/importers/datacite.py | 17 ++++++----------- python/tests/files/datacite/datacite_result_04.json | 5 +++-- python/tests/files/datacite/datacite_result_05.json | 5 +++-- python/tests/files/datacite/datacite_result_07.json | 5 +++-- python/tests/files/datacite/datacite_result_08.json | 5 +++-- python/tests/files/datacite/datacite_result_14.json | 5 +++-- 7 files changed, 21 insertions(+), 25 deletions(-) (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index a17029cc..6b04d547 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -172,7 +172,6 @@ def run_datacite(args): edit_batch_size=args.batch_size, bezerk_mode=args.bezerk_mode, debug=args.debug, - lang_detect=args.lang_detect, extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file) if args.kafka_mode: @@ -474,9 +473,6 @@ def main(): sub_datacite.add_argument('--debug', action='store_true', help="write converted JSON to stdout") - sub_datacite.add_argument('--lang-detect', - action='store_true', - help="try to detect language (slow)") sub_datacite.add_argument('--insert-log-file', default='', type=str, diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index bd135569..8034a5c1 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -196,7 +196,6 @@ class DataciteImporter(EntityImporter): api, issn_map_file, debug=False, - lang_detect=False, insert_log_file=None, **kwargs): @@ -225,12 +224,9 @@ class DataciteImporter(EntityImporter): self.read_issn_map_file(issn_map_file) self.debug = debug - self.lang_detect = lang_detect self.insert_log_file = insert_log_file - print('datacite with debug={}, lang_detect={}'.format( - self.debug, self.lang_detect), - file=sys.stderr) + print('datacite with debug={}'.format(self.debug), file=sys.stderr) def lookup_ext_ids(self, doi): """ @@ -537,12 +533,11 @@ class DataciteImporter(EntityImporter): if len(text) > MAX_ABSTRACT_LENGTH: text = text[:MAX_ABSTRACT_LENGTH] + " [...]" lang = None - if self.lang_detect: - try: - lang = langdetect.detect(text) - except langdetect.lang_detect_exception.LangDetectException as err: - print('[{}] language detection failed: {}'.format(doi, err), - file=sys.stderr) + try: + lang = langdetect.detect(text) + except langdetect.lang_detect_exception.LangDetectException as err: + print('[{}] language detection failed: {}'.format(doi, err), + file=sys.stderr) abstracts.append( fatcat_openapi_client.ReleaseAbstract( mimetype="text/plain", diff --git a/python/tests/files/datacite/datacite_result_04.json b/python/tests/files/datacite/datacite_result_04.json index 54b19ef9..94fa1f94 100644 --- a/python/tests/files/datacite/datacite_result_04.json +++ b/python/tests/files/datacite/datacite_result_04.json @@ -22,7 +22,8 @@ "abstracts": [ { "content": "Let A be an abelian category, I the full subcategory of A consisting of injective objects of A, and K(A) the category whose objects are cochain complexes of elements of A, and whose morphisms are homotopy classes of cochain maps. In (5), lemma 4.6., p. 42, R. Hartshorne has proved that, under certain conditions, a cochain complex X\u02d9 \u03b5. |KA)| can be embedded in a complex I\u02d9 \u03b5. |K(I)| in such a way that I\u02d9 has the same cohomology as X\u02d9. In Chapter I we show that the construction given in the two first parts of Hartshorne's Lemma is natural i.e. 
there exists a functor J : K(A) \u2192 K(I) and a natural transformation [formula omitted] (where E : K(I) \u2192 K(A) is the embedding functor) such that [formula omitted] is injective and induces isomorphism in cohomology. The question whether the construction given in the third part of the lemma is functorial is still open. We also prove that J is left adjoint to E, so that K(I) is a reflective subcategory of K(A). In the special case where A is a category [formula omitted] of left A-modules, and [formula omitted] the category of cochain complexes in [formula omitted] and cochain maps (not homotopy classes), we prove the existence of a functor [formula omitted] In Chapter II we study the natural homomorphism [formula omitted] where A, B are rings, and M, L, N modules or chain complexes. In particular we give several sufficient conditions under which v is an isomorphism, or induces isomorphism in homology. In the appendix we give a detailed proof of Hartshorne's Lemma. We think that this is useful, as no complete proof is, to our knowledge, to be found in the literature.", - "mimetype": "text/plain" + "mimetype": "text/plain", + "lang": "en" } ] -} \ No newline at end of file +} diff --git a/python/tests/files/datacite/datacite_result_05.json b/python/tests/files/datacite/datacite_result_05.json index a790c26e..ff998c0f 100644 --- a/python/tests/files/datacite/datacite_result_05.json +++ b/python/tests/files/datacite/datacite_result_05.json @@ -524,7 +524,8 @@ "abstracts": [ { "content": "UNITE provides a unified way for delimiting, identifying, communicating, and working with DNA-based Species Hypotheses (SH). All fungal ITS sequences in the international nucleotide sequence databases are clustered to approximately the species level by applying a set of dynamic distance values (<0.5 - 3.0%). All species hypotheses are given a unique, stable name in the form of a DOI, and their taxonomic and ecological annotations are verified through distributed, web-based third-party annotation efforts. SHs are connected to a taxon name and its classification as far as possible (phylum, class, order, etc.) by taking into account identifications for all sequences in the SH. An automatically or manually designated sequence is chosen to represent each such SH. These sequences are released (https://unite.ut.ee/repository.php) for use by the scientific community in, for example, local sequence similarity searches and next-generation sequencing analysis pipelines. The system and the data are updated automatically as the number of public fungal ITS sequences grows.", - "mimetype": "text/plain" + "mimetype": "text/plain", + "lang": "en" } ] -} \ No newline at end of file +} diff --git a/python/tests/files/datacite/datacite_result_07.json b/python/tests/files/datacite/datacite_result_07.json index f572263c..f694ddef 100644 --- a/python/tests/files/datacite/datacite_result_07.json +++ b/python/tests/files/datacite/datacite_result_07.json @@ -67,7 +67,8 @@ "abstracts": [ { "content": "The purpose of the ISEC concept is to provide a high-efficient heat pump system for hot water production. The ISEC concept uses two storage tanks for the water, one discharged and one charged. Hot water for the industrial process is tapped from the charged tank, while the other tank is charging. Charging is done by circulating the water in the tank through the condenser of a heat pump several times and thereby gradually heating the water. 
The charging is done with a higher mass flow rate than the discharging to reach several circulations of the water during the time frame of one discharging. This result in a lower condensing temperature than if the water was heated in one step. Two test setups were built, one to test the performance of the heat pump gradually heating the water and one to investigate the stratification in the storage tanks. Furthermore, a dynamic model of the system was implemented in Dymola, and validated by the use of test data from the two experimental setups. This paper shows that there is a good consistency between the model and the experimental tests.", - "mimetype": "text/plain" + "mimetype": "text/plain", + "lang": "en" } ] -} \ No newline at end of file +} diff --git a/python/tests/files/datacite/datacite_result_08.json b/python/tests/files/datacite/datacite_result_08.json index 581ca1eb..cc0e968b 100644 --- a/python/tests/files/datacite/datacite_result_08.json +++ b/python/tests/files/datacite/datacite_result_08.json @@ -47,7 +47,8 @@ "abstracts": [ { "content": "International society recognizes that the scarcity of fresh water is increasing and farming sectors suffer from lack of irrigation water. However, if we look at this issue with a framework of relative factor endowment, a different view will arise. In emerging states with rapid industrialization and labor migration, labor scarcity increases at a faster pace than that of irrigation water. Using the historical review of Japan\u2019s irrigation policies as well as the case studies of India and China, this paper shows that the introduction of policies which do not reflect the actual relative resource scarcity may mislead the development path. We argue that under increasing relative labor scarcity it is important to realize the substitution of capital for labor for surface irrigation system management and that the substitution needs public support because the service of surface irrigation system has some externalities. Through this argument, this paper also intends to shed the light back to the role of the state for local resource management which seems to be unfairly undervalued since the boom of community participatory approach in the 1980s.", - "mimetype": "text/plain" + "mimetype": "text/plain", + "lang": "en" } ] -} \ No newline at end of file +} diff --git a/python/tests/files/datacite/datacite_result_14.json b/python/tests/files/datacite/datacite_result_14.json index 94ad000a..4521f891 100644 --- a/python/tests/files/datacite/datacite_result_14.json +++ b/python/tests/files/datacite/datacite_result_14.json @@ -104,7 +104,8 @@ "abstracts": [ { "content": "An entry from the Cambridge Structural Database, the world\u2019s repository for small molecule crystal structures. The entry contains experimental data from a crystal diffraction study. 
The deposited dataset for this entry is freely available from the CCDC and typically includes 3D coordinates, cell parameters, space group, experimental conditions and quality measures.", - "mimetype": "text/plain" + "mimetype": "text/plain", + "lang": "en" } ] -} \ No newline at end of file +} -- cgit v1.2.3 From d3a1382795d14ac77165fa6eb39e893b03b97215 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 7 Jan 2020 01:57:47 +0100 Subject: datacite: fix typos --- python/fatcat_import.py | 2 +- python/fatcat_tools/importers/datacite.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'python/fatcat_import.py') diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 6b04d547..ea7e12f2 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -175,7 +175,7 @@ def run_datacite(args): extid_map_file=args.extid_map_file, insert_log_file=args.insert_log_file) if args.kafka_mode: - KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite", + KafkaJsonPusher(dci, args.kafka_hosts, args.kafka_env, "api-datacite", "fatcat-import", consume_batch_size=args.batch_size).run() else: JsonLinePusher(dci, args.json_file).run() diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index 936b6f1b..53f46bb4 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -699,7 +699,7 @@ class DataciteImporter(EntityImporter): if self.insert_log_file: with open(self.insert_log_file, 'a') as f: for doc in batch: - json.dump(entity_to_dict(re, api_client=None), f) + json.dump(entity_to_dict(doc, api_client=None), f) f.write('\n') self.api.create_release_auto_batch( fatcat_openapi_client.ReleaseAutoBatch( -- cgit v1.2.3
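For local testing of the --extid-map-file option added above, a compatible sqlite3 file can be sketched as follows. Table and column names are inferred from the SELECT in DataciteImporter.lookup_ext_ids; the production mapping file may use a different schema, and the inserted row is made up:

```python
import sqlite3

# Minimal DOI-to-other-identifiers map matching the importer's query:
# "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1"
db = sqlite3.connect('tests/files/example_map.sqlite3')
db.execute('CREATE TABLE IF NOT EXISTS ids '
           '(doi TEXT PRIMARY KEY, core TEXT, pmid TEXT, pmcid TEXT, wikidata TEXT)')
# lookup_ext_ids queries with doi.lower(), so store DOIs lowercased.
db.execute('INSERT OR REPLACE INTO ids (doi, core, pmid, pmcid, wikidata) '
           'VALUES (?, ?, ?, ?, ?)',
           ('10.123/abc', '12345', '678901', 'PMC678901', 'Q42'))  # hypothetical values
db.commit()
db.close()
```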