5 files changed, 245 insertions, 59 deletions
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index d7651792..90bb01a1 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -170,7 +170,10 @@ def run_datacite(args):
     dci = DataciteImporter(args.api,
         args.issn_map_file,
         edit_batch_size=args.batch_size,
-        bezerk_mode=args.bezerk_mode)
+        bezerk_mode=args.bezerk_mode,
+        debug=args.debug,
+        lang_detect=args.lang_detect,
+        insert_log_file=args.insert_log_file)
     if args.kafka_mode:
         KafkaJsonPusher(fci, args.kafka_hosts, args.kafka_env, "api-datacite",
             "fatcat-import", consume_batch_size=args.batch_size).run()
@@ -464,6 +467,16 @@ def main():
     sub_datacite.add_argument('--bezerk-mode',
         action='store_true',
         help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
+    sub_datacite.add_argument('--debug',
+        action='store_true',
+        help="write converted JSON to stdout")
+    sub_datacite.add_argument('--lang-detect',
+        action='store_true',
+        help="try to detect language (slow)")
+    sub_datacite.add_argument('--insert-log-file',
+        default='',
+        type=str,
+        help="write inserted documents into file (for debugging)")
     sub_datacite.set_defaults(
         func=run_datacite,
         auth_var="FATCAT_API_AUTH_TOKEN",
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 4e117dde..9774e334 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -6,13 +6,14 @@ Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 
 from .common import EntityImporter
 import dateparser
-import langcodes
 import datetime
-import langdetect
 import fatcat_openapi_client
+import hashlib
 import json
+import langcodes
+import langdetect
+import sqlite3
 import sys
-import hashlib
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -147,10 +148,11 @@ LICENSE_SLUG_MAP = {
 
 class DataciteImporter(EntityImporter):
     """
-    Importer for datacite records. TODO(martin): Do we need issn_map_file?
+    Importer for datacite records.
     """
 
-    def __init__(self, api, issn_map_file, **kwargs):
+    def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
+                 insert_log_file=None, **kwargs):
 
         eg_desc = kwargs.get('editgroup_description',
             "Automated import of Datacite DOI metadata, harvested from REST API")
@@ -163,7 +165,42 @@ class DataciteImporter(EntityImporter):
             **kwargs)
 
         self.create_containers = kwargs.get('create_containers', True)
+        extid_map_file = kwargs.get('extid_map_file')
+        self.extid_map_db = None
+        if extid_map_file:
+            db_uri = "file:{}?mode=ro".format(extid_map_file)
+            print("Using external ID map: {}".format(db_uri), file=sys.stderr)
+            self.extid_map_db = sqlite3.connect(db_uri, uri=True)
+        else:
+            print("Not using external ID map", file=sys.stderr)
+
         self.read_issn_map_file(issn_map_file)
+        self.debug = debug
+        self.lang_detect = lang_detect
+        self.insert_log_file = insert_log_file
+
+        print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+
+    def lookup_ext_ids(self, doi):
+        """
+        Return dictionary of identifiers refering to the same things as the given DOI.
+        """
+        if self.extid_map_db is None:
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            [doi.lower()]).fetchone()
+        if row is None:
+            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+        row = [str(cell or '') or None for cell in row]
+        return dict(
+            core_id=row[0],
+            pmid=row[1],
+            pmcid=row[2],
+            wikidata_qid=row[3],
+            # TODO:
+            arxiv_id=None,
+            jstor_id=None,
+        )
 
     def parse_record(self, obj):
         """
@@ -174,14 +211,14 @@ class DataciteImporter(EntityImporter):
 
         attributes = obj['attributes']
 
-        # Contributors. Many nameIdentifierSchemes, we do not use yet:
-        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
-        # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
-        # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
+        # Contributors. Many nameIdentifierSchemes, we do not use (yet):
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme":
+        # ["LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID",
+        # "SCOPUS", "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID"].
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
-            if not c.get('nameType') == 'Personal':
+            if 'nameType' in c and not c.get('nameType') == 'Personal':
                 continue
             creator_id = None
             for nid in c.get('nameIdentifiers', []):
@@ -191,7 +228,7 @@ class DataciteImporter(EntityImporter):
                 if not orcid:
                     continue
                 creator_id = self.lookup_orcid(orcid)
-                # If creator_id is None, should we create creators?
+                # TODO(martin): If creator_id is None, should we create creators?
             contribs.append(fatcat_openapi_client.ReleaseContrib(
                 creator_id=creator_id,
                 index=i,
@@ -204,11 +241,27 @@ class DataciteImporter(EntityImporter):
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
         title, subtitle = None, None
 
-        for entry in attributes.get('titles', []):
-            if not title and 'titleType' not in entry:
-                title = entry.get('title').strip()
-            if entry.get('titleType') == 'Subtitle':
-                subtitle = entry.get('title').strip()
+        titles = attributes.get('titles', []) or []
+        if len(titles) == 0:
+            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+            return False
+        elif len(titles) == 1:
+            # We do not care about the type then.
+            title = titles[0].get('title', '') or ''
+            title = title.strip()
+        else:
+            for entry in titles:
+                if not title and ('titleType' not in entry or not entry.get('titleType')):
+                    title = entry.get('title').strip()
+                if entry.get('titleType') == 'Subtitle':
+                    subtitle = entry.get('title', '').strip()
+
+        if not title:
+            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
+            return False
+
+        if not subtitle:
+            subtitle = None
 
         # Dates. A few internal dates (registered, created, updated) and
         # published (0..2554). We try to work with typed date list, in
@@ -217,14 +270,13 @@ class DataciteImporter(EntityImporter):
         # "Updated", "Valid".
         release_year, release_date = None, None
 
+        # Ignore: Collected, Issued.
         date_type_prio = (
             'Valid',
-            'Issued',
             'Available',
             'Accepted',
             'Submitted',
             'Copyrighted',
-            'Collected',
             'Created',
             'Updated',
         )
@@ -233,15 +285,36 @@ class DataciteImporter(EntityImporter):
             for item in dates:
                 if not item.get('dateType') == prio:
                     continue
-                try:
-                    result = dateparser.parse(item.get('date'))
-                except TypeError as err:
-                    print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr)
-                    continue
+
+                # Parse out date, use common patterns first, fallback to dateparser.
+                result, value, year_only = None, item.get('date', ''), False
+
+                # Before using (expensive) dateparser, try a few common patterns.
+                common_patterns = ('%Y-%m-%d', '%Y', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S')
+
+                for pattern in common_patterns:
+                    try:
+                        result = datetime.datetime.strptime(value, pattern)
+                    except ValueError:
+                        continue
+                    else:
+                        if pattern == '%Y':
+                            year_only = True
+                        break
+
+                if result is None:
+                    print('fallback for {}'.format(value), file=sys.stderr)
+                    try:
+                        result = dateparser.parse(value)
+                    except TypeError as err:
+                        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
+                        continue
+
                 if result is None:
                     # Unparsable date.
                     continue
-                release_date = result
+                if not year_only:
+                    release_date = result.date()
                 release_year = result.year
                 if 1000 < release_year < datetime.date.today().year + 5:
                     # Skip possibly bogus dates.
@@ -280,10 +353,16 @@ class DataciteImporter(EntityImporter):
                     container_id = self.lookup_issnl(issnl)
 
                     if container_id is None and container.get('title'):
+                        container_title = container.get('title')
+                        if isinstance(container_title, list):
+                            if len(container_title) > 0:
+                                print('too many container titles: {}'.format(len(container_title)))
+                                container_title = container_title[0]
+                        assert isinstance(container_title, str)
                         ce = fatcat_openapi_client.ContainerEntity(
                             issnl=issnl,
                             container_type=container_type,
-                            name=container.get('title'),
+                            name=container_title,
                         )
                         ce_edit = self.create_container(ce)
                         container_id = ce_edit.ident
@@ -326,12 +405,12 @@ class DataciteImporter(EntityImporter):
         # closest, but not always supplied.
         for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
             value = attributes.get('types', {}).get(typeType)
-            release_type = DATACITE_TYPE_MAP.get(value)
+            release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
                 break
 
         if release_type is None:
-            print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
+            print("no mapped type: {}".format(value), file=sys.stderr)
 
         # Language values are varied ("ger", "es", "English", "ENG", "en-us",
         # "other", ...). Try to crush it with langcodes: "It may sound to you
@@ -347,7 +426,7 @@ class DataciteImporter(EntityImporter):
             try:
                 language = langcodes.get(value).language
             except langcodes.tag_parser.LanguageTagError:
-                print('could not determine language: {}'.format(value), file=sys.stderr)
+                pass
 
         # Abstracts appear in "attributes.descriptions[].descriptionType", some
         # of the observed values: "Methods", "TechnicalInfo",
@@ -355,8 +434,8 @@ class DataciteImporter(EntityImporter):
         # "Other" fields might contain references or related articles (with
         # DOI). TODO(martin): maybe try to parse out some of those refs.
         abstracts = []
-
-        for desc in attributes.get('descriptions', []):
+        descs = attributes.get('descriptions', []) or []
+        for desc in descs:
             if not desc.get('descriptionType') == 'Abstract':
                 continue
             if len(desc.get('description', '')) < 10:
@@ -364,10 +443,11 @@ class DataciteImporter(EntityImporter):
             text = desc.get('description')
             sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
             lang = None
-            try:
-                lang = langdetect.detect(text)
-            except langdetect.lang_detect_exception.LangDetectException:
-                pass
+            if self.lang_detect:
+                try:
+                    lang = langdetect.detect(text)
+                except langdetect.lang_detect_exception.LangDetectException as err:
+                    print('language detection failed: {}'.format(err), file=sys.stderr)
             abstracts.append(fatcat_openapi_client.ReleaseAbstract(
                 mimetype="text/plain",
                 content=text,
@@ -386,7 +466,8 @@ class DataciteImporter(EntityImporter):
         # For the moment, we only care about References.
         refs, ref_index = [], 0
 
-        for rel in attributes.get('relatedIdentifiers', []):
+        relIds = attributes.get('relatedIdentifiers', []) or []
+        for rel in relIds:
             if not rel.get('relationType') == 'References':
                 continue
             ref_extra = dict()
@@ -422,6 +503,9 @@ class DataciteImporter(EntityImporter):
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
+        doi = attributes.get('doi', '').lower()
+        extids = self.lookup_ext_ids(doi=doi)
+
         # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
@@ -435,7 +519,13 @@ class DataciteImporter(EntityImporter):
             release_date=release_date,
             publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=attributes.get('doi'),
+                doi=doi,
+                pmid=extids['pmid'],
+                pmcid=extids['pmcid'],
+                wikidata_qid=extids['wikidata_qid'],
+                core=extids['core_id'],
+                arxiv=extids['arxiv_id'],
+                jstor=extids['jstor_id'],
             ),
             contribs=contribs,
             volume=volume,
@@ -449,11 +539,12 @@ class DataciteImporter(EntityImporter):
         )
         return re
 
-    def try_update(self, re, debug=True):
+    def try_update(self, re):
         """
-        When debug is true, write the RE to stdout.
+        When debug is true, write the RE to stdout, not to the database. Might
+        hide schema mismatch bugs.
         """
-        if debug is True:
+        if self.debug is True:
             print(json.dumps(re.to_dict(), default=extended_json_encoder))
             return False
 
@@ -476,10 +567,16 @@ class DataciteImporter(EntityImporter):
         return True
 
     def insert_batch(self, batch):
+        print('inserting batch ({})'.format(len(batch)), file=sys.stderr)
+        if self.insert_log_file:
+            with open(self.insert_log_file, 'a') as f:
+                for doc in batch:
+                    json.dump(doc.to_dict(), f, default=extended_json_encoder)
+                    f.write('\n')
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
             editgroup=fatcat_openapi_client.Editgroup(
-                description=self.editgroup_description,
-                extra=self.editgroup_extra),
+            description=self.editgroup_description,
+            extra=self.editgroup_extra),
             entity_list=batch))
 
 def extended_json_encoder(value):
@@ -491,6 +588,7 @@ def extended_json_encoder(value):
         return value.isoformat()
     if isinstance(value, set):
         return list(value)
+    raise TypeError('cannot encode type: {}'.format(type(value)))
 
 def lookup_license_slug(raw):
     """
diff --git a/python/tests/files/datacite_1k_records.jsonl.gz b/python/tests/files/datacite_1k_records.jsonl.gz
new file mode 100644
index 00000000..28ea6e37
--- /dev/null
+++ b/python/tests/files/datacite_1k_records.jsonl.gz
diff --git a/python/tests/files/datacite_sample.jsonl b/python/tests/files/datacite_sample.jsonl
new file mode 100644
index 00000000..dba3e267
--- /dev/null
+++ b/python/tests/files/datacite_sample.jsonl
@@ -0,0 +1 @@
+{"id":"10.18730/8dym9","type":"dois","attributes":{"doi":"10.18730/8dym9","identifiers":[{"identifier":"https://doi.org/10.18730/8dym9","identifierType":"DOI"},{"identifier":"ICDW 20791","identifierType":"Other"}],"creators":[{"name":"GLIS Of The ITPGRFA","affiliation":[]}],"titles":[{"title":"Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"}],"publisher":"International Centre for Agricultural Research in Dry Areas","container":{},"publicationYear":2017,"subjects":[{"subject":"Plant Genetic Resource for Food and Agriculture"}],"contributors":[{"name":"International Centre For Agricultural Research In Dry Areas","affiliation":[]}],"dates":[{"date":"1986","dateType":"Accepted"},{"date":"1978-06-03","dateType":"Collected"},{"date":"2017","dateType":"Issued"}],"language":"en","types":{"ris":"GEN","bibtex":"misc","citeproc":"article","schemaOrg":"CreativeWork","resourceType":"PGRFA Material","resourceTypeGeneral":"PhysicalObject"},"relatedIdentifiers":[{"schemeUri":"http://www.fao.org/plant-treaty/areas-of-work/global-information-system/descriptors","schemeType":"XML","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"GLIS Descriptors"},{"schemeUri":"http://rs.tdwg.org/dwc/terms/guides/text/index.htm","schemeType":"DwC-A","relationType":"HasMetadata","relatedIdentifier":"https://ssl.fao.org/glisapi/v1/pgrfas?_format=dwc&doi=10.18730/8DYM9","relatedIdentifierType":"URL","relatedMetadataScheme":"Darwin Core Archive"}],"sizes":[],"formats":[],"version":null,"rightsList":[],"descriptions":[{"description":"Plant Genetic Resource.<br>Taxonomy: Triticum turgidum L. subsp. durum (Desf.) Husn.<br>Common name(s): Wheat<br>Conserved by: International Centre for Agricultural Research in Dry Areas (ICARDA), Lebanon<br>Local sample unique identifier: 97090<br>Method of creation: Acquisition<br>Date: 1986<br>Biological status: Traditional cultivar/landrace<br>Other identifiers: ICDW 20791<br>MLS status: Included<br>Historical: No","descriptionType":"Abstract"}],"geoLocations":[{"geoLocationPlace":"Collecting site","geoLocationPoint":{"pointLatitude":"35.5","pointLongitude":"23.7333"}}],"fundingReferences":[],"url":"https://ssl.fao.org/glis/doi/10.18730/8DYM9","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"mds","isActive":true,"state":"findable","reason":null,"created":"2017-11-11T12:26:01.000Z","registered":"2017-11-11T12:26:02.000Z","published":"2017","updated":"2019-08-02T16:34:56.000Z"},"relationships":{"client":{"data":{"id":"fao.itpgrfa","type":"clients"}}}}
diff --git a/python/tests/import_datacite.py b/python/tests/import_datacite.py
index 0bbaba2e..9c542fc6 100644
--- a/python/tests/import_datacite.py
+++ b/python/tests/import_datacite.py
@@ -1,25 +1,99 @@
 """
 Test datacite importer.
+"""
 
-Datacite is a aggregator, hence inputs are quite varied.
+import datetime
+import pytest
+import gzip
+from fatcat_tools.importers import DataciteImporter, JsonLinePusher
+from fixtures import api
+import json
 
-Here is small sample of ID types taken from a sample:
 
-    497344 "DOI"
-     65013 "URL"
-     22210 "CCDC"
-     17853 "GBIF"
-     17635 "Other"
-     11474 "uri"
-      9170 "Publisher ID"
-      7775 "URN"
-      6196 "DUCHAS"
-      5624 "Handle"
-      5056 "publisherId"
+@pytest.fixture(scope="function")
+def datacite_importer(api):
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
+                               bezerk_mode=True)
 
-A nice tool, not yet existing tool (maybe named indigo) would do the following:
+@pytest.fixture(scope="function")
+def datacite_importer_existing(api):
+    with open('tests/files/ISSN-to-ISSN-L.snip.txt', 'r') as issn_file:
+        yield DataciteImporter(api, issn_file, extid_map_file='tests/files/example_map.sqlite3',
+                               bezerk_mode=False)
 
-    $ shuf -n 100000 datacite.ndjson | indigo -t md > data.md
 
-TODO(martin): Write tests.
-"""
+@pytest.mark.skip(reason="larger datacite import slows tests down")
+def test_datacite_importer_huge(datacite_importer):
+    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+    with gzip.open('tests/files/datacite_1k_records.jsonl.gz', 'rt') as f:
+        datacite_importer.bezerk_mode = True
+        counts = JsonLinePusher(datacite_importer, f).run()
+    assert counts['insert'] == 998
+    change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+    release = datacite_importer.api.get_release(change.editgroup.edits.releases[0].ident)
+    assert len(release.contribs) == 3
+
+
+def test_datacite_importer(datacite_importer):
+    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+        datacite_importer.bezerk_mode = True
+        counts = JsonLinePusher(datacite_importer, f).run()
+    assert counts['insert'] == 1
+    assert counts['exists'] == 0
+    assert counts['skip'] == 0
+
+    # fetch most recent editgroup
+    change = datacite_importer.api.get_changelog_entry(index=last_index+1)
+    eg = change.editgroup
+    assert eg.description
+    assert "datacite" in eg.description.lower()
+    assert eg.extra['git_rev']
+    assert "fatcat_tools.DataciteImporter" in eg.extra['agent']
+
+    last_index = datacite_importer.api.get_changelog(limit=1)[0].index
+    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+        datacite_importer.bezerk_mode = False
+        datacite_importer.reset()
+        counts = JsonLinePusher(datacite_importer, f).run()
+    assert counts['insert'] == 0
+    assert counts['exists'] == 1
+    assert counts['skip'] == 0
+    assert last_index == datacite_importer.api.get_changelog(limit=1)[0].index
+
+def test_datacite_dict_parse(datacite_importer):
+    with open('tests/files/datacite_sample.jsonl', 'r') as f:
+        raw = json.load(f)
+        r = datacite_importer.parse_record(raw)
+        # ensure the API server is ok with format
+        JsonLinePusher(datacite_importer, [json.dumps(raw)]).run()
+
+        print(r.extra)
+        assert r.title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+        assert r.publisher == "International Centre for Agricultural Research in Dry Areas"
+        assert r.release_type == "article"
+        assert r.release_stage == "published"
+        assert r.license_slug == None
+        assert r.original_title == "Triticum turgidum L. subsp. durum (Desf.) Husn. 97090"
+        assert r.ext_ids.doi == "10.18730/8dym9"
+        assert r.ext_ids.isbn13 == None
+        assert r.language == "enc"
+        assert r.subtitle == None
+        assert r.release_date == None
+        assert r.release_year == 1986
+        assert 'subtitle' not in r.extra
+        assert 'subtitle' not in r.extra['datacite']
+        assert 'funder' not in r.extra
+        assert 'funder' not in r.extra['datacite']
+        # matched by ISSN, so shouldn't be in there
+        #assert extra['container_name'] == "International Journal of Quantum Chemistry"
+        assert r.extra['datacite']['url'] == 'https://ssl.fao.org/glis/doi/10.18730/8DYM9'
+        assert r.extra['datacite']['subjects'] == [{'subject': 'Plant Genetic Resource for Food and Agriculture'}]
+        assert len(r.abstracts) == 1
+        assert len(r.abstracts[0].content) == 421
+        assert len(r.contribs) == 1
+        assert r.contribs[0].raw_name == "GLIS Of The ITPGRFA"
+        assert r.contribs[0].given_name == None
+        assert r.contribs[0].surname == None
+        assert len(r.refs) == 0