address first round of MR14 comments

* add missing langdetect
* use entity_to_dict for json debug output
* factor out code for fields in function and add table driven tests
* update citeproc types
* add author as default role
* add raw_affiliation
* include relations from datacite
* remove url (covered by doi already)

Using yapf for python formatting.
author: Martin Czygan <martin.czygan@gmail.com> 2019-12-21 23:30:56 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2019-12-28 23:07:31 +0100
commit: a196435a0e88f85785742cdd089344f97401b43a (patch)
tree: 056dceaa4ccd567096b2d3e789efdd573682a8c3 /python/fatcat_tools/importers
parent: 52eabd48658a676ac4577d1c8da31df1fe58093e (diff)
download: fatcat-a196435a0e88f85785742cdd089344f97401b43a.tar.gz
fatcat-a196435a0e88f85785742cdd089344f97401b43a.zip
1 files changed, 319 insertions, 148 deletions
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 77ce1012..19b89edf 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -14,6 +14,7 @@ import langcodes
 import langdetect
 import sqlite3
 import sys
+from fatcat_tools.transforms import entity_to_dict
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -55,16 +56,42 @@ DATACITE_TYPE_MAP = {
         'Thesis': 'thesis',
     },
     'citeproc': {
-        'dataset': 'dataset',
-        'chapter': 'chapter',
-        'article-journal': 'article-journal',
-        'song': 'song',
         'article': 'article',
-        'report': 'report',
+        'article-journal': 'article-journal',
+        'article-magazine': 'article-magazine',
+        'article-newspaper': 'article-newspaper',
+        'bill': 'bill',
+        'book': 'book',
+        'broadcast': 'broadcast',
+        'chapter': 'chapter',
+        'dataset': 'dataset',
+        'entry-dictionary': 'entry-dictionary',
+        'entry-encyclopedia': 'entry-encyclopedia',
+        'entry': 'entry',
+        'figure': 'figure',
         'graphic': 'graphic',
+        'interview': 'interview',
+        'legal_case': 'legal_case',
+        'legislation': 'legislation',
+        'manuscript': 'manuscript',
+        'map': 'map',
+        'motion_picture': 'motion_picture',
+        'musical_score': 'musical_score',
+        'pamphlet': 'pamphlet',
+        'paper-conference': 'paper-conference',
+        'patent': 'patent',
+        'personal_communication': 'personal_communication',
+        'post': 'post',
+        'post-weblog': 'post-weblog',
+        'report': 'report',
+        'review-book': 'review-book',
+        'review': 'review',
+        'song': 'song',
+        'speech': 'speech',
         'thesis': 'thesis',
-        'book': 'book',
-    },
+        'treaty': 'treaty',
+        'webpage': 'webpage',
+    },  # https://docs.citationstyles.org/en/master/specification.html#appendix-iii-types
     'bibtex': {
         'phdthesis': 'thesis',
         'inbook': 'chapter',
@@ -88,7 +115,6 @@ DATACITE_TYPE_MAP = {
     }
 }
 
-
 # TODO(martin): merge this with other maps, maybe.
 LICENSE_SLUG_MAP = {
     "//creativecommons.org/licenses/by/2.0/": "CC-BY",
@@ -124,7 +150,8 @@ LICENSE_SLUG_MAP = {
     "//www.karger.com/Services/SiteLicenses": "KARGER",
     "//www.opensource.org/licenses/Apache-2.0": "Apache-2.0",
     "//www.opensource.org/licenses/BSD-3-Clause": "BSD-3-Clause",
-    "//www.opensource.org/licenses/EUPL-1.1": "EUPL-1.1", # redirects to EUPL-1.2
+    "//www.opensource.org/licenses/EUPL-1.1":
+    "EUPL-1.1",  # redirects to EUPL-1.2
     "//www.opensource.org/licenses/MIT": "MIT",
     # "http://royalsocietypublishing.org/licence": "", # OA and "normal", https://royalsociety.org/journals/authors/licence-to-publish/
     # "http://rsc.li/journals-terms-of-use": "RSC",
@@ -146,23 +173,31 @@ LICENSE_SLUG_MAP = {
     # Note: Some URLs pointing to licensing terms are not in WB yet (but would be nice).
 }
 
+
 class DataciteImporter(EntityImporter):
     """
     Importer for datacite records.
     """
-
-    def __init__(self, api, issn_map_file, debug=False, lang_detect=False,
-                 insert_log_file=None, **kwargs):
-
-        eg_desc = kwargs.get('editgroup_description',
-            "Automated import of Datacite DOI metadata, harvested from REST API")
+    def __init__(self,
+                 api,
+                 issn_map_file,
+                 debug=False,
+                 lang_detect=False,
+                 insert_log_file=None,
+                 **kwargs):
+
+        eg_desc = kwargs.get(
+            'editgroup_description',
+            "Automated import of Datacite DOI metadata, harvested from REST API"
+        )
         eg_extra = kwargs.get('editgroup_extra', dict())
-        eg_extra['agent'] = eg_extra.get('agent', 'fatcat_tools.DataciteImporter')
+        eg_extra['agent'] = eg_extra.get('agent',
+                                         'fatcat_tools.DataciteImporter')
         super().__init__(api,
-            issn_map_file=issn_map_file,
-            editgroup_description=eg_desc,
-            editgroup_extra=eg_extra,
-            **kwargs)
+                         issn_map_file=issn_map_file,
+                         editgroup_description=eg_desc,
+                         editgroup_extra=eg_extra,
+                         **kwargs)
 
         self.create_containers = kwargs.get('create_containers', True)
         extid_map_file = kwargs.get('extid_map_file')
@@ -179,18 +214,31 @@ class DataciteImporter(EntityImporter):
         self.lang_detect = lang_detect
         self.insert_log_file = insert_log_file
 
-        print('datacite with debug={}, lang_detect={}'.format(self.debug, self.lang_detect), file=sys.stderr)
+        print('datacite with debug={}, lang_detect={}'.format(
+            self.debug, self.lang_detect),
+              file=sys.stderr)
 
     def lookup_ext_ids(self, doi):
         """
         Return dictionary of identifiers refering to the same things as the given DOI.
         """
         if self.extid_map_db is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
-        row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
+            return dict(core_id=None,
+                        pmid=None,
+                        pmcid=None,
+                        wikidata_qid=None,
+                        arxiv_id=None,
+                        jstor_id=None)
+        row = self.extid_map_db.execute(
+            "SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1",
             [doi.lower()]).fetchone()
         if row is None:
-            return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None, arxiv_id=None, jstor_id=None)
+            return dict(core_id=None,
+                        pmid=None,
+                        pmcid=None,
+                        wikidata_qid=None,
+                        arxiv_id=None,
+                        jstor_id=None)
         row = [str(cell or '') or None for cell in row]
         return dict(
             core_id=row[0],
@@ -206,6 +254,8 @@ class DataciteImporter(EntityImporter):
         """
         Mapping datacite JSON to ReleaseEntity.
         """
+        if not obj or not isinstance(obj, dict):
+            return None
         if 'attributes' not in obj:
             return None
 
@@ -218,43 +268,54 @@ class DataciteImporter(EntityImporter):
         contribs = []
 
         for i, c in enumerate(attributes['creators']):
-            if 'nameType' in c and not c.get('nameType') == 'Personal':
-                continue
-            creator_id = None
-            for nid in c.get('nameIdentifiers', []):
-                if not nid.get('nameIdentifierScheme').lower() == "orcid":
+            nameType = c.get('nameType', '') or ''
+            if nameType == 'Personal':
+                creator_id = None
+                for nid in c.get('nameIdentifiers', []):
+                    if not nid.get('nameIdentifierScheme').lower() == "orcid":
+                        continue
+                    orcid = nid.get('nameIdentifier',
+                                    '').replace('https://orcid.org/', '')
+                    if not orcid:
+                        continue
+                    creator_id = self.lookup_orcid(orcid)
+                    # TODO(martin): If creator_id is None, should we create creators?
+
+                # If there are multiple affiliation strings, use the first one.
+                affiliations = c.get('affiliation', []) or []
+                raw_affiliation = None
+                if len(affiliations) == 0:
+                    raw_affiliation = None
+                else:
+                    raw_affiliation = affiliations[0]
+
+                contribs.append(
+                    fatcat_openapi_client.ReleaseContrib(
+                        creator_id=creator_id,
+                        index=i,
+                        raw_name=c.get('name'),
+                        given_name=c.get('givenName'),
+                        surname=c.get('familyName'),
+                        role='author',
+                        raw_affiliation=raw_affiliation,
+                    ))
+            elif nameType == 'Organizational':
+                name = c.get('name', '') or ''
+                if name == 'NN':
                     continue
-                orcid = nid.get('nameIdentifier', '').replace('https://orcid.org/', '')
-                if not orcid:
+                if len(name) < 3:
                     continue
-                creator_id = self.lookup_orcid(orcid)
-                # TODO(martin): If creator_id is None, should we create creators?
-            contribs.append(fatcat_openapi_client.ReleaseContrib(
-                creator_id=creator_id,
-                index=i,
-                raw_name=c.get('name'),
-                given_name=c.get('givenName'),
-                surname=c.get('familyName'),
-            ))
+                extra = {'organization': name}
+                contribs.append(fatcat_openapi_client.ReleaseContrib(
+                    index=i, extra=extra))
+            else:
+                print('unknown name type: {}'.format(nameType), file=sys.stderr)
 
         # Title, may come with "attributes.titles[].titleType", like
         # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
-        title, subtitle = None, None
-
         titles = attributes.get('titles', []) or []
-        if len(titles) == 0:
-            print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
-            return False
-        elif len(titles) == 1:
-            # We do not care about the type then.
-            title = titles[0].get('title', '') or ''
-            title = title.strip()
-        else:
-            for entry in titles:
-                if not title and ('titleType' not in entry or not entry.get('titleType')):
-                    title = entry.get('title').strip()
-                if entry.get('titleType') == 'Subtitle':
-                    subtitle = entry.get('title', '').strip()
+        title, original_language_title, subtitle = parse_datacite_titles(
+            titles)
 
         if not title:
             print('skipping record w/o title: {}'.format(obj), file=sys.stderr)
@@ -268,67 +329,14 @@ class DataciteImporter(EntityImporter):
         # "attributes.dates[].dateType", values: "Accepted", "Available"
         # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
         # "Updated", "Valid".
-        release_year, release_date = None, None
-
-        # Ignore: Collected, Issued.
-        date_type_prio = (
-            'Valid',
-            'Available',
-            'Accepted',
-            'Submitted',
-            'Copyrighted',
-            'Created',
-            'Updated',
-        )
-
-        # Before using (expensive) dateparser, try a few common patterns.
-        common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S', '%Y')
-
-        for prio in date_type_prio:
-            dates = attributes.get('dates', []) or [] # Never be None.
-            for item in dates:
-                if not item.get('dateType') == prio:
-                    continue
-
-                # Parse out date, use common patterns first, fallback to dateparser.
-                result, value, year_only = None, item.get('date', ''), False
-
-                for pattern in common_patterns:
-                    try:
-                        result = datetime.datetime.strptime(value, pattern)
-                    except ValueError:
-                        continue
-                    else:
-                        if pattern == '%Y':
-                            year_only = True
-                        break
-
-                if result is None:
-                    print('fallback for {}'.format(value), file=sys.stderr)
-                    try:
-                        result = dateparser.parse(value)
-                    except TypeError as err:
-                        print("{} date parsing failed with: {}".format(value, err), file=sys.stderr)
-                        continue
-
-                if result is None:
-                    # Unparsable date.
-                    continue
-                if not year_only:
-                    release_date = result.date()
-                release_year = result.year
-                if 1000 < release_year < datetime.date.today().year + 5:
-                    # Skip possibly bogus dates.
-                    continue
-                break
-            else:
-                continue
-            break
+        release_date, release_year = parse_datacite_dates(
+            attributes.get('dates', []))
 
         # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)',
+                         '(:none)'):
             publisher = None
         if publisher is not None and len(publisher) > 80:
             # Arbitrary magic value max length. TODO(martin): better heuristic,
@@ -345,7 +353,8 @@ class DataciteImporter(EntityImporter):
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
             container_type = CONTAINER_TYPE_MAP.get(container['type'])
-            if container.get('identifier') and container.get('identifierType') == 'ISSN':
+            if container.get('identifier') and container.get(
+                    'identifierType') == 'ISSN':
                 issn = container.get('identifier')
                 if len(issn) == 8:
                     issn = issn[:4] + "-" + issn[4:]
@@ -357,7 +366,8 @@ class DataciteImporter(EntityImporter):
                         container_title = container.get('title')
                         if isinstance(container_title, list):
                             if len(container_title) > 0:
-                                print('too many container titles: {}'.format(len(container_title)))
+                                print('too many container titles: {}'.format(
+                                    len(container_title)))
                                 container_title = container_title[0]
                         assert isinstance(container_title, str)
                         ce = fatcat_openapi_client.ContainerEntity(
@@ -404,7 +414,8 @@ class DataciteImporter(EntityImporter):
         # types supplied in datacite. The "attributes.types.resourceType"
         # contains too many (176 in sample) things for now; citeproc may be the
         # closest, but not always supplied.
-        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
+        for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg',
+                         'bibtex', 'ris'):
             value = attributes.get('types', {}).get(typeType)
             release_type = DATACITE_TYPE_MAP.get(typeType, {}).get(value)
             if release_type is not None:
@@ -442,19 +453,19 @@ class DataciteImporter(EntityImporter):
             if len(desc.get('description', '')) < 10:
                 continue
             text = desc.get('description')
-            sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
             lang = None
             if self.lang_detect:
                 try:
                     lang = langdetect.detect(text)
                 except langdetect.lang_detect_exception.LangDetectException as err:
-                    print('language detection failed: {}'.format(err), file=sys.stderr)
-            abstracts.append(fatcat_openapi_client.ReleaseAbstract(
-                mimetype="text/plain",
-                content=text,
-                sha1=sha1,
-                lang=lang,
-            ))
+                    print('language detection failed: {}'.format(err),
+                          file=sys.stderr)
+            abstracts.append(
+                fatcat_openapi_client.ReleaseAbstract(
+                    mimetype="text/plain",
+                    content=text,
+                    lang=lang,
+                ))
 
         # References and relations. Datacite include many relation types in
         # "attributes.relatedIdentifiers[].relationType", e.g.
@@ -476,17 +487,19 @@ class DataciteImporter(EntityImporter):
                 ref_extra['doi'] = rel.get('relatedIdentifier')
             if not ref_extra:
                 ref_extra = None
-            refs.append(fatcat_openapi_client.ReleaseRef(
-                index=ref_index,
-                extra=ref_extra,
-            ))
+            refs.append(
+                fatcat_openapi_client.ReleaseRef(
+                    index=ref_index,
+                    extra=ref_extra,
+                ))
             ref_index += 1
 
         # Start with clear stages, e.g. published. TODO(martin): we could
         # probably infer a bit more from the relations, e.g.
         # "IsPreviousVersionOf" or "IsNewVersionOf".
         release_stage = None
-        if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+        if attributes.get(
+                'state') == 'findable' or attributes.get('isActive') is True:
             release_stage = 'published'
 
         # Extra information.
@@ -496,8 +509,22 @@ class DataciteImporter(EntityImporter):
             extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
             extra_datacite['subjects'] = attributes['subjects']
-        if attributes.get('url'):
-            extra_datacite['url'] = attributes['url']
+
+        # Include certain relations from relatedIdentifiers. Keeping the
+        # original structure of data here, which is a list of dicts, with
+        # relation type, identifier and identifier type (mostly).
+        relations = []
+        for rel in relIds:
+            if rel.get('relationType') in ('IsPartOf', 'Reviews', 'Continues',
+                                           'IsVariantFormOf', 'IsSupplementTo',
+                                           'HasVersion', 'IsMetadataFor',
+                                           'IsNewVersionOf', 'IsIdenticalTo',
+                                           'IsVersionOf', 'IsDerivedFrom',
+                                           'IsSourceOf'):
+                relations.append(rel)
+
+        if relations:
+            extra_datacite['relations'] = relations
 
         extra = dict()
 
@@ -515,7 +542,7 @@ class DataciteImporter(EntityImporter):
             release_stage=release_stage,
             title=title,
             subtitle=subtitle,
-            original_title=title,
+            original_title=original_language_title,
             release_year=release_year,
             release_date=release_date,
             publisher=publisher,
@@ -546,7 +573,7 @@ class DataciteImporter(EntityImporter):
         hide schema mismatch bugs.
         """
         if self.debug is True:
-            print(json.dumps(re.to_dict(), default=extended_json_encoder))
+            print(json.dumps(entity_to_dict(re, api_client=None)))
             return False
 
         # lookup existing DOI (don't need to try other ext idents for crossref)
@@ -572,24 +599,15 @@ class DataciteImporter(EntityImporter):
         if self.insert_log_file:
             with open(self.insert_log_file, 'a') as f:
                 for doc in batch:
-                    json.dump(doc.to_dict(), f, default=extended_json_encoder)
+                    json.dump(entity_to_dict(doc, api_client=None), f)
                     f.write('\n')
-        self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
-            editgroup=fatcat_openapi_client.Editgroup(
-            description=self.editgroup_description,
-            extra=self.editgroup_extra),
-            entity_list=batch))
+        self.api.create_release_auto_batch(
+            fatcat_openapi_client.ReleaseAutoBatch(
+                editgroup=fatcat_openapi_client.Editgroup(
+                    description=self.editgroup_description,
+                    extra=self.editgroup_extra),
+                entity_list=batch))
 
-def extended_json_encoder(value):
-    """
-    Can be used with json.dumps(value, default=extended_json_encoder) to serialize
-    value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
-    """
-    if isinstance(value, (datetime.datetime, datetime.date)):
-        return value.isoformat()
-    if isinstance(value, set):
-        return list(value)
-    raise TypeError('cannot encode type: {}'.format(type(value)))
 
 def lookup_license_slug(raw):
     """
@@ -604,3 +622,156 @@ def lookup_license_slug(raw):
         if not raw.endswith('/'):
             raw = raw + '/'
     return LICENSE_SLUG_MAP.get(raw)
+
+
+def find_original_language_title(item, min_length=4, max_questionmarks=3):
+    """
+    Return the 'original_language_title' of item if it differs from the
+    (required) 'title' and has at most max_questionmarks '?' characters, else
+    None. Plain strings must be at least min_length long; dict values are
+    read from their '__content__' key.
+    """
+    if 'original_language_title' not in item or not item.get('title'):
+        return None
+    title = item.get('title')
+    candidate = item.get('original_language_title')
+    if isinstance(candidate, str):
+        if candidate == title or len(candidate) < min_length:
+            return None
+    elif isinstance(candidate, dict):
+        candidate = candidate.get('__content__', '') or ''
+        if not candidate or candidate == title:
+            return None
+    else:
+        return None
+    if candidate.count('?') > max_questionmarks:
+        return None
+    return candidate
+
+
+def parse_datacite_titles(titles):
+    """
+    Given a list of title items from datacite, return 3-tuple (title,
+    original_language_title, subtitle).
+
+    Items lacking a 'title' value are tolerated and treated as empty.
+
+    Example input:
+
+        [
+            {
+                 "title": "Meeting Heterogeneity in Consumer Demand"
+            }
+        ]
+    """
+    title, original_language_title, subtitle = None, None, None
+
+    if not titles:
+        return title, original_language_title, subtitle
+
+    if len(titles) == 1:
+        # A single entry is the title, whatever its titleType says.
+        original_language_title = find_original_language_title(titles[0])
+        title = (titles[0].get('title') or '').strip() or None
+        return title, original_language_title, subtitle
+
+    for entry in titles:
+        # Guard against a missing or null 'title' before calling strip().
+        text = (entry.get('title') or '').strip()
+        if not title and not entry.get('titleType'):
+            title = text
+        if not subtitle and entry.get('titleType') == 'Subtitle':
+            subtitle = text
+        if not original_language_title:
+            original_language_title = find_original_language_title(entry)
+
+    return title, original_language_title, subtitle
+
+
+def parse_datacite_dates(dates):
+    """
+    Given a list of date fields (under .dates), return tuple, (release_date,
+    release_year).
+
+    Date types are tried in priority order; if none yields a plausible date,
+    any parsable item is used. A value given as a bare year sets only
+    release_year. Years before 1000 or more than five years ahead of today
+    are treated as bogus and never returned. Items whose 'date' is missing
+    or not a string are skipped.
+
+    Raises ValueError if dates is non-empty but not a list.
+    """
+    if not dates:
+        return None, None
+
+    if not isinstance(dates, list):
+        raise ValueError('expected a list of date items')
+
+    # Ignored: Collected, Issued.
+    date_type_prio = (
+        'Valid',
+        'Available',
+        'Accepted',
+        'Submitted',
+        'Copyrighted',
+        'Created',
+        'Updated',
+    )
+
+    # Before using (expensive) dateparser, try a few common patterns.
+    common_patterns = ('%Y-%m-%d', '%Y-%m', '%Y-%m-%dT%H:%M:%SZ',
+                       '%Y-%m-%dT%H:%M:%S', '%Y')
+
+    max_year = datetime.date.today().year + 5
+
+    def parse_item(item):
+        """
+        Return a plausible (release_date, release_year), or (None, None).
+        """
+        value = item.get('date')
+        if not isinstance(value, str):
+            # strptime raises an uncaught TypeError on non-string input.
+            return None, None
+
+        result, year_only = None, False
+        for pattern in common_patterns:
+            try:
+                result = datetime.datetime.strptime(value, pattern)
+            except ValueError:
+                continue
+            else:
+                year_only = pattern == '%Y'
+                break
+
+        if result is None:
+            print('fallback for {}'.format(value), file=sys.stderr)
+            try:
+                result = dateparser.parse(value)
+            except TypeError as err:
+                print("{} date parsing failed with: {}".format(value, err),
+                      file=sys.stderr)
+                return None, None
+
+        if result is None or not 1000 <= result.year <= max_year:
+            # Unparsable or possibly bogus date.
+            return None, None
+
+        return (None if year_only else result.date()), result.year
+
+    # Walk the date types by priority, first plausible date wins.
+    for prio in date_type_prio:
+        for item in dates:
+            if item.get('dateType') != prio:
+                continue
+
+            release_date, release_year = parse_item(item)
+            if release_year is not None:
+                return release_date, release_year
+
+    # No prioritized date type matched, fall back to any parsable item.
+    for item in dates:
+        release_date, release_year = parse_item(item)
+        if release_year is not None:
+            return release_date, release_year
+
+    return None, None
author	Martin Czygan <martin.czygan@gmail.com>	2019-12-21 23:30:56 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2019-12-28 23:07:31 +0100
commit	a196435a0e88f85785742cdd089344f97401b43a (patch)
tree	056dceaa4ccd567096b2d3e789efdd573682a8c3 /python/fatcat_tools/importers
parent	52eabd48658a676ac4577d1c8da31df1fe58093e (diff)
download	fatcat-a196435a0e88f85785742cdd089344f97401b43a.tar.gz fatcat-a196435a0e88f85785742cdd089344f97401b43a.zip