diff options
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 65 | 
1 files changed, 51 insertions, 14 deletions
@staticmethod
def datacite_release_type(doi, attributes):
    """
    Determine the release type from the type values supplied in a datacite
    record.

    The "attributes.types.resourceType" value is uncontrolled (170000+ unique
    values, from "null", "Dataset" to "Jupyter Notebook" and "Macroseismic
    Data Points" or "2 days of IP flows in 2009"), so the other type
    vocabularies are consulted through DATACITE_TYPE_MAP instead. citeproc
    may be the closest, but is not always supplied. Lookup is ordered roughly
    by completeness of mapping and stops at the first vocabulary that maps.

    Args:
        doi: DOI string of the record; used for the figshare special case
            and in the diagnostic message.
        attributes: mapping that may carry a "types" mapping keyed by
            vocabulary name ("citeproc", "ris", "schemaOrg", "bibtex",
            "resourceTypeGeneral", "resourceType").

    Returns:
        The mapped release type, "stub" for figshare collections, or None
        when "types" is missing/empty or no vocabulary yields a mapping.
    """
    # Missing or empty "types": nothing to map. Note that this path returns
    # without emitting the "no mapped type" diagnostic below.
    types = attributes.get('types')
    if not types:
        return None

    release_type = None
    for vocabulary in ('citeproc', 'ris', 'schemaOrg', 'bibtex', 'resourceTypeGeneral'):
        value = types.get(vocabulary)
        release_type = DATACITE_TYPE_MAP.get(vocabulary, {}).get(value)
        if release_type is not None:
            break

    # special case: figshare "collections" which group other entities; this
    # overrides whatever the vocabulary lookup produced.
    # NOTE(review): '10.25384' has no trailing slash, unlike '10.6084/'; kept
    # as-is so it matches the same prefix check in biblio_hacks -- confirm
    # whether '10.25384/' was intended.
    if doi.startswith(('10.6084/', '10.25384')) and types.get('resourceType') == "Collection":
        release_type = "stub"

    if release_type is None:
        print("[{}] no mapped type: {}".format(doi, types), file=sys.stderr)

    return release_type
from Use of organic exudates from two polar diatoms by bacterial isolates from the Arctic ocean" +            if " from " in re.title and re.release_type not in ('stub', 'graphic'): +                if re.title.startswith("Figure "): +                    re.release_type = "component" +                elif re.title.startswith("Table "): +                    re.release_type = "component" + +        # figshare.com +        if re.ext_ids.doi.startswith('10.6084/m9.figshare.') and re.container_name is None: +            re.container_name = "figshare.com" +          return re      def try_update(self, re): | 
