| author    | Martin Czygan <martin.czygan@gmail.com> | 2019-12-17 17:38:45 +0100 |
|-----------|-----------------------------------------|---------------------------|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2019-12-28 23:07:31 +0100 |
| commit    | 76d6d4d2de6580ae147e40c43c18f04cc48b62ec (patch) | |
| tree      | 40bed440966209df206b7c7e4e9ccd1d807719f8 | |
| parent    | 68a051abc45103f21284163d13c8893c31b4e8e4 (diff) | |
| download  | fatcat-76d6d4d2de6580ae147e40c43c18f04cc48b62ec.tar.gz, fatcat-76d6d4d2de6580ae147e40c43c18f04cc48b62ec.zip | |
datacite: add missing mappings and notes
| -rw-r--r-- | python/fatcat_tools/importers/datacite.py | 441 | 
1 file changed, 175 insertions, 266 deletions
```diff
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index e486ba90..4e117dde 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -1,5 +1,5 @@
 """
-WIP: Importer for datacite.org data.
+Prototype Importer for datacite.org data.
 
 Example doc at: https://gist.github.com/miku/5610a2d64e3fee82d16f5d3f3a295fc8
 """
@@ -8,9 +8,11 @@ from .common import EntityImporter
 import dateparser
 import langcodes
 import datetime
+import langdetect
 import fatcat_openapi_client
 import json
 import sys
+import hashlib
 
 # https://guide.fatcat.wiki/entity_container.html#container_type-vocabulary
 CONTAINER_TYPE_MAP = {
@@ -20,40 +22,8 @@ CONTAINER_TYPE_MAP = {
 }
 
 # The docs/guide should be the cannonical home for these mappings; update there
-# first.
-#
-# > select count(*), release_type from release_rev group by release_type order by count(*) desc;
-#
-#   count   |   release_type
-# ----------+-------------------
-#  95030004 | article-journal
-#  13477878 | chapter
-#   5926811 | paper-conference
-#   2169642 | article
-#   1806415 | dataset
-#   1548614 | book
-#   1390304 |
-#    818351 | report
-#    815684 | entry
-#    307998 | standard
-#    297769 | thesis
-#    261426 | letter
-#    148093 | post
-#    122736 | editorial
-#     99225 | stub
-#     96219 | review-book
-#     22854 | peer_review
-#     19078 | interview
-#     16278 | article-newspaper
-#      3973 | speech
-#      3536 | legal_case
-#      2264 | abstract
-#      1626 | legislation
-#      1053 | retraction
-#        85 | component
-# (25 rows)
-#
-# Map various datacite type types to CSL-ish types. None means TODO or remove.
+# first.  Map various datacite type types to CSL-ish types. None means TODO or
+# remove.
 DATACITE_TYPE_MAP = {
     'ris': {
         'THES': 'thesis',
@@ -197,91 +167,17 @@ class DataciteImporter(EntityImporter):
 
     def parse_record(self, obj):
         """
-        TODO(martin): Map datacite to RE.
-
-        WIP, notes:
-
-        * Many subjects, should they end up in extra?
-        * attributes.creators and attributes.contributors
-
-        $ jq '.attributes.creators[]?.nameType?' datacite.500k | sort | uniq -c | sort -nr
-        3963663 "Personal"
-        289795 null
-        8892 "Organizational"
-
-        Shall we use issued, available?
-
-          {
-            "date": "2011-11-18",
-            "dateType": "Accepted"
-          },
-          {
-            "date": "2011-11-18",
-            "dateType": "Available"
-          },
-          {
-            "date": "2011-11-07",
-            "dateType": "Copyrighted"
-          },
-          {
-            "date": "2011-11-18",
-            "dateType": "Issued"
-          },
-          {
-            "date": "2011-11-07",
-            "dateType": "Issued"
-          }
-
-        TODO(martin): Quick analysis of dates and stages.
+        Mapping datacite JSON to ReleaseEntity.
         """
-
         if 'attributes' not in obj:
             return None
 
         attributes = obj['attributes']
 
-        # > Contributors
-        #
-        #  "attributes.creators[].contributorType": [
-        #    "author"
-        #  ],
-        #  "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
-        #    "LCNA",
-        #    "GND",
-        #    "email",
-        #    "NAF",
-        #    "OSF",
-        #    "RRID",
-        #    "ORCID",
-        #    "SCOPUS",
-        #    "NRCPID",
-        #    "schema.org",
-        #    "GRID",
-        #    "MGDS",
-        #    "VIAF",
-        #    "JACoW-ID"
-        #  ],
-        #
-        #    "https://orcid.org/0000-0002-9902-738X",
-        #    "http://jacow.org/JACoW-00001280",
-        #    "Wiebe_Peter",
-        #    "https://osf.io/https://osf.io/kjfuy/",
-        #    "http://www.viaf.org176549220",
-        #    "2239",
-        #    "Jeffries_Martin",
-        #    "https://orcid.org/0000-0002-1493-6630",
-        #    "0000-0002-6233-612X",
-        #
-        # "creators": [
-        #   {
-        #     "name": "Bögli, Hans",
-        #     "nameType": "Personal",
-        #     "givenName": "Hans",
-        #     "familyName": "Bögli",
-        #     "affiliation": []
-        #   }
-        # ],
-
+        # Contributors. Many nameIdentifierSchemes, we do not use yet:
+        # "attributes.creators[].nameIdentifiers[].nameIdentifierScheme": [
+        # "LCNA", "GND", "email", "NAF", "OSF", "RRID", "ORCID", "SCOPUS",
+        # "NRCPID", "schema.org", "GRID", "MGDS", "VIAF", "JACoW-ID" ],
        contribs = []
 
        for i, c in enumerate(attributes['creators']):
@@ -304,15 +200,8 @@ class DataciteImporter(EntityImporter):
                 surname=c.get('familyName'),
             ))
 
-        # > Title
-        #
-        #   "attributes.titles[].titleType": [
-        #     "AlternativeTitle",
-        #     "Other",
-        #     "Subtitle",
-        #     null,
-        #     "TranslatedTitle"
-        #   ],
+        # Title, may come with "attributes.titles[].titleType", like
+        # "AlternativeTitle", "Other", "Subtitle", "TranslatedTitle"
 
         title, subtitle = None, None
         for entry in attributes.get('titles', []):
@@ -321,22 +210,13 @@
             if entry.get('titleType') == 'Subtitle':
                 subtitle = entry.get('title').strip()
 
-        # > Dates
-        #
-        #  "attributes.dates[].dateType": [
-        #    "Accepted",
-        #    "Available"
-        #    "Collected",
-        #    "Copyrighted",
-        #    "Created",
-        #    "Issued",
-        #    "Submitted",
-        #    "Updated",
-        #    "Valid",
-        #  ],
-        #
-        # Different documents have different dates defined. Choose the topmost
-        # available from prio list.
+        # Dates. A few internal dates (registered, created, updated) and
+        # published (0..2554). We try to work with typed date list, in
+        # "attributes.dates[].dateType", values: "Accepted", "Available"
+        # "Collected", "Copyrighted", "Created", "Issued", "Submitted",
+        # "Updated", "Valid".
+        release_year, release_date = None, None
+
         date_type_prio = (
             'Valid',
             'Issued',
@@ -348,14 +228,16 @@
             'Created',
             'Updated',
         )
-
-        release_year, release_date = None, None
         for prio in date_type_prio:
             dates = attributes.get('dates', []) or [] # Never be None.
             for item in dates:
                 if not item.get('dateType') == prio:
                     continue
-                result = dateparser.parse(item.get('date'))
+                try:
+                    result = dateparser.parse(item.get('date'))
+                except TypeError as err:
+                    print("{} failed with: {}".format(item.get('date'), err), file=sys.stderr)
+                    continue
                 if result is None:
                     # Unparsable date.
                     continue
@@ -369,56 +251,23 @@
                 continue
             break
 
-        # > Publisher
-        #
-        # A few NA values. A few bogus values.
-        #
+        # Publisher. A few NA values. A few bogus values.
         publisher = attributes.get('publisher')
 
-        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)'):
+        if publisher in ('(:unav)', 'Unknown', 'n.a.', '[s.n.]', '(:unap)', '(:none)'):
             publisher = None
 
         if publisher is not None and len(publisher) > 80:
-            # Arbitrary magic value, TODO(martin): better heuristic.
-            # Example: "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
-            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE / Unbekannt,
-            # Nutzungsrechte müssen durch den Nutzer abgeklärt werden",
-            # TODO(martin): log misses.
+            # Arbitrary magic value max length. TODO(martin): better heuristic,
+            # but factored out; first we have to log misses. Example:
+            # "ETH-Bibliothek Zürich, Bildarchiv / Fotograf: Feller,
+            # Elisabeth, Empfänger, Unbekannt, Fotograf / Fel_041033-RE /
+            # Unbekannt, Nutzungsrechte müssen durch den Nutzer abgeklärt
+            # werden"
             publisher = None
 
-        # > Container
-        #
-        # For the moment, only ISSN as container.
-        #
-        #    "container": {
-        #      "type": "Journal",
-        #      "issue": "8",
-        #      "title": "Angewandte Chemie International Edition",
-        #      "volume": "57",
-        #      "lastPage": "2080",
-        #      "firstPage": "2077",
-        #      "identifier": "14337851",
-        #      "identifierType": "ISSN"
-        #    },
-        #
-        #  "attributes.container.type": [
-        #    "DataRepository",
-        #    "Journal",
-        #    "Series",
-        #    "Book Series"
-        #  ],
-        #
-        #  "attributes.container.identifierType": [
-        #    "Handle",
-        #    "ISBN",
-        #    "LISSN",
-        #    "DOI",
-        #    "EISSN",
-        #    "URL",
-        #    "ISSN"
-        #  ],
-        #
-
+        # Container. For the moment, only ISSN as container.
         container_id = None
+
         container = attributes.get('container', {}) or {}
         if container.get('type') in CONTAINER_TYPE_MAP.keys():
             container_type = CONTAINER_TYPE_MAP.get(container['type'])
@@ -440,142 +289,202 @@
                         container_id = ce_edit.ident
                         self._issnl_id_map[issnl] = container_id
 
-        # > License
-        #
-        # attributes.rightsList[].rightsUri
-        # attributes.rightsList[].rights
-        # attributes.rightsList[].lang
-        #
+        # Volume and issue.
+        volume = container.get('volume')
+        issue = container.get('issue')
+
+        # Pages.
+        pages = None
+
+        first_page = container.get('firstPage')
+        last_page = container.get('lastPage')
+
+        if first_page and last_page:
+            try:
+                int(first_page) < int(last_page)
+                pages = '{}-{}'.format(first_page, last_page)
+            except ValueError as err:
+                print(err, file=sys.stderr)
+                pass
+
+        if not pages and first_page:
+            pages = first_page
+        # License.
         license_slug = None
         license_extra = []
+
         for l in attributes.get('rightsList', []):
             slug = lookup_license_slug(l.get('rightsUri'))
             if slug:
                 license_slug = slug
             license_extra.append(l)
 
-        # > Release type.
-        #
-        # Datacite has some fine granular typing (e.g. "Supplementary
-        # Collection of Datasets", "Taxonomic treatment", "blog_entry", ...
-        #
-        # Additional, coarse: resourceTypeGeneral
-        #
-        #  "attributes.types.resourceTypeGeneral": [
-        #    "Image",
-        #    "Dataset",
-        #    "PhysicalObject",
-        #    "Collection",
-        #    "Text",
-        #    "Sound",
-        #    "InteractiveResource",
-        #    "Event",
-        #    "Software",
-        #    "Other",
-        #    "Workflow",
-        #    "Audiovisual"
-        #  ],
-        #  "attributes.types.citeproc": [
-        #    "dataset",
-        #    "chapter",
-        #    "article-journal",
-        #    "song",
-        #    "article",
-        #    "report",
-        #    "graphic",
-        #    "thesis",
-        #    "book"
-        #  ],
-        #
-        # There is RIS, also.
-
-        # attributes.types.resourceType contains too many things for now.
+        # Release type. Try to determine the release type from a variety of
+        # types supplied in datacite. The "attributes.types.resourceType"
+        # contains too many (176 in sample) things for now; citeproc may be the
+        # closest, but not always supplied.
         for typeType in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
-            release_type = attributes.get('types', {}).get(typeType)
+            value = attributes.get('types', {}).get(typeType)
+            release_type = DATACITE_TYPE_MAP.get(value)
             if release_type is not None:
                 break
 
-        # TODO(martin): Skip unmapped release_type entirely?
         if release_type is None:
             print("datacite unmapped type: {}".format(release_type), file=sys.stderr)
 
-        # > Language.
-        # attributes.language
-
+        # Language values are varied ("ger", "es", "English", "ENG", "en-us",
+        # "other", ...). Try to crush it with langcodes: "It may sound to you
+        # like langcodes solves a pretty boring problem. At one level, that's
+        # right. Sometimes you have a boring problem, and it's great when a
+        # library solves it for you." -- TODO(martin): We need more of these.
         language = None
-        value = attributes.get('language', '') or '' # As it is written.
+
+        value = attributes.get('language', '') or ''
         try:
             language = langcodes.find(value).language
         except LookupError:
             try:
                 language = langcodes.get(value).language
             except langcodes.tag_parser.LanguageTagError:
+                print('could not determine language: {}'.format(value), file=sys.stderr)
+
+        # Abstracts appear in "attributes.descriptions[].descriptionType", some
+        # of the observed values: "Methods", "TechnicalInfo",
+        # "SeriesInformation", "Other", "TableOfContents", "Abstract". The
+        # "Other" fields might contain references or related articles (with
+        # DOI). TODO(martin): maybe try to parse out some of those refs.
+        abstracts = []
+
+        for desc in attributes.get('descriptions', []):
+            if not desc.get('descriptionType') == 'Abstract':
+                continue
+            if len(desc.get('description', '')) < 10:
+                continue
+            text = desc.get('description')
+            sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
+            lang = None
+            try:
+                lang = langdetect.detect(text)
+            except langdetect.lang_detect_exception.LangDetectException:
                 pass
+            abstracts.append(fatcat_openapi_client.ReleaseAbstract(
+                mimetype="text/plain",
+                content=text,
+                sha1=sha1,
+                lang=lang,
+            ))
+
+        # References and relations. Datacite include many relation types in
+        # "attributes.relatedIdentifiers[].relationType", e.g.
+        # "IsPartOf", "IsPreviousVersionOf", "Continues", "IsVariantFormOf",
+        # "IsSupplementTo", "Cites", "IsSupplementedBy", "IsDocumentedBy", "HasVersion",
+        # "IsCitedBy", "IsMetadataFor", "IsNewVersionOf", "IsIdenticalTo", "HasPart",
+        # "References", "Reviews", "HasMetadata", "IsContinuedBy", "IsVersionOf",
+        # "IsDerivedFrom", "IsSourceOf".
+        #
+        # For the moment, we only care about References.
+        refs, ref_index = [], 0
+
+        for rel in attributes.get('relatedIdentifiers', []):
+            if not rel.get('relationType') == 'References':
+                continue
+            ref_extra = dict()
+            if rel.get('relatedIdentifierType') == 'DOI':
+                ref_extra['doi'] = rel.get('relatedIdentifier')
+            if not ref_extra:
+                ref_extra = None
+            refs.append(fatcat_openapi_client.ReleaseRef(
+                index=ref_index,
+                extra=ref_extra,
+            ))
+            ref_index += 1
+
+        # Start with clear stages, e.g. published. TODO(martin): we could
+        # probably infer a bit more from the relations, e.g.
+        # "IsPreviousVersionOf" or "IsNewVersionOf".
+        release_stage = None
+        if attributes.get('state') == 'findable' or attributes.get('isActive') is True:
+            release_stage = 'published'
+
+        # Extra information.
+        extra_datacite = dict()
 
-        # > Extra information: license, subjects, ...
-        extra, extra_datacite = dict(), dict()
         if license_extra:
-            extra_datacite = {
-                'license': license_extra,
-            }
+            extra_datacite['license'] = license_extra
         if attributes.get('subjects'):
-            extra_datacite['subjects'] = attributes.get('subjects', [])
+            extra_datacite['subjects'] = attributes['subjects']
+        if attributes.get('url'):
+            extra_datacite['url'] = attributes['url']
+
+        extra = dict()
         if extra_datacite:
             extra['datacite'] = extra_datacite
 
-        # https://guide.fatcat.wiki/entity_release.html
+        # Assemble release.
         re = fatcat_openapi_client.ReleaseEntity(
             work_id=None,
             container_id=container_id,
             release_type=release_type,
-            release_stage=None,
-            title=title, # attributes.titles, various titleType
+            release_stage=release_stage,
+            title=title,
             subtitle=subtitle,
-            original_title=title, # AlternativeTitle?
-            release_year=release_year, # publicationYear
-            release_date=release_date, # date issues/available?
-            publisher=publisher, # attributes.publisher
+            original_title=title,
+            release_year=release_year,
+            release_date=release_date,
+            publisher=publisher,
             ext_ids=fatcat_openapi_client.ReleaseExtIds(
-                doi=attributes.get('doi'), # attributes.doi,
-                # Can we add handle.net link?
+                doi=attributes.get('doi'),
             ),
             contribs=contribs,
-            volume=None,
-            issue=None,
-            pages=None,
+            volume=volume,
+            issue=issue,
+            pages=pages,
             language=language,
-            abstracts=None,
-            refs=None,
+            abstracts=abstracts,
+            refs=refs,
             extra=extra,
             license_slug=license_slug,
         )
         return re
 
     def try_update(self, re, debug=True):
+        """
+        When debug is true, write the RE to stdout.
+        """
         if debug is True:
-            # print(type(re))
-            print(json.dumps(re.to_dict(), default=extended_encoder))
-            return
-        return False
+            print(json.dumps(re.to_dict(), default=extended_json_encoder))
+            return False
 
-    def insert_batch(self, batch):
-        # Debugging.
-        for item in batch:
-            print(item)
-        return
+        # lookup existing DOI (don't need to try other ext idents for crossref)
+        existing = None
+        try:
+            existing = self.api.lookup_release(doi=re.ext_ids.doi)
+        except fatcat_openapi_client.rest.ApiException as err:
+            if err.status != 404:
+                raise err
+            # doesn't exist, need to update
+            return True
 
-        # Orig.
+        # eventually we'll want to support "updates", but for now just skip if
+        # entity already exists
+        if existing:
+            self.counts['exists'] += 1
+            return False
+
+        return True
+
+    def insert_batch(self, batch):
         self.api.create_release_auto_batch(fatcat_openapi_client.ReleaseAutoBatch(
             editgroup=fatcat_openapi_client.Editgroup(
                 description=self.editgroup_description,
                 extra=self.editgroup_extra),
             entity_list=batch))
 
-def extended_encoder(value):
+def extended_json_encoder(value):
     """
-    Can be used with json.dumps(value, default=extended_encoder) to serialize
+    Can be used with json.dumps(value, default=extended_json_encoder) to serialize
     value not serializable by default. https://docs.python.org/3/library/json.html#basic-usage
     """
     if isinstance(value, (datetime.datetime, datetime.date)):
@@ -585,7 +494,7 @@
 
 def lookup_license_slug(raw):
     """
-    TODO(martin): reuse from crossref, maybe.
+    TODO(martin): reuse from or combine with crossref, maybe.
     """
     if not raw:
         return None
```
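A few notes on the new logic, with small standalone Python sketches. The date handling walks a typed-date priority list and takes the first value `dateparser` can parse. The middle of the `date_type_prio` tuple falls between two hunks above, so the ordering below beyond 'Valid', 'Issued' and before 'Created', 'Updated' is a guess based on the dateType values named in the comment:

```python
# Sketch of the date-priority selection; `dateparser` is a real dependency
# of the importer. The middle of DATE_TYPE_PRIO is an assumption (those
# entries are elided between hunks in the diff above).
import dateparser

DATE_TYPE_PRIO = ('Valid', 'Issued', 'Available', 'Accepted', 'Submitted',
                  'Copyrighted', 'Collected', 'Created', 'Updated')

def pick_release_date(dates):
    """Return a datetime for the highest-priority parseable date, else None."""
    for prio in DATE_TYPE_PRIO:
        for item in dates or []:  # Never be None.
            if item.get('dateType') != prio:
                continue
            try:
                result = dateparser.parse(item.get('date'))
            except TypeError:  # item['date'] may be missing or None
                continue
            if result is not None:
                return result
    return None

dates = [
    {'date': '2011-11-07', 'dateType': 'Copyrighted'},
    {'date': '2011-11-18', 'dateType': 'Issued'},
]
print(pick_release_date(dates).date())  # 2011-11-18; "Issued" outranks "Copyrighted"
```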
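One detail in the new pages logic: the expression `int(first_page) < int(last_page)` is evaluated but its result is discarded, so it only acts as a `ValueError` guard against non-numeric pages; an inverted range like 2080 to 2077 would still be formatted. A sketch that also enforces ordering (a tightening, not the committed behavior):

```python
# Stricter page-range check than the committed code: the comparison result
# is actually used, so inverted ranges fall back to the first page alone.
def build_pages(first_page, last_page):
    if first_page and last_page:
        try:
            if int(first_page) <= int(last_page):
                return '{}-{}'.format(first_page, last_page)
        except ValueError:
            pass  # e.g. roman numerals or "e1234"-style electronic pages
    return first_page or None

print(build_pages('2077', '2080'))  # '2077-2080'
print(build_pages('2080', '2077'))  # '2080' (inverted range, keep first page only)
print(build_pages('xii', None))     # 'xii'
```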
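On the release type mapping: as printed, the hunk assigns `DATACITE_TYPE_MAP.get(value)`, but the map at the top of the file is nested per scheme (`'ris'`, `'citeproc'`, ...), so a flat lookup would rarely hit. A two-level lookup along these lines is presumably the intent; this is an assumption, not the committed code, and it also logs the unmapped raw value rather than `release_type`, which is always `None` at the point the commit prints it:

```python
# Hypothetical two-level type lookup (assumption about intent, not the
# committed code). The map below is a trimmed illustration of the nested
# shape shown at the top of the file.
import sys

DATACITE_TYPE_MAP = {
    'ris': {'THES': 'thesis'},
    'citeproc': {'dataset': 'dataset'},
}

def map_release_type(types):
    for scheme in ('citeproc', 'resourceTypeGeneral', 'schemaOrg', 'bibtex', 'ris'):
        value = types.get(scheme)
        # Look up inside the per-scheme sub-map, not at the top level.
        release_type = DATACITE_TYPE_MAP.get(scheme, {}).get(value)
        if release_type is not None:
            return release_type
    print('datacite unmapped type: {}'.format(types), file=sys.stderr)
    return None

print(map_release_type({'citeproc': 'dataset', 'ris': 'THES'}))  # 'dataset'
```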
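The language block tries `langcodes.find()` first, which matches language names like "English", then falls back to `langcodes.get()`, which parses tags like "en-us". Isolated into a helper (hypothetical name):

```python
# Two-step language normalization, mirroring the importer's fallback chain.
import langcodes

def normalize_language(value):
    try:
        return langcodes.find(value).language   # name lookup: "English" -> "en"
    except LookupError:
        try:
            return langcodes.get(value).language  # tag parse: "en-us" -> "en"
        except langcodes.tag_parser.LanguageTagError:
            return None

print(normalize_language('English'))  # 'en'
print(normalize_language('en-us'))    # 'en'
```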
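For abstracts, the importer stores the SHA-1 of the UTF-8 text (the `sha1` field on `ReleaseAbstract`) alongside a best-effort `langdetect` guess; detection is probabilistic and can raise on short or letter-free input, hence the try/except. Distilled:

```python
# Abstract fields as computed above: SHA1 hex digest plus detected language.
import hashlib
import langdetect

def abstract_fields(text):
    sha1 = hashlib.sha1(text.encode('utf-8')).hexdigest()
    try:
        lang = langdetect.detect(text)
    except langdetect.lang_detect_exception.LangDetectException:
        lang = None  # e.g. no alphabetic features in the text
    return sha1, lang

sha1, lang = abstract_fields('We describe a new importer for datacite.org metadata.')
print(sha1, lang)  # 40-char hex digest, 'en'
```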
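Reference extraction keeps only `relationType == "References"` rows and tucks a DOI into `extra` when the related identifier is one. A plain-dict version of the same filtering (hypothetical helper name, dicts standing in for `ReleaseRef`):

```python
# Same filtering as the refs loop above, without the API model classes.
def extract_refs(related_identifiers):
    refs = []
    for index, rel in enumerate(r for r in related_identifiers
                                if r.get('relationType') == 'References'):
        extra = None
        if rel.get('relatedIdentifierType') == 'DOI':
            extra = {'doi': rel.get('relatedIdentifier')}
        refs.append({'index': index, 'extra': extra})
    return refs

print(extract_refs([
    {'relationType': 'IsPartOf', 'relatedIdentifierType': 'ISSN',
     'relatedIdentifier': '14337851'},
    {'relationType': 'References', 'relatedIdentifierType': 'DOI',
     'relatedIdentifier': '10.1000/xyz'},
]))  # [{'index': 0, 'extra': {'doi': '10.1000/xyz'}}]
```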
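The reworked `try_update` treats a 404 from `lookup_release` as "no such DOI yet, safe to insert" and re-raises any other API error. The same pattern isolated into a helper (hypothetical name; assumes a configured `fatcat_openapi_client` API instance):

```python
# DOI existence check following try_update's 404-means-new convention.
import fatcat_openapi_client

def doi_exists(api, doi):
    try:
        api.lookup_release(doi=doi)
        return True
    except fatcat_openapi_client.rest.ApiException as err:
        if err.status != 404:
            raise  # real API error, propagate
        return False  # 404: DOI not in fatcat yet
```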
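Finally, the renamed `extended_json_encoder` plugs into `json.dumps` via the `default` hook, turning date values into ISO strings. Only the date branch is visible in the hunk; the trailing `raise` below is an assumption added for completeness:

```python
# Usage of the default-hook encoder: json.dumps calls it for any object it
# cannot serialize on its own.
import datetime
import json

def extended_json_encoder(value):
    if isinstance(value, (datetime.datetime, datetime.date)):
        return value.isoformat()
    raise TypeError('not JSON serializable: {!r}'.format(value))  # assumed fallback

print(json.dumps({'release_date': datetime.date(2011, 11, 18)},
                 default=extended_json_encoder))
# {"release_date": "2011-11-18"}
```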
