Diffstat (limited to 'python/fatcat_tools')
-rw-r--r--  python/fatcat_tools/importers/common.py       | 12
-rw-r--r--  python/fatcat_tools/importers/datacite.py     | 12
-rw-r--r--  python/fatcat_tools/importers/ingest.py       |  7
-rw-r--r--  python/fatcat_tools/importers/pubmed.py       |  1
-rw-r--r--  python/fatcat_tools/workers/elasticsearch.py  | 15
5 files changed, 36 insertions, 11 deletions
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 99c330a6..eafc6546 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -458,7 +458,8 @@ class EntityImporter:
             creator_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._orcid_id_map[orcid] = creator_id # might be None
         return creator_id
 
@@ -479,7 +480,8 @@ class EntityImporter:
             release_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._doi_id_map[doi] = release_id # might be None
         return release_id
 
@@ -495,7 +497,8 @@ class EntityImporter:
             release_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._pmid_id_map[pmid] = release_id # might be None
         return release_id
 
@@ -512,7 +515,8 @@ class EntityImporter:
             container_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._issnl_id_map[issnl] = container_id # might be None
         return container_id
 
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 81f00876..d998f266 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -496,10 +496,12 @@ class DataciteImporter(EntityImporter):
             if not desc.get('descriptionType') == 'Abstract':
                 continue
 
-            # Description maybe a string or list.
+            # Description maybe a string, int or list.
             text = desc.get('description', '')
             if not text:
                 continue
+            if isinstance(text, int):
+                text = '{}'.format(text)
             if isinstance(text, list):
                 try:
                     text = "\n".join(text)
@@ -758,6 +760,14 @@ class DataciteImporter(EntityImporter):
                     given_name = clean(given_name)
                 if surname:
                     surname = clean(surname)
+
+                # Perform a final assertion that name does not reduce to zero
+                # (e.g. whitespace only name).
+                if name:
+                    name = name.strip()
+                if not name:
+                    continue
+
                 if raw_affiliation == '':
                     continue
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 4772bfaa..6cf1604b 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -19,6 +19,7 @@ class IngestFileResultImporter(EntityImporter):
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
             **kwargs)
+        self.use_glutton_match = False
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
         self.require_grobid = require_grobid
@@ -107,9 +108,10 @@ class IngestFileResultImporter(EntityImporter):
                     elif err.status == 400:
                         self.counts['warn-extid-invalid'] += 1
                         continue
+                    raise err
                 release_ident = release.ident
                 break
-        if not release_ident and row.get('grobid'):
+        if self.use_glutton_match and not release_ident and row.get('grobid'):
             # try biblio-glutton extracted hit
             if row['grobid'].get('fatcat_release'):
                 release_ident = row['grobid']['fatcat_release'].split('_')[-1]
@@ -197,8 +199,7 @@ class IngestFileResultImporter(EntityImporter):
         if not existing:
             return True
 
-        # the following checks all assume there is an existing item
-
+        # NOTE: the following checks all assume there is an existing item
         if (fe.release_ids[0] in existing.release_ids) and existing.urls:
             # TODO: could still, in theory update with the new URL?
             self.counts['exists'] += 1
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index abcb21d9..3d3e3a8c 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -782,6 +782,7 @@ class PubmedImporter(EntityImporter):
                 # NOTE: API behavior might change in the future?
                 if "release_edit_editgroup_id_ident_id_key" in err.body:
                     self.counts['skip-update-conflict'] += 1
+                    return False
                 else:
                     raise err
             finally:
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index 525f372b..e58b3da1 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -19,7 +19,7 @@ class ElasticsearchReleaseWorker(FatcatWorker):
 
     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
             elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
-            batch_size=200):
+            batch_size=200, api_host="https://api.fatcat.wiki/v0"):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic)
         self.consumer_group = "elasticsearch-updates3"
@@ -30,9 +30,11 @@ class ElasticsearchReleaseWorker(FatcatWorker):
         self.entity_type = ReleaseEntity
         self.elasticsearch_document_name = "release"
         self.transform_func = release_to_elasticsearch
+        self.api_host = api_host
 
     def run(self):
         ac = ApiClient()
+        api = public_api(self.api_host)
 
         def fail_fast(err, partitions):
             if err is not None:
@@ -103,13 +105,20 @@ class ElasticsearchReleaseWorker(FatcatWorker):
                     if entity_dict.get('name') and not entity_dict.get('title'):
                         continue
                 entity = entity_from_json(json_str, self.entity_type, api_client=ac)
+                if self.elasticsearch_document_name == "changelog":
+                    key = entity.index
+                    # might need to fetch from API
+                    if not (entity.editgroup and entity.editgroup.editor):
+                        entity = api.get_changelog_entry(entity.index)
+                else:
+                    key = entity.ident
                 # TODO: handle deletions from index
                 bulk_actions.append(json.dumps({
-                    "index": { "_id": entity.ident, },
+                    "index": { "_id": key, },
                 }))
                 bulk_actions.append(json.dumps(
                     self.transform_func(entity)))
-            print("Upserting, eg, {} (of {} releases in elasticsearch)".format(entity.ident, len(batch)))
+            print("Upserting, eg, {} (of {} {} in elasticsearch)".format(key, len(batch), self.elasticsearch_document_name))
             elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
                 self.elasticsearch_backend,
                 self.elasticsearch_index,
