Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/importers/common.py      | 12
-rw-r--r-- python/fatcat_tools/importers/datacite.py    | 12
-rw-r--r-- python/fatcat_tools/importers/ingest.py      |  7
-rw-r--r-- python/fatcat_tools/importers/pubmed.py      |  1
-rw-r--r-- python/fatcat_tools/workers/elasticsearch.py | 15
5 files changed, 36 insertions(+), 11 deletions(-)
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index 99c330a6..eafc6546 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -458,7 +458,8 @@ class EntityImporter:
             creator_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._orcid_id_map[orcid] = creator_id # might be None
         return creator_id
 
@@ -479,7 +480,8 @@ class EntityImporter:
             release_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._doi_id_map[doi] = release_id # might be None
         return release_id
 
@@ -495,7 +497,8 @@ class EntityImporter:
             release_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._pmid_id_map[pmid] = release_id # might be None
         return release_id
 
@@ -512,7 +515,8 @@ class EntityImporter:
             container_id = rv.ident
         except ApiException as ae:
             # If anything other than a 404 (not found), something is wrong
-            assert ae.status == 404
+            if ae.status != 404:
+                raise ae
         self._issnl_id_map[issnl] = container_id # might be None
         return container_id
 
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index 81f00876..d998f266 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -496,10 +496,12 @@ class DataciteImporter(EntityImporter):
             if not desc.get('descriptionType') == 'Abstract':
                 continue
 
-            # Description maybe a string or list.
+            # Description maybe a string, int or list.
             text = desc.get('description', '')
             if not text:
                 continue
+            if isinstance(text, int):
+                text = '{}'.format(text)
             if isinstance(text, list):
                 try:
                     text = "\n".join(text)
@@ -758,6 +760,14 @@ class DataciteImporter(EntityImporter):
                 given_name = clean(given_name)
             if surname:
                 surname = clean(surname)
+
+            # Perform a final assertion that name does not reduce to zero
+            # (e.g. whitespace only name).
+            if name:
+                name = name.strip()
+                if not name:
+                    continue
+
             if raw_affiliation == '':
                 continue
diff --git a/python/fatcat_tools/importers/ingest.py b/python/fatcat_tools/importers/ingest.py
index 4772bfaa..6cf1604b 100644
--- a/python/fatcat_tools/importers/ingest.py
+++ b/python/fatcat_tools/importers/ingest.py
@@ -19,6 +19,7 @@ class IngestFileResultImporter(EntityImporter):
             editgroup_description=eg_desc,
             editgroup_extra=eg_extra,
             **kwargs)
+        self.use_glutton_match = False
         self.default_link_rel = kwargs.get("default_link_rel", "web")
         assert self.default_link_rel
         self.require_grobid = require_grobid
@@ -107,9 +108,10 @@ class IngestFileResultImporter(EntityImporter):
                 elif err.status == 400:
                     self.counts['warn-extid-invalid'] += 1
                     continue
+                raise err
             release_ident = release.ident
             break
-        if not release_ident and row.get('grobid'):
+        if self.use_glutton_match and not release_ident and row.get('grobid'):
             # try biblio-glutton extracted hit
             if row['grobid'].get('fatcat_release'):
                 release_ident = row['grobid']['fatcat_release'].split('_')[-1]
@@ -197,8 +199,7 @@ class IngestFileResultImporter(EntityImporter):
         if not existing:
             return True
 
-        # the following checks all assume there is an existing item
-
+        # NOTE: the following checks all assume there is an existing item
         if (fe.release_ids[0] in existing.release_ids) and existing.urls:
             # TODO: could still, in theory update with the new URL?
             self.counts['exists'] += 1
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index abcb21d9..3d3e3a8c 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -782,6 +782,7 @@ class PubmedImporter(EntityImporter):
                 # NOTE: API behavior might change in the future?
                 if "release_edit_editgroup_id_ident_id_key" in err.body:
                     self.counts['skip-update-conflict'] += 1
+                    return False
                 else:
                     raise err
             finally:
diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py
index 525f372b..e58b3da1 100644
--- a/python/fatcat_tools/workers/elasticsearch.py
+++ b/python/fatcat_tools/workers/elasticsearch.py
@@ -19,7 +19,7 @@ class ElasticsearchReleaseWorker(FatcatWorker):
     def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None,
             elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat",
-            batch_size=200):
+            batch_size=200, api_host="https://api.fatcat.wiki/v0"):
         super().__init__(kafka_hosts=kafka_hosts,
                          consume_topic=consume_topic)
         self.consumer_group = "elasticsearch-updates3"
@@ -30,9 +30,11 @@ class ElasticsearchReleaseWorker(FatcatWorker):
         self.entity_type = ReleaseEntity
         self.elasticsearch_document_name = "release"
         self.transform_func = release_to_elasticsearch
+        self.api_host = api_host
 
     def run(self):
         ac = ApiClient()
+        api = public_api(self.api_host)
 
         def fail_fast(err, partitions):
             if err is not None:
@@ -103,13 +105,20 @@ class ElasticsearchReleaseWorker(FatcatWorker):
                 if entity_dict.get('name') and not entity_dict.get('title'):
                     continue
                 entity = entity_from_json(json_str, self.entity_type, api_client=ac)
+                if self.elasticsearch_document_name == "changelog":
+                    key = entity.index
+                    # might need to fetch from API
+                    if not (entity.editgroup and entity.editgroup.editor):
+                        entity = api.get_changelog_entry(entity.index)
+                else:
+                    key = entity.ident
                 # TODO: handle deletions from index
                 bulk_actions.append(json.dumps({
-                    "index": { "_id": entity.ident, },
+                    "index": { "_id": key, },
                 }))
                 bulk_actions.append(json.dumps(
                     self.transform_func(entity)))
-            print("Upserting, eg, {} (of {} releases in elasticsearch)".format(entity.ident, len(batch)))
+            print("Upserting, eg, {} (of {} {} in elasticsearch)".format(key, len(batch), self.elasticsearch_document_name))
             elasticsearch_endpoint = "{}/{}/{}/_bulk".format(
                 self.elasticsearch_backend,
                 self.elasticsearch_index,