diff options
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/workers/__init__.py | 2 |
| -rw-r--r-- | python/fatcat_tools/workers/changelog.py | 29 |
| -rw-r--r-- | python/fatcat_tools/workers/elasticsearch.py | 23 |
3 files changed, 40 insertions, 14 deletions
| diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py index 8bea7cdc..32fd330d 100644 --- a/python/fatcat_tools/workers/__init__.py +++ b/python/fatcat_tools/workers/__init__.py @@ -1,4 +1,4 @@  from .changelog import ChangelogWorker, EntityUpdatesWorker -from .elasticsearch import ElasticsearchReleaseWorker, ElasticsearchContainerWorker +from .elasticsearch import ElasticsearchReleaseWorker, ElasticsearchContainerWorker, ElasticsearchChangelogWorker  from .worker_common import most_recent_message, FatcatWorker diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d1e7c2db..3a49f86e 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -153,28 +153,33 @@ class EntityUpdatesWorker(FatcatWorker):          doi = ingest_request.get('ext_ids', {}).get('doi')          is_document = release.release_type in ( -            'article-journal', -            'paper-conference',              'article', -            'report', +            'article-journal', +            'article-newspaper', +            'book',              'chapter', -            'manuscript', -            'review', -            'thesis', -            'letter',              'editorial', -            'abstract', -            'entry', +            'interview', +            'legal_case', +            'legislation', +            'letter', +            'manuscript', +            'paper-conference',              'patent', -            'post', +            'peer_review', +            'report', +            'retraction', +            'review',              'review-book', +            'thesis',          )          is_not_pdf = release.release_type in ( +            'component',              'dataset', -            'stub', -            'software',              'figure',              'graphic', +            'software', +            'stub',          )          # accept list sets a 
default "crawl it" despite OA metadata for diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index 68d6c304..525f372b 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -4,7 +4,7 @@ import time  import requests  from confluent_kafka import Consumer, KafkaException -from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient +from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient, ChangelogEntry  from fatcat_tools import *  from .worker_common import FatcatWorker @@ -148,3 +148,24 @@ class ElasticsearchContainerWorker(ElasticsearchReleaseWorker):          self.elasticsearch_document_name = "container"          self.transform_func = container_to_elasticsearch + +class ElasticsearchChangelogWorker(ElasticsearchReleaseWorker): +    """ +    Pulls changelog messages from Kafka, runs transformations and indexes them. + +    Note: Very early versions of changelog entries did not contain details +    about the editor or extra fields. +    """ +    def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, +            elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat_changelog", +            batch_size=200): +        super().__init__(kafka_hosts=kafka_hosts, +                         consume_topic=consume_topic) +        self.consumer_group = "elasticsearch-updates3" +        self.batch_size = batch_size +        self.poll_interval = poll_interval +        self.elasticsearch_backend = elasticsearch_backend +        self.elasticsearch_index = elasticsearch_index +        self.entity_type = ChangelogEntry +        self.elasticsearch_document_name = "changelog" +        self.transform_func = changelog_to_elasticsearch | 
