diff options
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/workers/changelog.py | 37 | 
1 files changed, 34 insertions, 3 deletions
def want_live_ingest(self, release, ingest_request):
    """
    Decide whether an ingest request is worth enqueueing for live ingest.

    In theory, crawling every DOI through to its landing page is valuable.
    This method is intended to be an operational point of control for
    reducing load on daily ingest crawling (via wayback SPN).

    This is a method of ``EntityUpdatesWorker``; it reads
    ``self.ingest_oa_only`` and ``self.ingest_pdf_doi_prefix_blocklist``.

    Args:
        release: release entity the request was generated from. It is only
            passed to ``release_to_elasticsearch()`` when the OA check runs.
        ingest_request: dict-like ingest request; the keys read here are
            ``link_source``, ``ingest_type`` and ``ext_ids`` (``doi``).

    Returns:
        ``True`` if the request should be enqueued, ``False`` otherwise.
    """

    # The request carries its link origin under 'link_source'. The previous
    # lookup used the key 'ingest_request', which always yielded None and
    # so disabled the arxiv/pmc exemption below.
    link_source = ingest_request.get('link_source')
    ingest_type = ingest_request.get('ingest_type')

    # In OA-only mode, requests that do not come from arxiv or pmc must
    # belong to a release the elasticsearch transform flags as OA.
    if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
        es = release_to_elasticsearch(release)
        if not es['is_oa']:
            return False

    # Tolerate a missing or explicit-None 'ext_ids' mapping.
    ext_ids = ingest_request.get('ext_ids') or {}
    doi = ext_ids.get('doi')

    # Skip PDF ingest for DOI prefixes known not to resolve to PDF fulltext.
    if ingest_type == "pdf" and doi:
        if doi.startswith(tuple(self.ingest_pdf_doi_prefix_blocklist)):
            return False

    return True
