diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-11 15:23:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-11 15:32:28 -0700 |
commit | 5eddc9b9aefbd7ae197d441b8a7af1fded940e2d (patch) | |
tree | 41b7821a91cce595bd26be3504c0ca6ba5582616 /python/fatcat_tools | |
parent | 2a492914082444690f853a55ab1394fc0cf50108 (diff) | |
download | fatcat-5eddc9b9aefbd7ae197d441b8a7af1fded940e2d.tar.gz fatcat-5eddc9b9aefbd7ae197d441b8a7af1fded940e2d.zip |
entity update: default to ingest non-OA works
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 19 |
1 file changed, 10 insertions, 9 deletions
```diff
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index dc5ef299..f7df6748 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -89,7 +89,7 @@ class EntityUpdatesWorker(FatcatWorker):
         self.ingest_file_request_topic = ingest_file_request_topic
         self.poll_interval = poll_interval
         self.consumer_group = "entity-updates"
-        self.ingest_oa_only = True
+        self.ingest_oa_only = False
         self.ingest_pdf_doi_prefix_blocklist = [
             # gbif.org: many DOIs, not PDF fulltext
             "10.15468/",
@@ -191,15 +191,16 @@ class EntityUpdatesWorker(FatcatWorker):
             'stub',
         )
 
-        # accept list sets a default "crawl it" despite OA metadata for
-        # known-OA DOI prefixes
-        in_acceptlist = False
-        if doi:
-            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
-                if doi.startswith(prefix):
-                    in_acceptlist = True
-
         if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+
+            # accept list sets a default "crawl it" despite OA metadata for
+            # known-OA DOI prefixes
+            in_acceptlist = False
+            if doi:
+                for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+                    if doi.startswith(prefix):
+                        in_acceptlist = True
+
             es = release_to_elasticsearch(release)
             # most datacite documents are in IRs and should be crawled
             is_datacite_doc = False
```