summary | refs | log | tree | commit | diff | stats
path: root/python/fatcat_tools
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-08-11 15:23:25 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-08-11 15:32:28 -0700
commit: 5eddc9b9aefbd7ae197d441b8a7af1fded940e2d (patch)
tree: 41b7821a91cce595bd26be3504c0ca6ba5582616 /python/fatcat_tools
parent: 2a492914082444690f853a55ab1394fc0cf50108 (diff)
download: fatcat-5eddc9b9aefbd7ae197d441b8a7af1fded940e2d.tar.gz
fatcat-5eddc9b9aefbd7ae197d441b8a7af1fded940e2d.zip
entity update: default to ingest non-OA works
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- python/fatcat_tools/workers/changelog.py | 19
1 files changed, 10 insertions, 9 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index dc5ef299..f7df6748 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -89,7 +89,7 @@ class EntityUpdatesWorker(FatcatWorker):
self.ingest_file_request_topic = ingest_file_request_topic
self.poll_interval = poll_interval
self.consumer_group = "entity-updates"
- self.ingest_oa_only = True
+ self.ingest_oa_only = False
self.ingest_pdf_doi_prefix_blocklist = [
# gbif.org: many DOIs, not PDF fulltext
"10.15468/",
@@ -191,15 +191,16 @@ class EntityUpdatesWorker(FatcatWorker):
'stub',
)
- # accept list sets a default "crawl it" despite OA metadata for
- # known-OA DOI prefixes
- in_acceptlist = False
- if doi:
- for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
- if doi.startswith(prefix):
- in_acceptlist = True
-
if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+
+ # accept list sets a default "crawl it" despite OA metadata for
+ # known-OA DOI prefixes
+ in_acceptlist = False
+ if doi:
+ for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+ if doi.startswith(prefix):
+ in_acceptlist = True
+
es = release_to_elasticsearch(release)
# most datacite documents are in IRs and should be crawled
is_datacite_doc = False