aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-02-22 14:57:44 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-02-22 15:07:33 -0800
commit dc4116f8ebd225eb4af1cecfc75f5c1291589694 (patch)
tree e005a9fef1933a5ec02a42c33b44d3ecb17f9bb4 /python
parent 2e8998d20d428820ba5bf9b21800e22ac1cdb41f (diff)
download fatcat-dc4116f8ebd225eb4af1cecfc75f5c1291589694.tar.gz
fatcat-dc4116f8ebd225eb4af1cecfc75f5c1291589694.zip
entity worker: ingest more releases
If release is a dataset or image, don't do a pdf ingest request. If release is a datacite DOI, and release_type is a "document", crawl regardless of is_oa detection. This is mostly to crawl repositories (institutional or subject).
Diffstat (limited to 'python')
-rw-r--r-- python/fatcat_tools/workers/changelog.py 38
1 files changed, 37 insertions, 1 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 745ee85a..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -123,6 +123,33 @@ class EntityUpdatesWorker(FatcatWorker):
ingest_type = ingest_request.get('ingest_type')
doi = ingest_request.get('ext_ids', {}).get('doi')
+ is_document = release.release_type in (
+ 'article-journal',
+ 'paper-conference',
+ 'article',
+ 'report',
+ 'chapter',
+ 'manuscript',
+ 'review',
+ 'thesis',
+ 'letter',
+ 'editorial',
+ 'abstract',
+ 'entry',
+ 'patent',
+ 'post',
+ 'review-book',
+ )
+ is_not_pdf = release.release_type in (
+ 'dataset',
+ 'stub',
+ 'software',
+ 'figure',
+ 'graphic',
+ )
+
+ # accept list sets a default "crawl it" despite OA metadata for
+ # known-OA DOI prefixes
in_acceptlist = False
if doi:
for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
@@ -131,9 +158,18 @@ class EntityUpdatesWorker(FatcatWorker):
if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
es = release_to_elasticsearch(release)
- if not es['is_oa'] and not in_acceptlist:
+ # most datacite documents are in IRs and should be crawled
+ is_datacite_doc = False
+ if release.extra and ('datacite' in release.extra) and is_document:
+ is_datacite_doc = True
+ if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
return False
+ # if ingest_type is pdf but release_type is almost certainly not a PDF,
+ # skip it. This is mostly a datacite thing.
+ if ingest_type == "pdf" and is_not_pdf:
+ return False
+
if ingest_type == "pdf" and doi:
for prefix in self.ingest_pdf_doi_prefix_blocklist:
if doi.startswith(prefix):