entity worker: ingest more releases

If the release is a dataset or image, don't make a PDF ingest request. If the release has datacite metadata and its release_type is a "document" type, crawl it regardless of is_oa detection. This is mostly to crawl repositories (institutional or subject).
author: Bryan Newbold <bnewbold@robocracy.org> 2020-02-22 14:57:44 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-02-22 15:07:33 -0800
commit: dc4116f8ebd225eb4af1cecfc75f5c1291589694 (patch)
tree: e005a9fef1933a5ec02a42c33b44d3ecb17f9bb4 /python
parent: 2e8998d20d428820ba5bf9b21800e22ac1cdb41f (diff)
download: fatcat-dc4116f8ebd225eb4af1cecfc75f5c1291589694.tar.gz
fatcat-dc4116f8ebd225eb4af1cecfc75f5c1291589694.zip
1 files changed, 37 insertions, 1 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 745ee85a..b84d5e70 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -123,6 +123,33 @@ class EntityUpdatesWorker(FatcatWorker):
         ingest_type = ingest_request.get('ingest_type')
         doi = ingest_request.get('ext_ids', {}).get('doi')
 
+        is_document = release.release_type in (
+            'article-journal',
+            'paper-conference',
+            'article',
+            'report',
+            'chapter',
+            'manuscript',
+            'review',
+            'thesis',
+            'letter',
+            'editorial',
+            'abstract',
+            'entry',
+            'patent',
+            'post',
+            'review-book',
+        )
+        is_not_pdf = release.release_type in (
+            'dataset',
+            'stub',
+            'software',
+            'figure',
+            'graphic',
+        )
+
+        # accept list sets a default "crawl it" despite OA metadata for
+        # known-OA DOI prefixes
         in_acceptlist = False
         if doi:
             for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
@@ -131,9 +158,18 @@ class EntityUpdatesWorker(FatcatWorker):
 
         if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
             es = release_to_elasticsearch(release)
-            if not es['is_oa'] and not in_acceptlist:
+            # most datacite documents are in IRs and should be crawled
+            is_datacite_doc = False
+            if release.extra and ('datacite' in release.extra) and is_document:
+                is_datacite_doc = True
+            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                 return False
 
+        # if ingest_type is pdf but release_type is almost certainly not a PDF,
+        # skip it. This is mostly a datacite thing.
+        if ingest_type == "pdf" and is_not_pdf:
+            return False
+
         if ingest_type == "pdf" and doi:
             for prefix in self.ingest_pdf_doi_prefix_blocklist:
                 if doi.startswith(prefix):
author	Bryan Newbold <bnewbold@robocracy.org>	2020-02-22 14:57:44 -0800
committer	Bryan Newbold <bnewbold@robocracy.org>	2020-02-22 15:07:33 -0800
commit	dc4116f8ebd225eb4af1cecfc75f5c1291589694 (patch)
tree	e005a9fef1933a5ec02a42c33b44d3ecb17f9bb4 /python
parent	2e8998d20d428820ba5bf9b21800e22ac1cdb41f (diff)
download	fatcat-dc4116f8ebd225eb4af1cecfc75f5c1291589694.tar.gz fatcat-dc4116f8ebd225eb4af1cecfc75f5c1291589694.zip