From 03d2004717d36962aef1bd373d59ce799d7db9ab Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@robocracy.org>
Date: Tue, 11 Aug 2020 15:45:36 -0700
Subject: entity update: change big5 ingest behavior

In addition to changing the OA default, this was the main intended
behavior change in this group of commits: we want to make fewer ingest
attempts that we *expect* to fail, but default to an ingest/crawl attempt
if we are uncertain. This is because there is a long tail of journals
that register DOIs and are de facto OA (fulltext is available), but we
don't have metadata indicating them as such.
---
 python/fatcat_tools/workers/changelog.py | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index f7df6748..65a8fcd8 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -159,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker):
         link_source = ingest_request.get('ingest_request')
         ingest_type = ingest_request.get('ingest_type')
         doi = ingest_request.get('ext_ids', {}).get('doi')
+        es = release_to_elasticsearch(release)
 
         is_document = release.release_type in (
             'article',
@@ -191,17 +192,16 @@ class EntityUpdatesWorker(FatcatWorker):
             'stub',
         )
 
-        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+        # accept list sets a default "crawl it" despite OA metadata for
+        # known-OA DOI prefixes
+        in_acceptlist = False
+        if doi:
+            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+                if doi.startswith(prefix):
+                    in_acceptlist = True
 
-            # accept list sets a default "crawl it" despite OA metadata for
-            # known-OA DOI prefixes
-            in_acceptlist = False
-            if doi:
-                for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
-                    if doi.startswith(prefix):
-                        in_acceptlist = True
+        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
 
-            es = release_to_elasticsearch(release)
             # most datacite documents are in IRs and should be crawled
             is_datacite_doc = False
             if release.extra and ('datacite' in release.extra) and is_document:
@@ -209,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker):
             if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                 return False
 
+        # big publishers *generally* have accurate OA metadata, use
+        # preservation networks, and block our crawlers. So unless OA, or
+        # explicitly on accept list, or not preserved, skip crawling
+        if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist):
+            return False
+
         # if ingest_type is pdf but release_type is almost certainly not a PDF,
         # skip it. This is mostly a datacite thing.
         if ingest_type == "pdf" and is_not_pdf:
-- 
cgit v1.2.3