aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-08-11 15:45:36 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2020-08-11 15:45:39 -0700
commit 03d2004717d36962aef1bd373d59ce799d7db9ab (patch)
tree 95e1863476f0e6c4fa0c9b3232e34d024cba0f85
parent a95b382a7add348c15bca4ed98729e47b17df11a (diff)
downloadfatcat-03d2004717d36962aef1bd373d59ce799d7db9ab.tar.gz
fatcat-03d2004717d36962aef1bd373d59ce799d7db9ab.zip
entity update: change big5 ingest behavior
In addition to changing the OA default, this was the main intended behavior change in this group of commits: we want to make fewer ingest attempts that we *expect* to fail, but default to an ingest/crawl attempt if we are uncertain. This is because there is a long tail of journals that register DOIs and are de facto OA (fulltext is available), but we don't have metadata indicating them as such.
-rw-r--r--python/fatcat_tools/workers/changelog.py24
1 files changed, 15 insertions, 9 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index f7df6748..65a8fcd8 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -159,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker):
link_source = ingest_request.get('ingest_request')
ingest_type = ingest_request.get('ingest_type')
doi = ingest_request.get('ext_ids', {}).get('doi')
+ es = release_to_elasticsearch(release)
is_document = release.release_type in (
'article',
@@ -191,17 +192,16 @@ class EntityUpdatesWorker(FatcatWorker):
'stub',
)
- if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
+ # accept list sets a default "crawl it" despite OA metadata for
+ # known-OA DOI prefixes
+ in_acceptlist = False
+ if doi:
+ for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
+ if doi.startswith(prefix):
+ in_acceptlist = True
- # accept list sets a default "crawl it" despite OA metadata for
- # known-OA DOI prefixes
- in_acceptlist = False
- if doi:
- for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
- if doi.startswith(prefix):
- in_acceptlist = True
+ if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
- es = release_to_elasticsearch(release)
# most datacite documents are in IRs and should be crawled
is_datacite_doc = False
if release.extra and ('datacite' in release.extra) and is_document:
@@ -209,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker):
if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
return False
+ # big publishers *generally* have accurate OA metadata, use
+ # preservation networks, and block our crawlers. So unless OA, or
+ # explicitly on accept list, or not preserved, skip crawling
+ if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist):
+ return False
+
# if ingest_type is pdf but release_type is almost certainly not a PDF,
# skip it. This is mostly a datacite thing.
if ingest_type == "pdf" and is_not_pdf: