diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-11 15:45:36 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-08-11 15:45:39 -0700 |
commit | 03d2004717d36962aef1bd373d59ce799d7db9ab (patch) | |
tree | 95e1863476f0e6c4fa0c9b3232e34d024cba0f85 /python | |
parent | a95b382a7add348c15bca4ed98729e47b17df11a (diff) | |
download | fatcat-03d2004717d36962aef1bd373d59ce799d7db9ab.tar.gz fatcat-03d2004717d36962aef1bd373d59ce799d7db9ab.zip |
entity update: change big5 ingest behavior
In addition to changing the OA default, this was the main intended
behavior change in this group of commits: we want to make fewer ingest attempts
that we *expect* to fail, but default to attempting an ingest/crawl if we are
uncertain. This is because there is a long tail of journals that
register DOIs and are de facto OA (fulltext is available), but we don't
have metadata indicating them as such.
Diffstat (limited to 'python')
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 24 |
1 files changed, 15 insertions, 9 deletions
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index f7df6748..65a8fcd8 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -159,6 +159,7 @@ class EntityUpdatesWorker(FatcatWorker): link_source = ingest_request.get('ingest_request') ingest_type = ingest_request.get('ingest_type') doi = ingest_request.get('ext_ids', {}).get('doi') + es = release_to_elasticsearch(release) is_document = release.release_type in ( 'article', @@ -191,17 +192,16 @@ class EntityUpdatesWorker(FatcatWorker): 'stub', ) - if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): + # accept list sets a default "crawl it" despite OA metadata for + # known-OA DOI prefixes + in_acceptlist = False + if doi: + for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: + if doi.startswith(prefix): + in_acceptlist = True - # accept list sets a default "crawl it" despite OA metadata for - # known-OA DOI prefixes - in_acceptlist = False - if doi: - for prefix in self.live_pdf_ingest_doi_prefix_acceptlist: - if doi.startswith(prefix): - in_acceptlist = True + if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'): - es = release_to_elasticsearch(release) # most datacite documents are in IRs and should be crawled is_datacite_doc = False if release.extra and ('datacite' in release.extra) and is_document: @@ -209,6 +209,12 @@ class EntityUpdatesWorker(FatcatWorker): if not (es['is_oa'] or in_acceptlist or is_datacite_doc): return False + # big publishers *generally* have accurate OA metadata, use + # preservation networks, and block our crawlers. So unless OA, or + # explicitly on accept list, or not preserved, skip crawling + if es['publisher_type'] == 'big5' and es['is_preserved'] and not (es['is_oa'] or in_acceptlist): + return False + # if ingest_type is pdf but release_type is almost certainly not a PDF, # skip it. This is mostly a datacite thing. if ingest_type == "pdf" and is_not_pdf: |