From fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 28 Jan 2020 13:34:17 -0800 Subject: remove 'oa_only' feature from ingest transform Refactoring to move this filter elsewhere --- python/fatcat_ingest.py | 1 - python/fatcat_tools/transforms/ingest.py | 15 +-------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 0df4674e..6ce36974 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -75,7 +75,6 @@ def run_ingest_container(args): release = args.api.get_release(esr.ident) ingest_request = release_ingest_request( release, - oa_only=False, ingest_request_source="fatcat-ingest-container", ) if not ingest_request: diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py index d6393753..27a4fb93 100644 --- a/python/fatcat_tools/transforms/ingest.py +++ b/python/fatcat_tools/transforms/ingest.py @@ -1,7 +1,5 @@ -from .elasticsearch import release_to_elasticsearch - -def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type=None): +def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None): """ Takes a full release entity object and returns an ingest request (as dict), or None if it seems like this release shouldn't be ingested. @@ -9,12 +7,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat The release entity should have the container, file, fileset, and webcapture fields set. - The 'oa_only' boolean flag indicates that we should only return an ingest - request if we have reason to believe this is an OA release (or, eg, in - arxiv or pubmed central). Respecting this flag means we are likely to miss - a lot of "hybrid" and "bronze" content, but could reduce crawl load - significantly. - The type of the ingest request may depend on release type and container metadata (eg, as to whether we expect a PDF, datasets, web page), so calling code should check the returned type field. @@ -48,11 +40,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat ext_ids = release.ext_ids.to_dict() ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v]) - if oa_only and link_source not in ('arxiv', 'pmc'): - es = release_to_elasticsearch(release) - if not es['is_oa']: - return None - # TODO: infer ingest type based on release_type or container metadata? if not ingest_type: ingest_type = 'pdf' -- cgit v1.2.3