diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-28 13:34:17 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-28 13:34:19 -0800 |
commit | fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd (patch) | |
tree | 452c90ccc98aa1cc69be9a1bf3f163ebde41aec9 /python/fatcat_tools/transforms/ingest.py | |
parent | 2e3988fcf6441bef7ee4b030e499fd129e7cb189 (diff) | |
download | fatcat-fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd.tar.gz fatcat-fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd.zip |
remove 'oa_only' feature from ingest transform
Refactoring to move this filter elsewhere
Diffstat (limited to 'python/fatcat_tools/transforms/ingest.py')
-rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 15 |
1 file changed, 1 insertion, 14 deletions
```diff
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index d6393753..27a4fb93 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,7 +1,5 @@
 
-from .elasticsearch import release_to_elasticsearch
-
-def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type=None):
+def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
     """
     Takes a full release entity object and returns an ingest request (as dict),
     or None if it seems like this release shouldn't be ingested.
@@ -9,12 +7,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
     The release entity should have the container, file, fileset, and webcapture
     fields set.
 
-    The 'oa_only' boolean flag indicates that we should only return an ingest
-    request if we have reason to believe this is an OA release (or, eg, in
-    arxiv or pubmed central). Respecting this flag means we are likely to miss
-    a lot of "hybrid" and "bronze" content, but could reduce crawl load
-    significantly.
-
     The type of the ingest request may depend on release type and container
     metadata (eg, as to whether we expect a PDF, datasets, web page), so
     calling code should check the returned type field.
@@ -48,11 +40,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
     ext_ids = release.ext_ids.to_dict()
     ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
 
-    if oa_only and link_source not in ('arxiv', 'pmc'):
-        es = release_to_elasticsearch(release)
-        if not es['is_oa']:
-            return None
-
     # TODO: infer ingest type based on release_type or container metadata?
     if not ingest_type:
         ingest_type = 'pdf'
```