diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-28 13:34:17 -0800 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-01-28 13:34:19 -0800 | 
| commit | fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd (patch) | |
| tree | 452c90ccc98aa1cc69be9a1bf3f163ebde41aec9 /python/fatcat_tools | |
| parent | 2e3988fcf6441bef7ee4b030e499fd129e7cb189 (diff) | |
| download | fatcat-fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd.tar.gz fatcat-fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd.zip | |
remove 'oa_only' feature from ingest transform
Refactoring to move this filter elsewhere
Diffstat (limited to 'python/fatcat_tools')
| -rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 15 | 
1 file changed, 1 insertion, 14 deletions
```diff
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index d6393753..27a4fb93 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,7 +1,5 @@
-from .elasticsearch import release_to_elasticsearch
-
 
-def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type=None):
+def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
     """
     Takes a full release entity object and returns an ingest request (as dict),
     or None if it seems like this release shouldn't be ingested.
@@ -9,12 +7,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
     The release entity should have the container, file, fileset, and webcapture
     fields set.
 
-    The 'oa_only' boolean flag indicates that we should only return an ingest
-    request if we have reason to believe this is an OA release (or, eg, in
-    arxiv or pubmed central). Respecting this flag means we are likely to miss
-    a lot of "hybrid" and "bronze" content, but could reduce crawl load
-    significantly.
-
     The type of the ingest request may depend on release type and container
     metadata (eg, as to whether we expect a PDF, datasets, web page), so
     calling code should check the returned type field.
@@ -48,11 +40,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
     ext_ids = release.ext_ids.to_dict()
     ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
 
-    if oa_only and link_source not in ('arxiv', 'pmc'):
-        es = release_to_elasticsearch(release)
-        if not es['is_oa']:
-            return None
-
     # TODO: infer ingest type based on release_type or container metadata?
     if not ingest_type:
         ingest_type = 'pdf'
```
