about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2020-01-28 13:34:17 -0800
committer: Bryan Newbold <bnewbold@robocracy.org> 2020-01-28 13:34:19 -0800
commit: fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd (patch)
tree: 452c90ccc98aa1cc69be9a1bf3f163ebde41aec9
parent: 2e3988fcf6441bef7ee4b030e499fd129e7cb189 (diff)
download: fatcat-fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd.tar.gz
download: fatcat-fbc3cfc2594d90f8a39ee7f6ad2dfd323bcd76dd.zip
remove 'oa_only' feature from ingest transform
Refactoring to move this filter elsewhere
-rwxr-xr-x  python/fatcat_ingest.py  1
-rw-r--r--  python/fatcat_tools/transforms/ingest.py  15
2 files changed, 1 insertion, 15 deletions
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 0df4674e..6ce36974 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -75,7 +75,6 @@ def run_ingest_container(args):
release = args.api.get_release(esr.ident)
ingest_request = release_ingest_request(
release,
- oa_only=False,
ingest_request_source="fatcat-ingest-container",
)
if not ingest_request:
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index d6393753..27a4fb93 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,7 +1,5 @@
-from .elasticsearch import release_to_elasticsearch
-
-def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat', ingest_type=None):
+def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
"""
Takes a full release entity object and returns an ingest request (as dict),
or None if it seems like this release shouldn't be ingested.
@@ -9,12 +7,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
The release entity should have the container, file, fileset, and webcapture
fields set.
- The 'oa_only' boolean flag indicates that we should only return an ingest
- request if we have reason to believe this is an OA release (or, eg, in
- arxiv or pubmed central). Respecting this flag means we are likely to miss
- a lot of "hybrid" and "bronze" content, but could reduce crawl load
- significantly.
-
The type of the ingest request may depend on release type and container
metadata (eg, as to whether we expect a PDF, datasets, web page), so
calling code should check the returned type field.
@@ -48,11 +40,6 @@ def release_ingest_request(release, oa_only=False, ingest_request_source='fatcat
ext_ids = release.ext_ids.to_dict()
ext_ids = dict([(k, v) for (k, v) in ext_ids.items() if v])
- if oa_only and link_source not in ('arxiv', 'pmc'):
- es = release_to_elasticsearch(release)
- if not es['is_oa']:
- return None
-
# TODO: infer ingest type based on release_type or container metadata?
if not ingest_type:
ingest_type = 'pdf'