summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-05-21 17:46:37 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-05-21 17:46:37 -0700
commit: be28fd5d4a600bc29507418fddfdc95802abc98f (patch)
tree: c32bae2797ff065b8e3668f529ee92c2a33ac6a0
parent: 1e7c692d00d51875b62686c99b3a782b0c3c8fca (diff)
download: fatcat-be28fd5d4a600bc29507418fddfdc95802abc98f.tar.gz
fatcat-be28fd5d4a600bc29507418fddfdc95802abc98f.zip
ingest: add per-container ingest type overrides
-rw-r--r-- python/fatcat_tools/transforms/ingest.py 18
-rw-r--r-- python/tests/transform_ingest.py 6
2 files changed, 23 insertions, 1 deletion
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 59831017..9aaeaa84 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,4 +1,19 @@
+INGEST_TYPE_CONTAINER_MAP = {
+ # Optica
+ "twtpsm6ytje3nhuqfu3pa7ca7u": "html",
+ # Optics Express
+ "cg4vcsfty5dfvgmat5wm62wgie": "html",
+ # First Monday
+ "svz5ul6qozdjhjhk7d627avuja": "html",
+ # D-Lib Magazine
+ "ugbiirfvufgcjkx33r3cmemcuu": "html",
+ # Distill (distill.pub)
+ "lx7svdzmc5dl3ay4zncjjrql7i": "html",
+ # NLM technical bulletin
+ "lovwr7ladjagzkhmoaszg7efqu": "html",
+}
+
def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
"""
Takes a full release entity object and returns an ingest request (as dict),
@@ -15,7 +30,8 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
if release.state != 'active':
return None
- # TODO: infer ingest type based on release_type or container metadata?
+ if (not ingest_type) and release.container_id:
+ ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id)
if not ingest_type:
ingest_type = 'pdf'
diff --git a/python/tests/transform_ingest.py b/python/tests/transform_ingest.py
index c7044bc0..a61aa4a7 100644
--- a/python/tests/transform_ingest.py
+++ b/python/tests/transform_ingest.py
@@ -54,3 +54,9 @@ def test_rich_ingest_release():
assert ir['base_url'] == 'https://doi.org/10.123/456'
assert ir['ext_ids']['doi'] == '10.123/456'
assert ir['ext_ids'].get('pmcid') is None
+ assert ir['ingest_type'] == 'pdf'
+
+ # check ingest type ("d-lib")
+ r.container_id = "ugbiirfvufgcjkx33r3cmemcuu"
+ ir = release_ingest_request(r)
+ assert ir['ingest_type'] == 'html'