diff options
Diffstat (limited to 'python')
| -rw-r--r-- | python/fatcat_tools/transforms/ingest.py | 18 | ||||
| -rw-r--r-- | python/tests/transform_ingest.py | 6 | 
2 files changed, 23 insertions, 1 deletion
diff --git a/python/fatcat_tools/transforms/ingest.py b/python/fatcat_tools/transforms/ingest.py
index 59831017..9aaeaa84 100644
--- a/python/fatcat_tools/transforms/ingest.py
+++ b/python/fatcat_tools/transforms/ingest.py
@@ -1,4 +1,19 @@
 
+INGEST_TYPE_CONTAINER_MAP = {
+    # Optica
+    "twtpsm6ytje3nhuqfu3pa7ca7u": "html",
+    # Optics Express
+    "cg4vcsfty5dfvgmat5wm62wgie": "html",
+    # First Monday
+    "svz5ul6qozdjhjhk7d627avuja": "html",
+    # D-Lib Magazine
+    "ugbiirfvufgcjkx33r3cmemcuu": "html",
+    # Distill (distill.pub)
+    "lx7svdzmc5dl3ay4zncjjrql7i": "html",
+    # NLM technical bulletin
+    "lovwr7ladjagzkhmoaszg7efqu": "html",
+}
+
 def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=None):
     """
     Takes a full release entity object and returns an ingest request (as dict),
@@ -15,7 +30,8 @@ def release_ingest_request(release, ingest_request_source='fatcat', ingest_type=
     if release.state != 'active':
         return None
 
-    # TODO: infer ingest type based on release_type or container metadata?
+    if (not ingest_type) and release.container_id:
+        ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id)
     if not ingest_type:
         ingest_type = 'pdf'
 
diff --git a/python/tests/transform_ingest.py b/python/tests/transform_ingest.py
index c7044bc0..a61aa4a7 100644
--- a/python/tests/transform_ingest.py
+++ b/python/tests/transform_ingest.py
@@ -54,3 +54,9 @@ def test_rich_ingest_release():
     assert ir['base_url'] == 'https://doi.org/10.123/456'
     assert ir['ext_ids']['doi'] == '10.123/456'
     assert ir['ext_ids'].get('pmcid') is None
+    assert ir['ingest_type'] == 'pdf'
+
+    # check ingest type ("d-lib")
+    r.container_id = "ugbiirfvufgcjkx33r3cmemcuu"
+    ir = release_ingest_request(r)
+    assert ir['ingest_type'] == 'html'
