python/fatcat_tools/transforms/ingest.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

from typing import Any, Dict, Optional

from fatcat_openapi_client import ReleaseEntity

# Per-container ingest type overrides, keyed by container ident. When the
# caller does not pass an explicit ingest type, a release whose container_id
# appears here gets the mapped type instead of one derived from release_type.
INGEST_TYPE_CONTAINER_MAP: Dict[str, str] = {
    "twtpsm6ytje3nhuqfu3pa7ca7u": "html",  # Optica
    "cg4vcsfty5dfvgmat5wm62wgie": "html",  # Optics Express
    "svz5ul6qozdjhjhk7d627avuja": "html",  # First Monday
    "ugbiirfvufgcjkx33r3cmemcuu": "html",  # D-Lib Magazine
    "lx7svdzmc5dl3ay4zncjjrql7i": "html",  # Distill (distill.pub)
    "lovwr7ladjagzkhmoaszg7efqu": "html",  # NLM technical bulletin
}


# Ingest type implied by a release's release_type when neither the caller nor
# the container map chose one. Every release_type not listed here (journal
# articles, chapters, books, reports, theses, unknown or missing types, ...)
# is requested as "pdf"; "stub" releases are skipped before this is consulted.
_INGEST_TYPE_BY_RELEASE_TYPE: Dict[str, str] = {
    "component": "component",
    "graphic": "component",
    "dataset": "dataset",
    "software": "software",
    "post-weblog": "html",
}


def release_ingest_request(
    release: ReleaseEntity,
    ingest_request_source: str = "fatcat",
    ingest_type: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """
    Build an ingest request (a plain dict) for a release entity.

    Returns None when the release should not be ingested: its state is not
    "active", it is a "stub" with no ingest type chosen by the caller or the
    container map, or none of its external identifiers yields a URL.

    The ingest type is resolved in this order: the `ingest_type` argument,
    then INGEST_TYPE_CONTAINER_MAP (by `release.container_id`), then a
    default derived from `release.release_type`. Because the result may be a
    PDF, dataset, web page, etc., callers should inspect the "ingest_type"
    field of the returned request.

    Callers are expected to pass a full release entity (with container, file,
    fileset, and webcapture fields populated); this function itself reads
    only state, container_id, release_type, release_stage, ident, work_id,
    and ext_ids.
    """

    if release.state != "active":
        return None

    if not ingest_type and release.container_id:
        ingest_type = INGEST_TYPE_CONTAINER_MAP.get(release.container_id)

    if not ingest_type:
        if release.release_type == "stub":
            return None
        ingest_type = _INGEST_TYPE_BY_RELEASE_TYPE.get(release.release_type, "pdf")

    # Choose the URL where fulltext is expected. arXiv and PMC links point
    # straight at a PDF, so they are only used for "pdf" requests; otherwise
    # fall back to resolver/landing-page URLs in order: DOI, DOAJ, Handle.
    identifiers = release.ext_ids
    wants_pdf = ingest_type == "pdf"
    if identifiers.arxiv and wants_pdf:
        link_source = "arxiv"
        link_source_id = identifiers.arxiv
        base_url = "https://arxiv.org/pdf/{}.pdf".format(link_source_id)
    elif identifiers.pmcid and wants_pdf:
        # TODO: how to tell if an author manuscript in PMC vs. published?
        # (The NCBI URL https://www.ncbi.nlm.nih.gov/pmc/articles/<pmcid>/pdf/
        # is a known alternative; the Europe PMC renderer is what is used.)
        link_source = "pmc"
        link_source_id = identifiers.pmcid
        base_url = "http://europepmc.org/backend/ptpmcrender.fcgi?accid={}&blobtype=pdf".format(
            link_source_id
        )
    elif identifiers.doi:
        link_source = "doi"
        link_source_id = identifiers.doi.lower()
        base_url = "https://doi.org/{}".format(link_source_id)
    elif identifiers.doaj:
        link_source = "doaj"
        link_source_id = identifiers.doaj.lower()
        base_url = "https://doaj.org/article/{}".format(link_source_id)
    elif identifiers.hdl:
        link_source = "hdl"
        link_source_id = identifiers.hdl.lower()
        base_url = "https://hdl.handle.net/{}".format(link_source_id)
    else:
        # no usable identifier, so nothing to request
        return None

    # Only pass along the external identifiers that are actually set.
    populated_ext_ids = {
        key: value for key, value in identifiers.to_dict().items() if value
    }

    return {
        "ingest_type": ingest_type,
        "ingest_request_source": ingest_request_source,
        "base_url": base_url,
        "release_stage": release.release_stage,
        "fatcat": {
            "release_ident": release.ident,
            "work_ident": release.work_id,
        },
        "ext_ids": populated_ext_ids,
        "link_source": link_source,
        "link_source_id": link_source_id,
    }