aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms/access.py
blob: 231cd2b331c33b42fca33281ac28db70ef99c68d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

from enum import Enum
from typing import Optional, List, Any, Dict

from pydantic import BaseModel
from fatcat_openapi_client import ReleaseEntity


class AccessType(str, Enum):
    """
    Kind of location an access URL points at.

    Members are also plain strings (`str` mixin), so they compare equal to
    their literal values.
    """

    # used for URLs containing '://web.archive.org/'
    wayback = "wayback"
    # used for URLs containing '://archive.org/'
    ia_file = "ia_file"
    # NOTE(review): not produced by anything in this module; presumably
    # reserved for other access sources -- confirm against callers
    ia_microfilm = "ia_microfilm"
    repository = "repository"

class AccessOption(BaseModel):
    """
    A single way of accessing content: a typed URL plus optional metadata
    about the file behind it.

    `access_type` and `access_url` are required; the remaining fields are
    optional and default to None.
    """

    access_type: AccessType

    # note: for `target_url` refs, would do a CDX lookup and this URL would be
    # a valid/HTTP-200 web.archive.org capture URL
    access_url: str

    # application/pdf, text/html, etc
    # blank for landing pages
    #
    # Explicit `= None` defaults: pydantic v1 implied them for Optional
    # fields, but pydantic v2 treats an Optional field without a default as
    # required. Spelling the default out keeps these fields omittable on both.
    mimetype: Optional[str] = None

    size_bytes: Optional[int] = None
    thumbnail_url: Optional[str] = None


def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
    """
    Extracts access options from a release.

    Walks `release.files` and each file's `urls` in order, and returns a
    single-element list describing the first URL that contains either
    '://web.archive.org/' (reported as "wayback") or '://archive.org/'
    (reported as "ia_file"). The file's `mimetype` and `size` are copied into
    the option; `thumbnail_url` is always None.

    Returns an empty list if no URL matches, including when `release.files`
    or a file's `urls` is None or empty.

    TODO: proper implementation
    """
    # (URL substring, access type) pairs, tried in this order for every URL
    url_markers = (
        ('://web.archive.org/', "wayback"),
        ('://archive.org/', "ia_file"),
    )
    for f in (release.files or []):
        for u in (f.urls or []):
            if not u.url:
                # nothing to match against; skip rather than raise TypeError
                # on the substring test below
                continue
            for marker, access_type in url_markers:
                if marker in u.url:
                    # first match wins: only one option is ever returned
                    return [AccessOption(
                        access_type=access_type,
                        access_url=u.url,
                        mimetype=f.mimetype,
                        size_bytes=f.size,
                        thumbnail_url=None
                    )]
    return []