summaryrefslogtreecommitdiffstats
path: root/python/fatcat_tools/transforms/access.py
blob: e3228d30f5eb5006160e3458b61e8fdb577b86ef (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from enum import Enum
from typing import List, Optional

from fatcat_openapi_client import ReleaseEntity
from pydantic import BaseModel


class AccessType(str, Enum):
    """
    Kind of location that an access URL points at.

    Members are also plain strings (the class mixes in `str`), so each one
    compares equal to its own value, e.g. `AccessType.wayback == "wayback"`.
    """

    # used for URLs containing "://web.archive.org/"
    wayback = "wayback"
    # used for URLs containing "://archive.org/"
    ia_file = "ia_file"

    # the remaining types are not produced by any code visible in this file
    ia_microfilm = "ia_microfilm"
    repository = "repository"
    openlibrary = "openlibrary"
    wikipedia = "wikipedia"


class AccessOption(BaseModel):
    """
    A single way of accessing a work: a typed URL plus optional file metadata.

    `access_type` and `access_url` are always required. The remaining fields
    describe the file behind the URL and may be left out (they default to
    None).
    """

    access_type: AccessType

    # note: for `target_url` refs, would do a CDX lookup and this URL would be
    # a valid/HTTP-200 web.archive.org capture URL
    access_url: str

    # The optional fields below carry an explicit `None` default so they can
    # be omitted no matter how the installed pydantic version treats a bare
    # `Optional[...]` annotation (implicit default vs. required-but-nullable).

    # application/pdf, text/html, etc
    # blank for landing pages
    mimetype: Optional[str] = None

    # file size in bytes, when known
    size_bytes: Optional[int] = None

    # URL of a preview image for the file, when one is available
    thumbnail_url: Optional[str] = None


def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
    """
    Returns the access options for a release, derived from its file URLs.

    Files are visited in order, and within each file the URLs are visited in
    order. The first URL containing "://web.archive.org/" (reported as
    "wayback") or "://archive.org/" (reported as "ia_file") is wrapped in an
    AccessOption and returned right away as a single-element list. If no URL
    matches, or the release has no files, an empty list is returned.

    TODO: proper implementation and filtering, instead of just returning first
    option found
    """
    for file_entity in release.files or []:
        # Only PDFs which have both a SHA-1 and at least one URL get a
        # thumbnail link; everything else is reported without one.
        if file_entity.mimetype == "application/pdf" and file_entity.sha1 and file_entity.urls:
            # NOTE: scholar.archive.org does an actual database check before
            # generating these URLs, but we skip that for speed
            sha1 = file_entity.sha1
            thumbnail_url = f"https://blobs.fatcat.wiki/thumbnail/pdf/{sha1[0:2]}/{sha1[2:4]}/{sha1}.180px.jpg"
        else:
            thumbnail_url = None

        for url_entity in file_entity.urls or []:
            url = url_entity.url

            # wayback is tested first, so it wins when both patterns match
            if "://web.archive.org/" in url:
                access_type = "wayback"
            elif "://archive.org/" in url:
                access_type = "ia_file"
            else:
                continue

            # first match wins: stop scanning and hand back a one-item list
            return [
                AccessOption(
                    access_type=access_type,
                    access_url=url,
                    mimetype=file_entity.mimetype,
                    size_bytes=file_entity.size,
                    thumbnail_url=thumbnail_url,
                )
            ]

    return []