1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
from enum import Enum
from typing import Optional, List
from pydantic import BaseModel
from fatcat_openapi_client import ReleaseEntity
class AccessType(str, Enum):
    """Describes the type of an access URL.

    Members subclass ``str`` so each one compares equal to its plain string
    value.
    """

    # URL pointing at a web.archive.org capture
    wayback = "wayback"
    # URL pointing at a file on archive.org
    ia_file = "ia_file"
    # The remaining kinds are not produced by the code visible in this
    # module; presumably they are used by other callers -- TODO confirm.
    ia_microfilm = "ia_microfilm"
    repository = "repository"
    openlibrary = "openlibrary"
    wikipedia = "wikipedia"
class AccessOption(BaseModel):
    """A single way of accessing a work: the kind of URL, the URL itself,
    and optional metadata about what the URL points to."""

    # which kind of access URL this is
    access_type: AccessType

    # note: for `target_url` refs, would do a CDX lookup and this URL would be
    # a valid/HTTP-200 web.archive.org capture URL
    access_url: str

    # The optional fields below carry an explicit `None` default. Pydantic v1
    # already treated a bare `Optional[X]` annotation as "not required,
    # default None"; spelling the default out keeps that behaviour under
    # pydantic v2, where a bare `Optional[X]` would be a required field.

    # application/pdf, text/html, etc
    # blank for landing pages
    mimetype: Optional[str] = None
    # size of the target in bytes, when known
    size_bytes: Optional[int] = None
    # URL of a preview image, when one is available
    thumbnail_url: Optional[str] = None
def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
    """
    Extracts access options from a release.

    Walks the release's files in order and, for each file, its URLs in order.
    The first URL containing '://web.archive.org/' (reported as "wayback") or
    '://archive.org/' (reported as "ia_file") is returned as a single-element
    list. If no URL matches, or the release has no files, an empty list is
    returned.

    TODO: proper implementation and filtering, instead of just returning first
    option found
    """
    for file_entity in (release.files or []):
        # Thumbnails are only offered for PDFs with a known SHA-1 and at
        # least one URL.
        thumb = None
        if file_entity.mimetype == 'application/pdf' and file_entity.sha1 and file_entity.urls:
            # NOTE: scholar.archive.org does an actual database check before
            # generating these URLs, but we skip that for speed
            digest = file_entity.sha1
            thumb = f"https://blobs.fatcat.wiki/thumbnail/pdf/{digest[0:2]}/{digest[2:4]}/{digest}.180px.jpg"

        for candidate in (file_entity.urls or []):
            # Classify the URL; anything that is neither a wayback capture
            # nor an archive.org file is skipped.
            if '://web.archive.org/' in candidate.url:
                kind = "wayback"
            elif '://archive.org/' in candidate.url:
                kind = "ia_file"
            else:
                continue

            # First match wins: return immediately with just this option.
            return [AccessOption(
                access_type=kind,
                access_url=candidate.url,
                mimetype=file_entity.mimetype,
                size_bytes=file_entity.size,
                thumbnail_url=thumb,
            )]

    # Nothing matched.
    return []
|