# Provenance (retained from the original patch header):
#   commit:  bb085c92760d6ccbd6c92e13fcae0af02b5a3d17
#   author:  Bryan Newbold
#   date:    Thu, 15 Apr 2021 23:29:34 -0700
#   subject: partial access options transform for releases
#   file:    python/fatcat_tools/transforms/access.py (new file, index 231cd2b3)

from enum import Enum
from typing import Optional, List, Any, Dict

from pydantic import BaseModel
from fatcat_openapi_client import ReleaseEntity


class AccessType(str, Enum):
    """describes type of access URL"""

    wayback = "wayback"
    ia_file = "ia_file"
    ia_microfilm = "ia_microfilm"
    repository = "repository"


class AccessOption(BaseModel):
    """A single access URL for a release, plus metadata about its target."""

    access_type: AccessType

    # note: for `target_url` refs, would do a CDX lookup and this URL would be
    # a valid/HTTP-200 web.archive.org capture URL
    access_url: str

    # application/pdf, text/html, etc
    # blank for landing pages
    #
    # The optional fields carry an explicit `None` default so that they stay
    # optional regardless of how the installed pydantic version treats an
    # `Optional[...]` annotation without a default.
    mimetype: Optional[str] = None

    size_bytes: Optional[int] = None
    thumbnail_url: Optional[str] = None


# URL substring -> access type. Checked in this order for every URL, so a
# web.archive.org match takes precedence over a plain archive.org match.
_URL_MARKERS = (
    ("://web.archive.org/", AccessType.wayback),
    ("://archive.org/", AccessType.ia_file),
)


def release_access_options(release: ReleaseEntity) -> List[AccessOption]:
    """
    Extracts access options from a release.

    Walks `release.files` and each file's `urls` in order and returns a
    single-element list for the first URL containing `://web.archive.org/`
    (access type `wayback`) or `://archive.org/` (access type `ia_file`).
    The option's `mimetype` and `size_bytes` are copied from the file;
    `thumbnail_url` is always `None`.

    Returns an empty list when the release has no files, no URLs, or no URL
    matching either marker. URL entries whose `url` is missing or empty are
    skipped.

    TODO: proper implementation (currently only the first match is returned)
    """
    for f in (release.files or []):
        for u in (f.urls or []):
            url = u.url
            if not url:
                # a missing URL can never match; skipping avoids a TypeError
                # from the substring test below
                continue
            for marker, access_type in _URL_MARKERS:
                if marker in url:
                    return [AccessOption(
                        access_type=access_type,
                        access_url=url,
                        mimetype=f.mimetype,
                        size_bytes=f.size,
                        thumbnail_url=None,
                    )]
    return []