# path: root/python/fatcat_web/hacks.py
# blob: 0339a0d7522bed7237384ff71d823f63df3b79b4
import re

from fatcat_openapi_client import WebcaptureEntity

# Matches one opening ``<ext-link ...>`` tag whose last attribute is
# ``xlink:type="simple"``. ``[^>]*`` (rather than a greedy ``.*``) keeps each
# match inside a single tag, so a string with several links has every opening
# tag stripped individually instead of losing all text between the first and
# the last tag.
STRIP_EXTLINK_XML_RE = re.compile(r"<ext-link[^>]*xlink:type=\"simple\">")


def strip_extlink_xml(unstr: str) -> str:
    """
    Removes ``<ext-link>`` XML markup from a string, keeping the enclosed text.

    Every ``</ext-link>`` closing tag is dropped, as is every opening
    ``<ext-link ...>`` tag that ends with ``xlink:type="simple"``. Text outside
    and between the tags is returned unchanged; a string with no such markup
    comes back as-is.
    """
    # Closing tags are fixed strings, so a plain replace is enough for them.
    unstr = unstr.replace("</ext-link>", "")
    # Opening tags carry variable attributes and need the regex.
    unstr = STRIP_EXTLINK_XML_RE.sub("", unstr)
    return unstr


def test_strip_extlink_xml() -> None:
    """
    Checks that strip_extlink_xml() leaves plain text alone and unwraps a
    single ``<ext-link>`` element down to its text content.
    """
    # A string without any markup must come back untouched.
    plain = "asdf"
    assert strip_extlink_xml(plain) == plain

    # One ext-link element embedded in a citation string.
    marked_up = """LOCKSS (2014) Available: <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="http://lockss.org/" xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014 November 1."""
    expected = """LOCKSS (2014) Available: http://lockss.org/. Accessed: 2014 November 1."""
    assert strip_extlink_xml(marked_up) == expected


def wayback_suffix(entity: WebcaptureEntity) -> str:
    """
    Builds the suffix to append to wayback URLs for a webcapture entity.

    The suffix has the form ``<timestamp>/<original_url>``, with the entity's
    timestamp formatted as ``YYYYMMDDHHMMSS``. When the entity has no
    timestamp, ``*`` takes its place. An entity without an original URL yields
    an empty string.
    """
    url = entity.original_url
    if not url:
        # Nothing to point at, so there is no suffix at all.
        return ""

    captured = entity.timestamp
    prefix = captured.strftime("%Y%m%d%H%M%S/") if captured else "*/"
    return prefix + url