diff options
Diffstat (limited to 'python/fatcat_web/hacks.py')
-rw-r--r-- | python/fatcat_web/hacks.py | 27 |
1 files changed, 27 insertions, 0 deletions
import re

# Matches one opening ``<ext-link ...>`` tag that ends with
# ``xlink:type="simple"``. ``[^>]*`` (rather than a greedy ``.*``) keeps each
# match inside a single tag, so text between two separate links on the same
# line is preserved. It also matches tags whose attributes wrap across lines.
STRIP_EXTLINK_XML_RE = re.compile(r"<ext-link[^>]*xlink:type=\"simple\">")


def strip_extlink_xml(unstr):
    """Remove ``<ext-link>`` XML markup from a string, keeping the link text.

    Every ``</ext-link>`` closing tag is dropped, and every opening tag
    matched by ``STRIP_EXTLINK_XML_RE`` is dropped; everything else
    (including the text that was wrapped by the tags) is returned unchanged.

    Args:
        unstr: the string to clean up.

    Returns:
        The string with the ext-link tags removed.
    """
    unstr = unstr.replace("</ext-link>", "")
    unstr = STRIP_EXTLINK_XML_RE.sub("", unstr)
    return unstr


def test_strip_extlink_xml():
    assert strip_extlink_xml("asdf") == "asdf"
    assert strip_extlink_xml("""LOCKSS (2014) Available: <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="http://lockss.org/" xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014 November 1.""") == \
        """LOCKSS (2014) Available: http://lockss.org/. Accessed: 2014 November 1."""
    # Two links in one string: the text between them must survive.
    assert strip_extlink_xml(
        'A <ext-link xlink:href="http://a/" xlink:type="simple">http://a/</ext-link>'
        ' and B <ext-link xlink:href="http://b/" xlink:type="simple">http://b/</ext-link>.'
    ) == "A http://a/ and B http://b/."


def wayback_suffix(entity):
    """
    Takes a webcapture entity and returns a suffix to be appended to wayback URLs

    The suffix is ``<timestamp>/<original_url>``, where the timestamp is
    ``entity.timestamp`` formatted as ``%Y%m%d%H%M%S``. When the entity has no
    timestamp, ``*`` is used in its place. An entity without an
    ``original_url`` yields an empty string.
    """
    if not entity.original_url:
        return ""
    if entity.timestamp:
        prefix = entity.timestamp.strftime("%Y%m%d%H%M%S/")
    else:
        prefix = "*/"
    return prefix + entity.original_url