blob: 2415342cadbe02a5a6bac0959219220283e46167 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
import re
# Matches one opening <ext-link ...> tag that ends with xlink:type="simple".
# "[^>]*" keeps the match inside a single tag. A greedy ".*" would instead run
# from the first "<ext-link" to the last matching tag on the line and delete
# all of the text between them when a string contains several links.
STRIP_EXTLINK_XML_RE = re.compile(r"<ext-link[^>]*xlink:type=\"simple\">")


def strip_extlink_xml(unstr):
    """
    Removes <ext-link> markup from a string, keeping the text inside the links.

    Closing </ext-link> tags are dropped, as are opening <ext-link ...> tags
    that end with xlink:type="simple". Everything else is returned unchanged,
    including any text that sits between two separate links.
    """
    unstr = unstr.replace("</ext-link>", "")
    unstr = STRIP_EXTLINK_XML_RE.sub("", unstr)
    return unstr
def test_strip_extlink_xml():
    """Checks that plain text passes through and <ext-link> markup is removed."""
    # No markup at all: the input comes back untouched.
    plain = "asdf"
    assert strip_extlink_xml(plain) == plain

    # One link: both tags disappear, the link text stays in place.
    marked_up = """LOCKSS (2014) Available: <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:href="http://lockss.org/" xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014 November 1."""
    expected = """LOCKSS (2014) Available: http://lockss.org/. Accessed: 2014 November 1."""
    assert strip_extlink_xml(marked_up) == expected
def wayback_suffix(entity):
    """
    Builds the suffix to append to wayback URLs for a webcapture entity.

    The suffix is the entity's timestamp formatted as YYYYmmddHHMMSS (or "*"
    when the entity has no timestamp), then a slash, then the original URL.
    An entity without an original URL yields an empty string.
    """
    if not entity.original_url:
        return ""
    if entity.timestamp:
        prefix = entity.timestamp.strftime("%Y%m%d%H%M%S/")
    else:
        # No capture time known: "*" stands in for the timestamp.
        prefix = "*/"
    return prefix + entity.original_url
def camp_sha1_path(sha1):
    """
    Maps a 40-character hex SHA-1 digest to its sharded PDF path.

    The result has the form /<shard>/pdf/<aa>/<bb>/<sha1>.pdf, where <aa> and
    <bb> are the first and second pairs of hex digits. The shard is chosen
    from the first four hex digits:

        below 53db        -> pdf1
        53db through a947 -> pdf2
        a948 and above    -> pdf3

    The digest is lowercased before use. The shard boundaries are compared as
    lowercase strings, so an uppercase digest would otherwise be sorted into
    the wrong shard.

    Raises AssertionError if the digest is not 40 characters long.
    """
    assert len(sha1) == 40
    sha1 = sha1.lower()
    # For fixed-width lowercase hex, string order of the 4-character prefix
    # equals numeric order, so one comparison per boundary is enough.
    prefix = sha1[0:4]
    if prefix >= 'a948':
        shard = 'pdf3'
    elif prefix < '53db':
        shard = 'pdf1'
    else:
        shard = 'pdf2'
    return '/{}/pdf/{}/{}/{}.pdf'.format(
        shard, sha1[0:2], sha1[2:4], sha1)
def test_camp_sha1_path():
    """Checks shard selection at the extremes and on both sides of each boundary."""
    cases = [
        # lowest prefix -> pdf1
        ('00000088bbc15a03ab89d8da6c356bf25aea9519', '/pdf1/pdf/00/00/00000088bbc15a03ab89d8da6c356bf25aea9519.pdf'),
        # either side of the pdf1/pdf2 boundary (53db)
        ('53da34b7df640a5065e347ea99e40302154225d5', '/pdf1/pdf/53/da/53da34b7df640a5065e347ea99e40302154225d5.pdf'),
        ('53db34b7df640a5065e347ea99e40302154225d5', '/pdf2/pdf/53/db/53db34b7df640a5065e347ea99e40302154225d5.pdf'),
        # either side of the pdf2/pdf3 boundary (a948)
        ('a947d6e2e55dbe649cfde780991f3989e235843d', '/pdf2/pdf/a9/47/a947d6e2e55dbe649cfde780991f3989e235843d.pdf'),
        ('a948d6e2e55dbe649cfde780991f3989e235843d', '/pdf3/pdf/a9/48/a948d6e2e55dbe649cfde780991f3989e235843d.pdf'),
        # highest first byte -> pdf3
        ('ff48d6e2e55dbe649cfde780991f3989e235843d', '/pdf3/pdf/ff/48/ff48d6e2e55dbe649cfde780991f3989e235843d.pdf'),
    ]
    for digest, expected_path in cases:
        assert camp_sha1_path(digest) == expected_path
def get_camp_pdf_path(release):
    """
    Finds the PDF path for the first file of a release that has a
    web.archive.org URL.

    Assumes the release has been expanded (includes file entities). Files are
    checked in order; for the first one with a URL containing
    '://web.archive.org/', the path derived from that file's SHA-1 is
    returned. Returns None when no file has such a URL.
    """
    wayback_marker = '://web.archive.org/'
    for file_entity in release.files:
        # any() stops at the first matching URL, like the nested loop it replaces.
        if any(wayback_marker in link.url for link in file_entity.urls):
            return camp_sha1_path(file_entity.sha1)
    return None
|