# Extracted from commit b53ea00def23fcff27d0bffa8e94376a56538b97
# (Bryan Newbold, Sun, 14 Jul 2019 18:37:58 -0700):
#   "function to generate correct PDF shard path from sha1"
# The definitions below are the lines that patch adds to
# python/fatcat_web/hacks.py (after wayback_suffix, which is not shown here).


def camp_sha1_path(sha1):
    """
    Builds the sharded PDF path for a SHA-1 hex digest.

    The shard is chosen by plain string comparison on the first four
    characters of the digest:

        prefix <  '53db'            -> pdf1
        '53db' <= prefix <  'a948'  -> pdf2
        prefix >= 'a948'            -> pdf3

    Returns '/<shard>/pdf/<chars 0-1>/<chars 2-3>/<sha1>.pdf'.

    Raises AssertionError if sha1 is not exactly 40 characters long.
    """
    # Explicit raise instead of a bare `assert` so the length check is not
    # stripped under `python -O`; the exception type stays the same.
    if len(sha1) != 40:
        raise AssertionError("expected a 40-character SHA-1 hex digest")
    # Comparing the 4-character prefix is equivalent to comparing the two
    # 2-character pieces one after the other, since both have fixed width.
    # NOTE(review): comparison is case-sensitive; presumably digests are
    # lowercase hex -- confirm against callers.
    prefix = sha1[0:4]
    if prefix >= 'a948':
        shard = 'pdf3'
    elif prefix < '53db':
        shard = 'pdf1'
    else:
        shard = 'pdf2'
    return '/{}/pdf/{}/{}/{}.pdf'.format(
        shard, sha1[0:2], sha1[2:4], sha1)


def test_camp_sha1_path():
    """
    Checks the first/last digests and both sides of each shard boundary.
    """
    assert camp_sha1_path('00000088bbc15a03ab89d8da6c356bf25aea9519') == '/pdf1/pdf/00/00/00000088bbc15a03ab89d8da6c356bf25aea9519.pdf'
    assert camp_sha1_path('53da34b7df640a5065e347ea99e40302154225d5') == '/pdf1/pdf/53/da/53da34b7df640a5065e347ea99e40302154225d5.pdf'
    assert camp_sha1_path('53db34b7df640a5065e347ea99e40302154225d5') == '/pdf2/pdf/53/db/53db34b7df640a5065e347ea99e40302154225d5.pdf'
    assert camp_sha1_path('a947d6e2e55dbe649cfde780991f3989e235843d') == '/pdf2/pdf/a9/47/a947d6e2e55dbe649cfde780991f3989e235843d.pdf'
    assert camp_sha1_path('a948d6e2e55dbe649cfde780991f3989e235843d') == '/pdf3/pdf/a9/48/a948d6e2e55dbe649cfde780991f3989e235843d.pdf'
    assert camp_sha1_path('ff48d6e2e55dbe649cfde780991f3989e235843d') == '/pdf3/pdf/ff/48/ff48d6e2e55dbe649cfde780991f3989e235843d.pdf'


def get_camp_pdf_path(release):
    """
    Assumes the release has been expanded (includes file entities).

    Returns the local PDF path (as built by camp_sha1_path) for the first
    file that has a sha1 and at least one URL containing
    '://web.archive.org/'; otherwise returns None.

    A release whose `files` is None or empty, and files whose `urls` or
    `sha1` are missing, are skipped rather than raising.
    """
    for f in (release.files or []):
        # Without a digest there is no path to build for this file.
        if not f.sha1:
            continue
        if any('://web.archive.org/' in u.url for u in (f.urls or [])):
            return camp_sha1_path(f.sha1)
    return None