diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-14 18:37:58 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-07-14 18:37:58 -0700 |
commit | b53ea00def23fcff27d0bffa8e94376a56538b97 (patch) | |
tree | 3f3359b1368022f727245488ace5aea3f0de1312 /python/fatcat_web/hacks.py | |
parent | e4e71da47cc9567e906565a28393d371905b6464 (diff) | |
download | fatcat-b53ea00def23fcff27d0bffa8e94376a56538b97.tar.gz fatcat-b53ea00def23fcff27d0bffa8e94376a56538b97.zip |
function to generate correct PDF shard path from sha1
Diffstat (limited to 'python/fatcat_web/hacks.py')
-rw-r--r-- | python/fatcat_web/hacks.py | 32 |
1 files changed, 32 insertions, 0 deletions
# Helpers added to python/fatcat_web/hacks.py, after wayback_suffix().


def camp_sha1_path(sha1):
    """
    Builds the sharded PDF path for a file from its SHA-1 hex digest.

    The shard is picked by comparing the first four characters of the digest
    against two boundaries:

        prefix <  '53db'            -> pdf1
        '53db' <= prefix < 'a948'   -> pdf2
        prefix >= 'a948'            -> pdf3

    The comparison is a plain string comparison against lowercase
    boundaries, so `sha1` is expected to be a lowercase hex string.

    Returns a path of the form '/<shard>/pdf/<aa>/<bb>/<sha1>.pdf', where
    <aa> and <bb> are the first and second pairs of characters of the digest.

    Raises AssertionError if `sha1` is not exactly 40 characters long.
    """
    assert len(sha1) == 40
    # Both halves of the prefix have a fixed width of two characters, so one
    # lexicographic comparison of the 4-character prefix is equivalent to
    # comparing the first pair and then the second pair.
    prefix = sha1[0:4]
    if prefix >= 'a948':
        shard = 'pdf3'
    elif prefix < '53db':
        shard = 'pdf1'
    else:
        shard = 'pdf2'
    return '/{}/pdf/{}/{}/{}.pdf'.format(
        shard, sha1[0:2], sha1[2:4], sha1)


def test_camp_sha1_path():
    # Exercises both sides of each shard boundary, plus the extremes.
    assert camp_sha1_path('00000088bbc15a03ab89d8da6c356bf25aea9519') == '/pdf1/pdf/00/00/00000088bbc15a03ab89d8da6c356bf25aea9519.pdf'
    assert camp_sha1_path('53da34b7df640a5065e347ea99e40302154225d5') == '/pdf1/pdf/53/da/53da34b7df640a5065e347ea99e40302154225d5.pdf'
    assert camp_sha1_path('53db34b7df640a5065e347ea99e40302154225d5') == '/pdf2/pdf/53/db/53db34b7df640a5065e347ea99e40302154225d5.pdf'
    assert camp_sha1_path('a947d6e2e55dbe649cfde780991f3989e235843d') == '/pdf2/pdf/a9/47/a947d6e2e55dbe649cfde780991f3989e235843d.pdf'
    assert camp_sha1_path('a948d6e2e55dbe649cfde780991f3989e235843d') == '/pdf3/pdf/a9/48/a948d6e2e55dbe649cfde780991f3989e235843d.pdf'
    assert camp_sha1_path('ff48d6e2e55dbe649cfde780991f3989e235843d') == '/pdf3/pdf/ff/48/ff48d6e2e55dbe649cfde780991f3989e235843d.pdf'


def get_camp_pdf_path(release):
    """
    Assumes the release has been expanded (includes file entities).

    Returns the sharded PDF path (see camp_sha1_path) for the first file
    that has both a SHA-1 and at least one web.archive.org URL; otherwise
    returns None.

    A missing (None) or empty `release.files` or `f.urls`, a URL entry
    without a `url`, and a file without a `sha1` are all treated as "no PDF
    available here" rather than raising.
    """
    for f in release.files or []:
        if not f.sha1:
            # Without a digest no path can be computed; try the next file.
            continue
        for u in f.urls or []:
            if u.url and '://web.archive.org/' in u.url:
                return camp_sha1_path(f.sha1)
    return None