aboutsummaryrefslogtreecommitdiffstats
path: root/python/fatcat_web/hacks.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2019-07-14 18:37:58 -0700
committer Bryan Newbold <bnewbold@robocracy.org> 2019-07-14 18:37:58 -0700
commit b53ea00def23fcff27d0bffa8e94376a56538b97 (patch)
tree 3f3359b1368022f727245488ace5aea3f0de1312 /python/fatcat_web/hacks.py
parent e4e71da47cc9567e906565a28393d371905b6464 (diff)
download fatcat-b53ea00def23fcff27d0bffa8e94376a56538b97.tar.gz
fatcat-b53ea00def23fcff27d0bffa8e94376a56538b97.zip
function to generate correct PDF shard path from sha1
Diffstat (limited to 'python/fatcat_web/hacks.py')
-rw-r--r-- python/fatcat_web/hacks.py 32
1 files changed, 32 insertions, 0 deletions
diff --git a/python/fatcat_web/hacks.py b/python/fatcat_web/hacks.py
index 9e6f6ab5..2415342c 100644
--- a/python/fatcat_web/hacks.py
+++ b/python/fatcat_web/hacks.py
@@ -25,3 +25,35 @@ def wayback_suffix(entity):
ret = "*/"
ret += entity.original_url
return ret
+
def camp_sha1_path(sha1):
    """
    Returns the local path to a PDF, given the SHA-1 hex digest of the file.

    PDFs are split across three shards, selected by the first four hex
    characters of the digest:

        prefix <  '53db'            -> pdf1
        '53db' <= prefix < 'a948'   -> pdf2
        prefix >= 'a948'            -> pdf3

    The digest is lower-cased before use. The shard boundaries are compared
    as strings, and upper-case 'A'-'F' sort before lower-case 'a' in ASCII,
    so an upper-case digest would otherwise be assigned to the wrong shard.

    Raises ValueError if sha1 is not exactly 40 characters long.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(sha1) != 40:
        raise ValueError(
            "expected a 40 character SHA-1 hex digest, got {} characters".format(
                len(sha1)))
    sha1 = sha1.lower()

    # Equal-length lower-case hex strings sort lexicographically in the same
    # order as their numeric values, so one 4-character comparison replaces
    # the two-level (first byte, then second byte) comparison.
    prefix = sha1[0:4]
    if prefix >= 'a948':
        shard = 'pdf3'
    elif prefix < '53db':
        shard = 'pdf1'
    else:
        shard = 'pdf2'
    return '/{}/pdf/{}/{}/{}.pdf'.format(
        shard, sha1[0:2], sha1[2:4], sha1)
+
def test_camp_sha1_path():
    """
    Checks camp_sha1_path() against known digests: the lowest and highest
    shards, plus one digest on each side of both shard boundaries.
    """
    cases = [
        # (sha1 hex digest, expected path)
        ('00000088bbc15a03ab89d8da6c356bf25aea9519',
         '/pdf1/pdf/00/00/00000088bbc15a03ab89d8da6c356bf25aea9519.pdf'),
        ('53da34b7df640a5065e347ea99e40302154225d5',
         '/pdf1/pdf/53/da/53da34b7df640a5065e347ea99e40302154225d5.pdf'),
        ('53db34b7df640a5065e347ea99e40302154225d5',
         '/pdf2/pdf/53/db/53db34b7df640a5065e347ea99e40302154225d5.pdf'),
        ('a947d6e2e55dbe649cfde780991f3989e235843d',
         '/pdf2/pdf/a9/47/a947d6e2e55dbe649cfde780991f3989e235843d.pdf'),
        ('a948d6e2e55dbe649cfde780991f3989e235843d',
         '/pdf3/pdf/a9/48/a948d6e2e55dbe649cfde780991f3989e235843d.pdf'),
        ('ff48d6e2e55dbe649cfde780991f3989e235843d',
         '/pdf3/pdf/ff/48/ff48d6e2e55dbe649cfde780991f3989e235843d.pdf'),
    ]
    for digest, expected_path in cases:
        assert camp_sha1_path(digest) == expected_path
+
def get_camp_pdf_path(release):
    """
    Assumes the release has been expanded (includes file entities).

    Returns a full local URL to a PDF of the file if one should be available;
    otherwise returns None.

    A PDF is considered available for the first file that has both a SHA-1
    and at least one URL pointing at web.archive.org. Missing values (no file
    list, no URL list, a URL entry without a url, or a file without a sha1)
    are skipped rather than raising.
    """
    for f in release.files or []:
        if not f.sha1:
            # The shard path is derived from the SHA-1; without one there is
            # nothing to link to for this file.
            continue
        for u in f.urls or []:
            if u.url and '://web.archive.org/' in u.url:
                return camp_sha1_path(f.sha1)
    return None