diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 16:09:17 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 16:09:19 -0800 |
commit | 2536282cc635ce605c0bfd63cf7a9c0e10ef883c (patch) | |
tree | be0729976452143065c8b35f57931f644fb9f9cf /python | |
parent | 1dbc2613d406f3f94bc0ea29621bc81eacc7cea3 (diff) | |
download | sandcrawler-2536282cc635ce605c0bfd63cf7a9c0e10ef883c.tar.gz sandcrawler-2536282cc635ce605c0bfd63cf7a9c0e10ef883c.zip |
SPNv2 hack specifically for elsevier lookups
I'm not really sure why this is needed, and maybe with more careful
testing it isn't. But it works!
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 15 |
1 file changed, 15 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index da4d2b7..73f8484 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -702,6 +702,21 @@ class SavePageNowClient: body=None, cdx=None, ) + #print(spn_result, file=sys.stderr) + + cdx_row = None + # hack to work around elsevier weirdness + if "://pdf.sciencedirectassets.com/" in spn_result.request_url: + elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best( + spn_result.request_url, + best_mimetype="application/pdf", + ) + if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf": + print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr) + cdx_row = elsevier_pdf_cdx + else: + print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr) + #print(elsevier_pdf_cdx, file=sys.stderr) # fetch exact CDX row cdx_row = wayback_client.cdx_client.fetch( |