From 2536282cc635ce605c0bfd63cf7a9c0e10ef883c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 10 Jan 2020 16:09:17 -0800
Subject: SPNv2 hack specifically for elsevier lookups

I'm not really sure why this is needed, and maybe with more careful testing it isn't. But it works!
---
 python/sandcrawler/ia.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index da4d2b7..73f8484 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -702,6 +702,21 @@ class SavePageNowClient:
                 body=None,
                 cdx=None,
             )
+        #print(spn_result, file=sys.stderr)
+
+        cdx_row = None
+        # hack to work around elsevier weirdness
+        if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
+            elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
+                spn_result.request_url,
+                best_mimetype="application/pdf",
+            )
+            if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
+                print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
+                cdx_row = elsevier_pdf_cdx
+            else:
+                print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
+                #print(elsevier_pdf_cdx, file=sys.stderr)
 
         # fetch exact CDX row
         cdx_row = wayback_client.cdx_client.fetch(
-- 
cgit v1.2.3