aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:09:17 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:09:19 -0800
commit 2536282cc635ce605c0bfd63cf7a9c0e10ef883c (patch)
tree be0729976452143065c8b35f57931f644fb9f9cf
parent 1dbc2613d406f3f94bc0ea29621bc81eacc7cea3 (diff)
download sandcrawler-2536282cc635ce605c0bfd63cf7a9c0e10ef883c.tar.gz
sandcrawler-2536282cc635ce605c0bfd63cf7a9c0e10ef883c.zip
SPNv2 hack specifically for Elsevier lookups
I'm not really sure why this is needed, and maybe with more careful testing it isn't. But it works!
-rw-r--r-- python/sandcrawler/ia.py 15
1 file changed, 15 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index da4d2b7..73f8484 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -702,6 +702,21 @@ class SavePageNowClient:
body=None,
cdx=None,
)
+ #print(spn_result, file=sys.stderr)
+
+ cdx_row = None
+ # hack to work around elsevier weirdness
+ if "://pdf.sciencedirectassets.com/" in spn_result.request_url:
+ elsevier_pdf_cdx = wayback_client.cdx_client.lookup_best(
+ spn_result.request_url,
+ best_mimetype="application/pdf",
+ )
+ if elsevier_pdf_cdx and elsevier_pdf_cdx.mimetype == "application/pdf":
+ print("Trying pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ cdx_row = elsevier_pdf_cdx
+ else:
+ print("Failed pdf.sciencedirectassets.com hack!", file=sys.stderr)
+ #print(elsevier_pdf_cdx, file=sys.stderr)
# fetch exact CDX row
cdx_row = wayback_client.cdx_client.fetch(