about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-22 16:21:58 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-22 16:21:58 -0800
commit: a2a652cefdfa54c7d6bf16dfcf8b1e2e45fb8947 (patch)
tree: a369ef48c29156c8a204e10824091ae01a48a4b7 /python
parent: 24ef8310c106ea020a34a6cb48e2ccca4b2c3c18 (diff)
download: sandcrawler-a2a652cefdfa54c7d6bf16dfcf8b1e2e45fb8947.tar.gz
download: sandcrawler-a2a652cefdfa54c7d6bf16dfcf8b1e2e45fb8947.zip
html: degruyter extraction; disabled journals.lww.com
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html.py 19
1 files changed, 19 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 1893898..2b173b0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -239,4 +239,23 @@ def extract_fulltext_url(html_url, html_body):
url = html_url + ".pdf"
return dict(pdf_url=url, technique='protocolsio-url')
+ # degruyter.com
+ # https://www.degruyter.com/view/books/9783486594621/9783486594621-009/9783486594621-009.xml
+ if "://www.degruyter.com/view/" in html_url and html_url.endswith(".xml"):
+ url = html_url.replace('/view/', '/downloadpdf/').replace('.xml', '.pdf')
+ return dict(pdf_url=url, technique='degruyter-url')
+
+ # journals.lww.com (Wolters Kluwer)
+ # https://journals.lww.com/spinejournal/Abstract/publishahead/Making_the_Most_of_Systematic_Reviews_and.94318.aspx
+ # DISABLED: they seem to redirect our crawler back to a "Fulltext" page and
+ # we never get the content.
+ if "://journals.lww.com/" in html_url and False:
+ # data-pdf-url="https://pdfs.journals.lww.com/spinejournal/9000/00000/Making_the_Most_of_Systematic_Reviews_and.94318.pdf?token=method|ExpireAbsolute;source|Journals;ttl|1582413672903;payload|mY8D3u1TCCsNvP5E421JYK6N6XICDamxByyYpaNzk7FKjTaa1Yz22MivkHZqjGP4kdS2v0J76WGAnHACH69s21Csk0OpQi3YbjEMdSoz2UhVybFqQxA7lKwSUlA502zQZr96TQRwhVlocEp/sJ586aVbcBFlltKNKo+tbuMfL73hiPqJliudqs17cHeLcLbV/CqjlP3IO0jGHlHQtJWcICDdAyGJMnpi6RlbEJaRheGeh5z5uvqz3FLHgPKVXJzdiVgCTnUeUQFYzcJRFhNtc2gv+ECZGji7HUicj1/6h85Y07DBRl1x2MGqlHWXUawD;hash|6cqYBa15ZK407m4VhFfJLw=="
+ for line in html_body.split(b'\n'):
+ if b"data-pdf-url=" in line:
+ line = line.decode('utf-8')
+ url = line.strip().replace('data-pdf-url=', '').replace('"', '')
+ if url.startswith('http') and 'pdfs.journals.lww.com' in url:
+ return dict(pdf_url=url, technique='journals.lww.com-jsvar')
+
return dict()