aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-13 19:58:36 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-13 19:58:36 -0700
commit ff36a4372e3f24efa531bfe6156e4ee08d458e08 (patch)
tree 373b9909c50f22e498df9a2761045aa6085bbb42
parent dc0841329257f037260b225b66ef80a73fbebea7 (diff)
download sandcrawler-ff36a4372e3f24efa531bfe6156e4ee08d458e08.tar.gz
sandcrawler-ff36a4372e3f24efa531bfe6156e4ee08d458e08.zip
html: attempt at CNKI href extraction
-rw-r--r-- python/sandcrawler/html.py 11
1 files changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 04b3afe..b924a17 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -321,4 +321,15 @@ def extract_fulltext_url(html_url, html_body):
url = host_prefix + url
return dict(pdf_url=url, technique='eurosurveillance-href')
+ # CNKI COVID-19 landing pages
+ # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ
+ if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url:
+ # <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&amp;tablename=GZBJLAST2020&amp;dflag=pdfdown&#xA; "><i></i>PDF Download</a>
+ href = soup.find('a', attrs={"id":"pdfDown"})
+ if href:
+ url = href['href'].strip().replace('&#xA;', '')
+ if not url.startswith('http'):
+ url = host_prefix + url
+ return dict(pdf_url=url, technique='cnki-href')
+
return dict()