diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-13 19:58:36 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-13 19:58:36 -0700 |
commit | ff36a4372e3f24efa531bfe6156e4ee08d458e08 (patch) | |
tree | 373b9909c50f22e498df9a2761045aa6085bbb42 | |
parent | dc0841329257f037260b225b66ef80a73fbebea7 (diff) | |
download | sandcrawler-ff36a4372e3f24efa531bfe6156e4ee08d458e08.tar.gz sandcrawler-ff36a4372e3f24efa531bfe6156e4ee08d458e08.zip |
html: attempt at CNKI href extraction
-rw-r--r-- | python/sandcrawler/html.py | 11 |
1 file changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 04b3afe..b924a17 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -321,4 +321,15 @@ def extract_fulltext_url(html_url, html_body): url = host_prefix + url return dict(pdf_url=url, technique='eurosurveillance-href') + # CNKI COVID-19 landing pages + # http://en.gzbd.cnki.net/gzbt/detail/detail.aspx?FileName=HBGF202002003&DbName=GZBJ7920&DbCode=GZBJ + if '://en.gzbd.cnki.net/KCMS/detail/detail.aspx' in html_url: + # <a onclick="WriteKrsDownLog()" target="_blank" id="pdfDown" name="pdfDown" href="/gzbt/download.aspx?filename=4Q1ZYpFdKFUZ6FDR1QkRrolayRXV2ZzattyQ3QFa2JXTyZXUSV3QRFkbndzaGV2KyJXWZVEbFdVYnZndD9EOxg1Tj5Eeys2SMFzLZ5kcuFkM3dEbsR2ZjxEaShVdJhFdp90KhlVVzcjVVlXUVNHWBtWS5Rlb5cnc&tablename=GZBJLAST2020&dflag=pdfdown
 "><i></i>PDF Download</a> + href = soup.find('a', attrs={"id":"pdfDown"}) + if href: + url = href['href'].strip().replace('
', '') + if not url.startswith('http'): + url = host_prefix + url + return dict(pdf_url=url, technique='cnki-href') + return dict() |