From 0a2c5e5c71d920cd2e7634040561a044d9e40d58 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 30 Mar 2020 09:49:04 -0700 Subject: update wanfang scrape --- scrape/README.md | 8 ++++++++ scrape/parse_wanfang_html.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/scrape/README.md b/scrape/README.md index bf31fdb..97bb6fe 100644 --- a/scrape/README.md +++ b/scrape/README.md @@ -34,3 +34,11 @@ Maybe need to set a referer, something like that? ./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json ./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json +Download PDFs (without clobbering existing files): + + cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {} + + file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c + 144 HTML + 609 PDF + diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py index 1146528..85187f5 100755 --- a/scrape/parse_wanfang_html.py +++ b/scrape/parse_wanfang_html.py @@ -23,7 +23,7 @@ def parse_wanfang_html(wanfang_html): tag_div = paper_li.find('div', **{'class': 'tag'}) paper = dict( is_first_issue=is_first_issue, - info_url="http://subject.med.wanfangdata.com.cn" + title_a['href'], + url="http://subject.med.wanfangdata.com.cn" + title_a['href'], wanfang_id=title_a['href'].split('/')[-1], title=title_a.get_text().strip(), journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(), -- cgit v1.2.3