about summary refs log tree commit diff stats
path: root/scrape
diff options
context:
space:
mode:
Diffstat (limited to 'scrape')
-rw-r--r-- scrape/README.md 8
-rwxr-xr-x scrape/parse_wanfang_html.py 2
2 files changed, 9 insertions, 1 deletions
diff --git a/scrape/README.md b/scrape/README.md
index bf31fdb..97bb6fe 100644
--- a/scrape/README.md
+++ b/scrape/README.md
@@ -34,3 +34,11 @@ Maybe need to set a referer, something like that?
./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json
./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json
+Download PDFs (without clobbering existing):
+
+ cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {}
+
+ file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c
+ 144 HTML
+ 609 PDF
+
diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py
index 1146528..85187f5 100755
--- a/scrape/parse_wanfang_html.py
+++ b/scrape/parse_wanfang_html.py
@@ -23,7 +23,7 @@ def parse_wanfang_html(wanfang_html):
tag_div = paper_li.find('div', **{'class': 'tag'})
paper = dict(
is_first_issue=is_first_issue,
- info_url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
+ url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
wanfang_id=title_a['href'].split('/')[-1],
title=title_a.get_text().strip(),
journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(),