diff options
-rw-r--r-- | scrape/README.md | 8 | ||||
-rwxr-xr-x | scrape/parse_wanfang_html.py | 2 |
2 files changed, 9 insertions, 1 deletions
diff --git a/scrape/README.md b/scrape/README.md
index bf31fdb..97bb6fe 100644
--- a/scrape/README.md
+++ b/scrape/README.md
@@ -34,3 +34,11 @@ Maybe need to set a referer, something like that?
     ./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json
     ./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json
 
+Download PDFs (without clobbering existing):
+
+    cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {}
+
+    file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c
+        144 HTML
+        609 PDF
+
diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py
index 1146528..85187f5 100755
--- a/scrape/parse_wanfang_html.py
+++ b/scrape/parse_wanfang_html.py
@@ -23,7 +23,7 @@ def parse_wanfang_html(wanfang_html):
         tag_div = paper_li.find('div', **{'class': 'tag'})
         paper = dict(
             is_first_issue=is_first_issue,
-            info_url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
+            url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
             wanfang_id=title_a['href'].split('/')[-1],
             title=title_a.get_text().strip(),
             journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(),