diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-30 09:49:04 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-30 09:49:04 -0700 |
commit | 0a2c5e5c71d920cd2e7634040561a044d9e40d58 (patch) | |
tree | 93d5d4be52b54dabcf28384b33ff6705fdc1323c | |
parent | 0cf608debcd672f9a3c54cb8d4ac1caf686ce2e3 (diff) | |
download | fatcat-covid19-0a2c5e5c71d920cd2e7634040561a044d9e40d58.tar.gz fatcat-covid19-0a2c5e5c71d920cd2e7634040561a044d9e40d58.zip |
update wanfang scrape
-rw-r--r-- | scrape/README.md | 8 | ||||
-rwxr-xr-x | scrape/parse_wanfang_html.py | 2 |
2 files changed, 9 insertions, 1 deletion
diff --git a/scrape/README.md b/scrape/README.md
index bf31fdb..97bb6fe 100644
--- a/scrape/README.md
+++ b/scrape/README.md
@@ -34,3 +34,11 @@ Maybe need to set a referer, something like that?
     ./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json
     ./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json
 
+Download PDFs (without clobbering existing):
+
+    cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {}
+
+    file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c
+        144 HTML
+        609 PDF
+
diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py
index 1146528..85187f5 100755
--- a/scrape/parse_wanfang_html.py
+++ b/scrape/parse_wanfang_html.py
@@ -23,7 +23,7 @@ def parse_wanfang_html(wanfang_html):
         tag_div = paper_li.find('div', **{'class': 'tag'})
         paper = dict(
             is_first_issue=is_first_issue,
-            info_url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
+            url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
             wanfang_id=title_a['href'].split('/')[-1],
             title=title_a.get_text().strip(),
             journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(),