diff options
Diffstat (limited to 'scrape/parse_wanfang_html.py')
| -rwxr-xr-x | scrape/parse_wanfang_html.py | 43 |
1 files changed, 0 insertions, 43 deletions
diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py deleted file mode 100755 index 85187f5..0000000 --- a/scrape/parse_wanfang_html.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json -from bs4 import BeautifulSoup - -def parse_wanfang_html(wanfang_html): - soup = BeautifulSoup(wanfang_html, "lxml") - - papers = [] - papers_ul = soup.find('ul', **{'class': 'item_detail_list'}) - for paper_li in soup.find_all('li'): - if paper_li.get('mark') not in ("32", "34"): - continue - if not paper_li.find('div'): - continue - #print(paper_li) - title_div = paper_li.div - title_a = title_div.find('text').a - is_first_issue = bool(title_div.find('img')) - subtitle_div = title_div.find('div', **{'class': 'subtitle'}) - summary_div = paper_li.find('div', **{'class': 'summary'}) - tag_div = paper_li.find('div', **{'class': 'tag'}) - paper = dict( - is_first_issue=is_first_issue, - url="http://subject.med.wanfangdata.com.cn" + title_a['href'], - wanfang_id=title_a['href'].split('/')[-1], - title=title_a.get_text().strip(), - journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(), - date=subtitle_div.find('span', **{'class': None}).get_text().replace('时间:', '').strip(), - #button_text=title_div.button.get_text().strip(), - abstract=summary_div.get_text().strip(), - tag=tag_div['text'] or None, - ) - assert paper['date'].startswith('2020') - papers.append(paper) - return papers - -if __name__ == "__main__": - with open(sys.argv[1], "r") as f: - res = parse_wanfang_html(f.read()) - for paper in res: - print(json.dumps(paper, sort_keys=True)) |
