diff options
Diffstat (limited to 'extra')
-rw-r--r-- | extra/scrape/.gitignore | 4 | ||||
-rw-r--r-- | extra/scrape/README.md | 44 | ||||
-rwxr-xr-x | extra/scrape/parse_cnki_tables.py | 52 | ||||
-rwxr-xr-x | extra/scrape/parse_wanfang_html.py | 43 |
4 files changed, 143 insertions, 0 deletions
diff --git a/extra/scrape/.gitignore b/extra/scrape/.gitignore new file mode 100644 index 0000000..b2bc71b --- /dev/null +++ b/extra/scrape/.gitignore @@ -0,0 +1,4 @@ +fulltext_wanfang/ +fulltext_ckni/ +*.html +*.json diff --git a/extra/scrape/README.md b/extra/scrape/README.md new file mode 100644 index 0000000..97bb6fe --- /dev/null +++ b/extra/scrape/README.md @@ -0,0 +1,44 @@ + + +## CNKI List + +Base URL: <http://en.gzbd.cnki.net/GZBT/brief/Default.aspx> + +2020-03-29: "Found 1914 articles" + +Uses JS to fetch tables, URLs look like: + + http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag&contentID=0&orderStr=1&page=1&grouptype=undefined&groupvalue=undefined + +Fetch a bunch: + + seq 0 64 | parallel http get "http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag\&contentID=0\&orderStr=1\&page={}\&grouptype=undefined\&groupvalue=undefined" > cnki_tables.html + +Parse HTML snippets to JSON: + + ./parse_cnki_tables.py > cnki_metadata.json + +The `info_url` seems to work, but the direct PDF download links don't work naively. +Maybe need to set a referer, or something like that? 
#!/usr/bin/env python3
"""Parse concatenated CNKI result-table HTML snippets into JSON lines."""

import json
import sys

from bs4 import BeautifulSoup

# Host that the relative hrefs found in the result tables are joined onto.
CNKI_BASE_URL = "http://en.gzbd.cnki.net"


def _cell_text(row, css_class):
    """Return the stripped, single-line text of the ``td`` with *css_class* in *row*."""
    return row.find('td', class_=css_class).get_text().strip().replace('\n', ' ')


def _link_url(cell, css_class):
    """Return the absolute URL of the ``a`` with *css_class* in *cell*, or None if absent."""
    link = cell.find('a', class_=css_class)
    if link is None:
        return None
    return CNKI_BASE_URL + link['href']


def _cnki_id(info_url):
    """Return the value of the ``FileName`` query parameter of *info_url*, or None."""
    # partition() never raises, unlike split('?')[1] on a URL without a query.
    _, _, query = info_url.partition('?')
    for param in query.split('&'):
        if param.startswith("FileName="):
            # Strip only the leading key; the value itself is left untouched.
            return param[len("FileName="):]
    return None


def parse_cnki_tables(cnki_html):
    """Extract paper metadata from CNKI listing tables.

    Args:
        cnki_html: HTML text containing one or more ``<table>`` snippets.

    Returns:
        A list of dicts, one per ``<tr>`` inside a table's ``<tbody>``. Each
        dict always has ``seq`` (int), ``date``, ``title``, ``authors`` (list
        of str), ``info_url`` and ``is_cn`` (bool). ``pdf_url``, ``cnki_id``
        and one of ``html_url`` / ``read_url`` are present only when the
        corresponding link or query parameter exists in the row.

    Raises:
        AttributeError: if a row lacks one of the expected cells
            (``seq``, ``author``, ``date``, ``name``, ``operat``).
        ValueError: if the ``seq`` cell is not an integer.
    """
    soup = BeautifulSoup(cnki_html, "lxml")

    papers = []
    for table in soup.find_all('table'):
        # Tables without a <tbody> carry no result rows in the expected
        # layout; skip them instead of failing on ``None.find_all``.
        if table.tbody is None:
            continue
        for row in table.tbody.find_all('tr'):
            name_td = row.find('td', class_='name')
            operat_td = row.find('td', class_='operat')
            mark = row.find('span', class_='markOricon')

            paper = {
                'seq': int(_cell_text(row, 'seq')),
                'date': _cell_text(row, 'date'),
                'title': name_td.a.get_text().strip().replace('\n', ' '),
                'authors': [a for a in _cell_text(row, 'author').split(';') if a],
                'info_url': CNKI_BASE_URL + name_td.a['href'],
                'is_cn': mark is not None and mark.get_text() == 'CN',
            }

            # Every action link is optional; a missing one just omits its key.
            pdf_url = _link_url(operat_td, 'icon-download')
            if pdf_url is not None:
                paper['pdf_url'] = pdf_url

            # The "read" link is only recorded when there is no HTML link.
            html_url = _link_url(operat_td, 'icon-html')
            if html_url is not None:
                paper['html_url'] = html_url
            else:
                read_url = _link_url(operat_td, 'icon-read')
                if read_url is not None:
                    paper['read_url'] = read_url

            cnki_id = _cnki_id(paper['info_url'])
            if cnki_id is not None:
                paper['cnki_id'] = cnki_id

            papers.append(paper)
    return papers


if __name__ == "__main__":
    # An optional first argument overrides the default input file name.
    path = sys.argv[1] if len(sys.argv) > 1 else "cnki_tables.html"
    # Decode explicitly so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        res = parse_cnki_tables(f.read())
    for paper in res:
        print(json.dumps(paper, sort_keys=True))
#!/usr/bin/env python3
"""Parse a Wanfang Data channel listing page into JSON lines."""

import json
import sys

from bs4 import BeautifulSoup

# Host that the relative paper hrefs are joined onto.
WANFANG_BASE_URL = "http://subject.med.wanfangdata.com.cn"


def parse_wanfang_html(wanfang_html, date_prefix='2020'):
    """Extract paper metadata from a Wanfang channel listing page.

    Only ``<li>`` elements whose ``mark`` attribute is ``"32"`` or ``"34"``
    and that contain at least one ``<div>`` are treated as papers.

    Args:
        wanfang_html: HTML text of the listing page.
        date_prefix: Sanity check on the scraped date; every paper's date
            must start with this string. Pass None to disable the check.

    Returns:
        A list of dicts with keys ``is_first_issue`` (bool), ``url``,
        ``wanfang_id``, ``title``, ``journal``, ``date``, ``abstract`` and
        ``tag`` (None when the tag div or its ``text`` attribute is missing
        or empty).

    Raises:
        AssertionError: if a paper's date does not start with *date_prefix*.
            Raised explicitly so the check still runs under ``python -O``.
        AttributeError: if a paper item lacks one of the other expected
            elements (title link, subtitle spans, summary div).
    """
    soup = BeautifulSoup(wanfang_html, "lxml")

    papers = []
    for paper_li in soup.find_all('li'):
        if paper_li.get('mark') not in ("32", "34"):
            continue
        title_div = paper_li.div
        if title_div is None:
            continue

        # The title link sits inside a custom <text> element.
        title_a = title_div.find('text').a
        subtitle_div = title_div.find('div', class_='subtitle')
        summary_div = paper_li.find('div', class_='summary')
        tag_div = paper_li.find('div', class_='tag')
        href = title_a['href']

        origin_span = subtitle_div.find('span', class_='origin')
        # class_=None is meant to pick the span that carries no class
        # attribute, which holds the date (selector kept from the original).
        date_span = subtitle_div.find('span', class_=None)
        date = date_span.get_text().replace('时间:', '').strip()

        if date_prefix is not None and not date.startswith(date_prefix):
            raise AssertionError(
                "unexpected date {!r} for {} (expected prefix {!r})".format(
                    date, href, date_prefix))

        # The tag is optional: tolerate a missing div or attribute.
        tag = tag_div.get('text') if tag_div is not None else None

        papers.append(dict(
            is_first_issue=title_div.find('img') is not None,
            url=WANFANG_BASE_URL + href,
            wanfang_id=href.split('/')[-1],
            title=title_a.get_text().strip(),
            journal=origin_span.get_text().replace('来源:', '').strip(),
            date=date,
            abstract=summary_div.get_text().strip(),
            tag=tag or None,
        ))
    return papers


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: parse_wanfang_html.py WANFANG_HTML_FILE")
    # The parser matches Chinese literals, so decode explicitly as UTF-8
    # rather than relying on the platform locale.
    with open(sys.argv[1], "r", encoding="utf-8") as f:
        res = parse_wanfang_html(f.read())
    for paper in res:
        print(json.dumps(paper, sort_keys=True))