author    | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 15:16:17 -0700
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-03 15:16:17 -0700
commit    | fb767adb9472ff85b46b5a383f3986950b12dd27 (patch)
tree      | 724af4412353c627b0eae26fd4d7fd1164bf2b55 /scrape
parent    | 4cbbdf33ee2a9651f79f96e4bf290d8bc721f69d (diff)
download  | fatcat-covid19-fb767adb9472ff85b46b5a383f3986950b12dd27.tar.gz, fatcat-covid19-fb767adb9472ff85b46b5a383f3986950b12dd27.zip
move more directories around
Diffstat (limited to 'scrape')
-rw-r--r-- | scrape/.gitignore            |  4 |
-rw-r--r-- | scrape/README.md             | 44 |
-rwxr-xr-x | scrape/parse_cnki_tables.py  | 52 |
-rwxr-xr-x | scrape/parse_wanfang_html.py | 43 |
4 files changed, 0 insertions, 143 deletions
diff --git a/scrape/.gitignore b/scrape/.gitignore
deleted file mode 100644
index b2bc71b..0000000
--- a/scrape/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-fulltext_wanfang/
-fulltext_ckni/
-*.html
-*.json
diff --git a/scrape/README.md b/scrape/README.md
deleted file mode 100644
index 97bb6fe..0000000
--- a/scrape/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-
-
-## CNKI List
-
-Base URL: <http://en.gzbd.cnki.net/GZBT/brief/Default.aspx>
-
-2020-03-29: "Found 1914 articles"
-
-Uses JS to fetch tables; URLs look like:
-
-    http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag&contentID=0&orderStr=1&page=1&grouptype=undefined&groupvalue=undefined
-
-Fetch a bunch:
-
-    seq 0 64 | parallel http get "http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag\&contentID=0\&orderStr=1\&page={}\&grouptype=undefined\&groupvalue=undefined" > cnki_tables.html
-
-Parse HTML snippets to JSON:
-
-    ./parse_cnki_tables.py > cnki_metadata.json
-
-The `info_url` links seem to work, but the direct PDF download links don't work naively.
-Maybe need to set a Referer header, something like that?
-
-
-## Wanfang Data
-
-    mark=32 指南与共识 Guidelines and consensus
-    mark=34 文献速递 Literature Express
-    mark=38 中医药防治 Prevention and treatment with traditional Chinese medicine
-
-    wget 'http://subject.med.wanfangdata.com.cn/Channel/7?mark=32' -O wanfang_guidance.2020-03-29.html
-    wget 'http://subject.med.wanfangdata.com.cn/Channel/7?mark=34' -O wanfang_papers.2020-03-29.html
-
-    ./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json
-    ./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json
-
-Download PDFs (without clobbering existing files):
-
-    cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {}
-
-    file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c
-    144 HTML
-    609 PDF
-
diff --git a/scrape/parse_cnki_tables.py b/scrape/parse_cnki_tables.py
deleted file mode 100755
index 3763550..0000000
--- a/scrape/parse_cnki_tables.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-from bs4 import BeautifulSoup
-
-def parse_cnki_tables(cnki_html):
-    soup = BeautifulSoup(cnki_html, "lxml")
-
-    papers = []
-    for table in soup.find_all('table'):
-        for row in table.tbody.find_all('tr'):
-            paper = dict()
-            for col in ('seq', 'author', 'date'):
-                paper[col] = row.find('td', **{'class': col}).get_text().strip().replace('\n', ' ')
-            name_td = row.find('td', **{'class': 'name'})
-            operat_td = row.find('td', **{'class': 'operat'})
-            paper['title'] = name_td.a.get_text().strip().replace('\n', ' ')
-            paper['seq'] = int(paper['seq'])
-            paper['authors'] = [a for a in paper.pop('author').split(';') if a]
-            mark = row.find('span', **{'class': 'markOricon'})
-
-            paper['info_url'] = "http://en.gzbd.cnki.net" + name_td.a['href']
-            paper['pdf_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-download'})['href']
-            try:
-                paper['html_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-html'})['href']
-            except TypeError:
-                try:
-                    paper['read_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-read'})['href']
-                except TypeError:
-                    #print(operat_td, file=sys.stderr)
-                    pass
-
-            if 'FileName=' in paper['info_url']:
-                params = paper['info_url'].split('?')[1].split('&')
-                for p in params:
-                    if p.startswith("FileName="):
-                        paper['cnki_id'] = p.replace("FileName=", "")
-                        break
-
-            if mark and mark.get_text() == 'CN':
-                paper['is_cn'] = True
-            else:
-                paper['is_cn'] = False
-            papers.append(paper)
-    return papers
-
-if __name__ == "__main__":
-    with open("cnki_tables.html", "r") as f:
-        res = parse_cnki_tables(f.read())
-    for paper in res:
-        print(json.dumps(paper, sort_keys=True))
diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py
deleted file mode 100755
index 85187f5..0000000
--- a/scrape/parse_wanfang_html.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-from bs4 import BeautifulSoup
-
-def parse_wanfang_html(wanfang_html):
-    soup = BeautifulSoup(wanfang_html, "lxml")
-
-    papers = []
-    papers_ul = soup.find('ul', **{'class': 'item_detail_list'})
-    for paper_li in soup.find_all('li'):
-        if paper_li.get('mark') not in ("32", "34"):
-            continue
-        if not paper_li.find('div'):
-            continue
-        #print(paper_li)
-        title_div = paper_li.div
-        title_a = title_div.find('text').a
-        is_first_issue = bool(title_div.find('img'))
-        subtitle_div = title_div.find('div', **{'class': 'subtitle'})
-        summary_div = paper_li.find('div', **{'class': 'summary'})
-        tag_div = paper_li.find('div', **{'class': 'tag'})
-        paper = dict(
-            is_first_issue=is_first_issue,
-            url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
-            wanfang_id=title_a['href'].split('/')[-1],
-            title=title_a.get_text().strip(),
-            journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(),
-            date=subtitle_div.find('span', **{'class': None}).get_text().replace('时间:', '').strip(),
-            #button_text=title_div.button.get_text().strip(),
-            abstract=summary_div.get_text().strip(),
-            tag=tag_div['text'] or None,
-        )
-        assert paper['date'].startswith('2020')
-        papers.append(paper)
-    return papers
-
-if __name__ == "__main__":
-    with open(sys.argv[1], "r") as f:
-        res = parse_wanfang_html(f.read())
-    for paper in res:
-        print(json.dumps(paper, sort_keys=True))
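
The deleted README leaves open whether CNKI's direct PDF links work once a `Referer` header is set. Below is a minimal sketch of how that hypothesis could be checked against the `cnki_metadata.json` emitted by `parse_cnki_tables.py`; the `requests` dependency, the `fulltext_cnki/` output directory, and the filename scheme are illustrative assumptions, not part of this repository.

```python
#!/usr/bin/env python3
# Sketch only: test whether CNKI PDF links work when a Referer header is set,
# as speculated in scrape/README.md. Reads cnki_metadata.json (one JSON object
# per line, as emitted by parse_cnki_tables.py). The `requests` dependency and
# the fulltext_cnki/ output directory are illustrative choices.

import json
import os

import requests

os.makedirs("fulltext_cnki", exist_ok=True)

with open("cnki_metadata.json", "r") as f:
    for line in f:
        paper = json.loads(line)
        if "pdf_url" not in paper:
            continue
        # Hypothesis from the README: pass the article's info_url as Referer.
        resp = requests.get(
            paper["pdf_url"],
            headers={"Referer": paper.get("info_url", "")},
            timeout=60,
        )
        name = paper.get("cnki_id") or str(paper["seq"])
        out_path = os.path.join("fulltext_cnki", name + ".pdf")
        content_type = resp.headers.get("Content-Type", "")
        # Only keep responses that actually look like PDFs.
        if resp.ok and "pdf" in content_type.lower():
            with open(out_path, "wb") as out:
                out.write(resp.content)
        else:
            print("skipped", paper["pdf_url"], resp.status_code, content_type)
```

If the plain GET already returns PDFs, the Referer hypothesis in the README can simply be dropped.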
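
The README also records that 144 of the Wanfang downloads came back as HTML rather than PDF (per the `file fulltext_wanfang/* | ... | uniq -c` check). Here is a small sketch of how those files could be identified for a retry pass; checking the `%PDF-` magic bytes instead of `file`, and the decision to list them for re-fetching, are assumptions for illustration.

```python
#!/usr/bin/env python3
# Sketch only: replicate the README's `file fulltext_wanfang/*` check in Python
# by inspecting magic bytes, and list downloads that are not PDFs so they could
# be re-fetched.

import collections
import glob

counts = collections.Counter()
non_pdf_paths = []

for path in glob.glob("fulltext_wanfang/*"):
    with open(path, "rb") as f:
        head = f.read(5)
    if head.startswith(b"%PDF-"):
        counts["PDF"] += 1
    else:
        counts["HTML/other"] += 1
        non_pdf_paths.append(path)

print(dict(counts))
for path in non_pdf_paths:
    print("not a PDF:", path)
```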