author | Bryan Newbold <bnewbold@archive.org> | 2020-03-29 19:22:25 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-29 19:22:25 -0700 |
commit | df08aa95da7aeb6db07e56a16083bbe706e17467 (patch) | |
tree | 8bad7216111e39f5d0d7ce90ae15ae4f03020b25 /scrape/parse_cnki_tables.py | |
parent | 33854aad078e1b684c0dc6425958c4bf85c5e728 (diff) | |
download | fatcat-covid19-df08aa95da7aeb6db07e56a16083bbe706e17467.tar.gz fatcat-covid19-df08aa95da7aeb6db07e56a16083bbe706e17467.zip |
first iteration of CNKI and Wanfang scrapers
Diffstat (limited to 'scrape/parse_cnki_tables.py')
-rwxr-xr-x | scrape/parse_cnki_tables.py | 52 |
1 file changed, 52 insertions, 0 deletions
```diff
diff --git a/scrape/parse_cnki_tables.py b/scrape/parse_cnki_tables.py
new file mode 100755
index 0000000..3763550
--- /dev/null
+++ b/scrape/parse_cnki_tables.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+from bs4 import BeautifulSoup
+
+def parse_cnki_tables(cnki_html):
+    soup = BeautifulSoup(cnki_html, "lxml")
+
+    papers = []
+    for table in soup.find_all('table'):
+        for row in table.tbody.find_all('tr'):
+            paper = dict()
+            for col in ('seq', 'author', 'date'):
+                paper[col] = row.find('td', **{'class': col}).get_text().strip().replace('\n', ' ')
+            name_td = row.find('td', **{'class': 'name'})
+            operat_td = row.find('td', **{'class': 'operat'})
+            paper['title'] = name_td.a.get_text().strip().replace('\n', ' ')
+            paper['seq'] = int(paper['seq'])
+            paper['authors'] = [a for a in paper.pop('author').split(';') if a]
+            mark = row.find('span', **{'class': 'markOricon'})
+
+            paper['info_url'] = "http://en.gzbd.cnki.net" + name_td.a['href']
+            paper['pdf_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-download'})['href']
+            try:
+                paper['html_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-html'})['href']
+            except TypeError:
+                try:
+                    paper['read_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-read'})['href']
+                except TypeError:
+                    #print(operat_td, file=sys.stderr)
+                    pass
+
+            if 'FileName=' in paper['info_url']:
+                params = paper['info_url'].split('?')[1].split('&')
+                for p in params:
+                    if p.startswith("FileName="):
+                        paper['cnki_id'] = p.replace("FileName=", "")
+                        break
+
+            if mark and mark.get_text() == 'CN':
+                paper['is_cn'] = True
+            else:
+                paper['is_cn'] = False
+            papers.append(paper)
+    return papers
+
+if __name__ == "__main__":
+    with open("cnki_tables.html", "r") as f:
+        res = parse_cnki_tables(f.read())
+    for paper in res:
+        print(json.dumps(paper, sort_keys=True))
```
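For context, a minimal sketch of how the scraper's JSON-lines output might be consumed downstream, assuming the script has been run against a saved `cnki_tables.html` page (as in its `__main__` block) with stdout redirected to a file, e.g. `./scrape/parse_cnki_tables.py > cnki_metadata.json`. The filename `cnki_metadata.json` and the filtering on the `is_cn` / `pdf_url` fields are illustrative assumptions, not part of this commit:

```python
#!/usr/bin/env python3
# Sketch: read JSON-lines output from parse_cnki_tables.py and list PDF URLs
# for papers the scraper flagged as Chinese-language editions ('is_cn').
# The input filename 'cnki_metadata.json' is an assumed redirect target.

import json

with open("cnki_metadata.json", "r") as f:
    for line in f:
        paper = json.loads(line)
        if paper.get("is_cn") and "pdf_url" in paper:
            # cnki_id may be absent when info_url lacks a FileName= parameter
            print(paper.get("cnki_id", "-"), paper["pdf_url"])
```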