author      Bryan Newbold <bnewbold@archive.org>    2020-04-03 15:16:17 -0700
committer   Bryan Newbold <bnewbold@archive.org>    2020-04-03 15:16:17 -0700
commit      fb767adb9472ff85b46b5a383f3986950b12dd27 (patch)
tree        724af4412353c627b0eae26fd4d7fd1164bf2b55 /scrape
parent      4cbbdf33ee2a9651f79f96e4bf290d8bc721f69d (diff)
download    fatcat-covid19-fb767adb9472ff85b46b5a383f3986950b12dd27.tar.gz
            fatcat-covid19-fb767adb9472ff85b46b5a383f3986950b12dd27.zip
move more directories around
Diffstat (limited to 'scrape')
-rw-r--r--   scrape/.gitignore            |  4
-rw-r--r--   scrape/README.md             | 44
-rwxr-xr-x   scrape/parse_cnki_tables.py  | 52
-rwxr-xr-x   scrape/parse_wanfang_html.py | 43
4 files changed, 0 insertions, 143 deletions
diff --git a/scrape/.gitignore b/scrape/.gitignore
deleted file mode 100644
index b2bc71b..0000000
--- a/scrape/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-fulltext_wanfang/
-fulltext_ckni/
-*.html
-*.json
diff --git a/scrape/README.md b/scrape/README.md
deleted file mode 100644
index 97bb6fe..0000000
--- a/scrape/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-
-
-## CNKI List
-
-Base URL: <http://en.gzbd.cnki.net/GZBT/brief/Default.aspx>
-
-2020-03-29: "Found 1914 articles"
-
-The page uses JavaScript to fetch its tables; the request URLs look like:
-
- http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag&contentID=0&orderStr=1&page=1&grouptype=undefined&groupvalue=undefined
-
-Fetch a bunch:
-
- seq 0 64 | parallel http get "http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag\&contentID=0\&orderStr=1\&page={}\&grouptype=undefined\&groupvalue=undefined" > cnki_tables.html
-
-Parse HTML snippets to JSON:
-
- ./parse_cnki_tables.py > cnki_metadata.json
-
-The `info_url` links seem to work, but the direct PDF download links don't work
-naively; they may need a `Referer` header set (see the sketch after this file's diff).
-
-
-## Wanfang Data
-
- mark=32 指南与共识 Guidelines and consensus
- mark=34 文献速递 Literature Express
- mark=38 中医药防治 Prevention and treatment of traditional Chinese medicine
-
- wget 'http://subject.med.wanfangdata.com.cn/Channel/7?mark=32' -O wanfang_guidance.2020-03-29.html
- wget 'http://subject.med.wanfangdata.com.cn/Channel/7?mark=34' -O wanfang_papers.2020-03-29.html
-
- ./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json
- ./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json
-
-Download PDFs (without clobbering existing):
-
- cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {}
-
- file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c
- 144 HTML
- 609 PDF
-
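The README above notes that CNKI's direct PDF links don't work naively and guesses that a `Referer` header may be needed. A minimal sketch of that workaround, assuming the `Referer` guess is correct; `fulltext_cnki` is a hypothetical output directory, and `cnki_metadata.json` is the file produced by `parse_cnki_tables.py`:

    # Pull each paper's pdf_url out of the scraped metadata and fetch it with
    # the CNKI listing page as Referer; --no-clobber skips files already present.
    jq .pdf_url -r cnki_metadata.json \
        | parallel wget --no-clobber -P fulltext_cnki \
            --header 'Referer: http://en.gzbd.cnki.net/GZBT/brief/Default.aspx' {}
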
diff --git a/scrape/parse_cnki_tables.py b/scrape/parse_cnki_tables.py
deleted file mode 100755
index 3763550..0000000
--- a/scrape/parse_cnki_tables.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-from bs4 import BeautifulSoup
-
-def parse_cnki_tables(cnki_html):
- soup = BeautifulSoup(cnki_html, "lxml")
-
- papers = []
- for table in soup.find_all('table'):
- for row in table.tbody.find_all('tr'):
- paper = dict()
- for col in ('seq', 'author', 'date'):
- paper[col] = row.find('td', **{'class': col}).get_text().strip().replace('\n', ' ')
- name_td = row.find('td', **{'class': 'name'})
- operat_td = row.find('td', **{'class': 'operat'})
- paper['title'] = name_td.a.get_text().strip().replace('\n', ' ')
- paper['seq'] = int(paper['seq'])
- paper['authors'] = [a for a in paper.pop('author').split(';') if a]
- mark = row.find('span', **{'class': 'markOricon'})
-
- paper['info_url'] = "http://en.gzbd.cnki.net" + name_td.a['href']
- paper['pdf_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-download'})['href']
- try:
- paper['html_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-html'})['href']
- except TypeError:
- try:
- paper['read_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-read'})['href']
- except TypeError:
- #print(operat_td, file=sys.stderr)
- pass
-
- if 'FileName=' in paper['info_url']:
- params = paper['info_url'].split('?')[1].split('&')
- for p in params:
- if p.startswith("FileName="):
- paper['cnki_id'] = p.replace("FileName=", "")
- break
-
- if mark and mark.get_text() == 'CN':
- paper['is_cn'] = True
- else:
- paper['is_cn'] = False
- papers.append(paper)
- return papers
-
-if __name__ == "__main__":
- with open("cnki_tables.html", "r") as f:
- res = parse_cnki_tables(f.read())
- for paper in res:
- print(json.dumps(paper, sort_keys=True))
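Each output line from `parse_cnki_tables.py` is a single JSON object carrying the fields assigned above (`seq`, `title`, `authors`, `date`, `info_url`, `pdf_url`, the optional `html_url`/`read_url`/`cnki_id`, and the `is_cn` flag), so the metadata can be sliced with `jq`. A couple of hedged examples:

    # count records that carry the CN mark
    jq -c 'select(.is_cn)' cnki_metadata.json | wc -l

    # list titles of records that ended up with neither an HTML nor a read link
    jq -r 'select((.html_url == null) and (.read_url == null)) | .title' cnki_metadata.json
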
diff --git a/scrape/parse_wanfang_html.py b/scrape/parse_wanfang_html.py
deleted file mode 100755
index 85187f5..0000000
--- a/scrape/parse_wanfang_html.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-
-import sys
-import json
-from bs4 import BeautifulSoup
-
-def parse_wanfang_html(wanfang_html):
- soup = BeautifulSoup(wanfang_html, "lxml")
-
- papers = []
- papers_ul = soup.find('ul', **{'class': 'item_detail_list'})
- for paper_li in soup.find_all('li'):
- if paper_li.get('mark') not in ("32", "34"):
- continue
- if not paper_li.find('div'):
- continue
- #print(paper_li)
- title_div = paper_li.div
- title_a = title_div.find('text').a
- is_first_issue = bool(title_div.find('img'))
- subtitle_div = title_div.find('div', **{'class': 'subtitle'})
- summary_div = paper_li.find('div', **{'class': 'summary'})
- tag_div = paper_li.find('div', **{'class': 'tag'})
- paper = dict(
- is_first_issue=is_first_issue,
- url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
- wanfang_id=title_a['href'].split('/')[-1],
- title=title_a.get_text().strip(),
- journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(),
- date=subtitle_div.find('span', **{'class': None}).get_text().replace('时间:', '').strip(),
- #button_text=title_div.button.get_text().strip(),
- abstract=summary_div.get_text().strip(),
- tag=tag_div['text'] or None,
- )
- assert paper['date'].startswith('2020')
- papers.append(paper)
- return papers
-
-if __name__ == "__main__":
- with open(sys.argv[1], "r") as f:
- res = parse_wanfang_html(f.read())
- for paper in res:
- print(json.dumps(paper, sort_keys=True))
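The README above records that the Wanfang bulk download yields a mix of file types (609 PDFs and 144 HTML pages on 2020-03-29), presumably because some `url` values resolve to landing or error pages rather than fulltext. A small sketch for separating out the real PDFs; `fulltext_wanfang_pdf` is a made-up destination directory:

    # copy only the files that `file` identifies as PDF into a clean directory
    mkdir -p fulltext_wanfang_pdf
    file fulltext_wanfang/* | grep 'PDF document' | cut -f1 -d: \
        | xargs -I{} cp {} fulltext_wanfang_pdf/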