From fb767adb9472ff85b46b5a383f3986950b12dd27 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 3 Apr 2020 15:16:17 -0700
Subject: move more directories around

---
 extra/scrape/.gitignore            |  4 +++
 extra/scrape/README.md             | 44 ++++++++++++++++++++++++++++++
 extra/scrape/parse_cnki_tables.py  | 52 ++++++++++++++++++++++++++++++++++++
 extra/scrape/parse_wanfang_html.py | 43 +++++++++++++++++++++++++++++
 4 files changed, 143 insertions(+)
 create mode 100644 extra/scrape/.gitignore
 create mode 100644 extra/scrape/README.md
 create mode 100755 extra/scrape/parse_cnki_tables.py
 create mode 100755 extra/scrape/parse_wanfang_html.py

diff --git a/extra/scrape/.gitignore b/extra/scrape/.gitignore
new file mode 100644
index 0000000..b2bc71b
--- /dev/null
+++ b/extra/scrape/.gitignore
@@ -0,0 +1,4 @@
+fulltext_wanfang/
+fulltext_cnki/
+*.html
+*.json

diff --git a/extra/scrape/README.md b/extra/scrape/README.md
new file mode 100644
index 0000000..97bb6fe
--- /dev/null
+++ b/extra/scrape/README.md
@@ -0,0 +1,44 @@
+
+## CNKI List
+
+Base URL:
+
+2020-03-29: "Found 1914 articles"
+
+The page uses JavaScript to fetch the tables; the request URLs look like:
+
+    http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag&contentID=0&orderStr=1&page=1&grouptype=undefined&groupvalue=undefined
+
+Fetch a bunch of pages:
+
+    seq 0 64 | parallel http get "http://en.gzbd.cnki.net/gzbt/request/otherhandler.ashx?action=gzbdFlag\&contentID=0\&orderStr=1\&page={}\&grouptype=undefined\&groupvalue=undefined" > cnki_tables.html
+
+Parse the HTML snippets to JSON:
+
+    ./parse_cnki_tables.py > cnki_metadata.json
+
+The `info_url` seems to work, but the direct PDF download links don't work
+naively; they probably need a `Referer` header set, or something like that.
+
+
+## Wanfang Data
+
+    mark=32  指南与共识 (Guidelines and consensus)
+    mark=34  文献速递 (Literature express)
+    mark=38  中医药防治 (Prevention and treatment with traditional Chinese medicine)
+
+Fetch the listing pages, then parse them to JSON:
+
+    wget 'http://subject.med.wanfangdata.com.cn/Channel/7?mark=32' -O wanfang_guidance.2020-03-29.html
+    wget 'http://subject.med.wanfangdata.com.cn/Channel/7?mark=34' -O wanfang_papers.2020-03-29.html
+
+    ./parse_wanfang_html.py wanfang_papers.2020-03-29.html > wanfang_papers.2020-03-29.json
+    ./parse_wanfang_html.py wanfang_guidance.2020-03-29.html > wanfang_guidance.2020-03-29.json
+
+Download PDFs (without clobbering existing files):
+
+    cat wanfang_papers.2020-03-29.json wanfang_guidance.2020-03-29.json | jq .url -r | parallel wget -P fulltext_wanfang --no-clobber {}
+
+Check what actually came down (many of the "PDFs" are HTML pages):
+
+    file fulltext_wanfang/* | cut -f2 -d' ' | sort | uniq -c
+      144 HTML
+      609 PDF
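
The README above speculates that CNKI's direct PDF links fail without a
`Referer` header. Here is a minimal, untested sketch of that idea — not part
of this commit; it assumes `requests` is installed and guesses that sending
the paper's own `info_url` as the referer satisfies the check. It reads
`parse_cnki_tables.py` output line-by-line from stdin:

    #!/usr/bin/env python3
    # Hypothetical sketch, not from this commit: try fetching CNKI PDFs
    # with a Referer header, as speculated in the README above.

    import sys
    import json
    import requests

    def fetch_pdf(paper):
        # Assumption: the server wants a plausible Referer; using the
        # paper's own info_url is a guess, not a confirmed requirement.
        resp = requests.get(
            paper['pdf_url'],
            headers={'Referer': paper['info_url']},
            timeout=30,
        )
        resp.raise_for_status()
        if not resp.content.startswith(b'%PDF'):
            raise ValueError("response is not a PDF (error or login page?)")
        return resp.content

    if __name__ == "__main__":
        for line in sys.stdin:
            paper = json.loads(line)
            name = paper.get('cnki_id') or str(paper['seq'])
            try:
                with open(name + ".pdf", "wb") as f:
                    f.write(fetch_pdf(paper))
            except Exception as e:
                print(f"{name}: {e}", file=sys.stderr)

Usage would be something like `./parse_cnki_tables.py | ./fetch_cnki_pdfs.py`
(the script name is hypothetical).
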
diff --git a/extra/scrape/parse_cnki_tables.py b/extra/scrape/parse_cnki_tables.py
new file mode 100755
index 0000000..3763550
--- /dev/null
+++ b/extra/scrape/parse_cnki_tables.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+from bs4 import BeautifulSoup
+
+
+def parse_cnki_tables(cnki_html):
+    soup = BeautifulSoup(cnki_html, "lxml")
+
+    papers = []
+    for table in soup.find_all('table'):
+        for row in table.tbody.find_all('tr'):
+            paper = dict()
+            # simple text columns
+            for col in ('seq', 'author', 'date'):
+                paper[col] = row.find('td', **{'class': col}).get_text().strip().replace('\n', ' ')
+            name_td = row.find('td', **{'class': 'name'})
+            operat_td = row.find('td', **{'class': 'operat'})
+            paper['title'] = name_td.a.get_text().strip().replace('\n', ' ')
+            paper['seq'] = int(paper['seq'])
+            paper['authors'] = [a for a in paper.pop('author').split(';') if a]
+            mark = row.find('span', **{'class': 'markOricon'})
+            paper['info_url'] = "http://en.gzbd.cnki.net" + name_td.a['href']
+            paper['pdf_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-download'})['href']
+            # rows carry either an HTML link or a "read" link (or neither)
+            try:
+                paper['html_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-html'})['href']
+            except TypeError:
+                try:
+                    paper['read_url'] = "http://en.gzbd.cnki.net" + operat_td.find('a', **{'class': 'icon-read'})['href']
+                except TypeError:
+                    #print(operat_td, file=sys.stderr)
+                    pass
+
+            # the FileName query parameter is a stable CNKI identifier
+            if 'FileName=' in paper['info_url']:
+                params = paper['info_url'].split('?')[1].split('&')
+                for p in params:
+                    if p.startswith("FileName="):
+                        paper['cnki_id'] = p.replace("FileName=", "")
+                        break
+
+            paper['is_cn'] = bool(mark and mark.get_text() == 'CN')
+            papers.append(paper)
+    return papers
+
+
+if __name__ == "__main__":
+    with open("cnki_tables.html", "r") as f:
+        res = parse_cnki_tables(f.read())
+    for paper in res:
+        print(json.dumps(paper, sort_keys=True))

diff --git a/extra/scrape/parse_wanfang_html.py b/extra/scrape/parse_wanfang_html.py
new file mode 100755
index 0000000..85187f5
--- /dev/null
+++ b/extra/scrape/parse_wanfang_html.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+import sys
+import json
+from bs4 import BeautifulSoup
+
+
+def parse_wanfang_html(wanfang_html):
+    soup = BeautifulSoup(wanfang_html, "lxml")
+
+    papers = []
+    papers_ul = soup.find('ul', **{'class': 'item_detail_list'})
+    # only look at <li> elements inside the detail list, and only the
+    # "guidance" (mark=32) and "papers" (mark=34) channels
+    for paper_li in papers_ul.find_all('li'):
+        if paper_li.get('mark') not in ("32", "34"):
+            continue
+        if not paper_li.find('div'):
+            continue
+        title_div = paper_li.div
+        title_a = title_div.find('text').a
+        is_first_issue = bool(title_div.find('img'))
+        subtitle_div = title_div.find('div', **{'class': 'subtitle'})
+        summary_div = paper_li.find('div', **{'class': 'summary'})
+        tag_div = paper_li.find('div', **{'class': 'tag'})
+        paper = dict(
+            is_first_issue=is_first_issue,
+            url="http://subject.med.wanfangdata.com.cn" + title_a['href'],
+            wanfang_id=title_a['href'].split('/')[-1],
+            title=title_a.get_text().strip(),
+            # strip the "来源:" ("source:") and "时间:" ("date:") label prefixes
+            journal=subtitle_div.find('span', **{'class': 'origin'}).get_text().replace('来源:', '').strip(),
+            date=subtitle_div.find('span', **{'class': None}).get_text().replace('时间:', '').strip(),
+            #button_text=title_div.button.get_text().strip(),
+            abstract=summary_div.get_text().strip(),
+            tag=tag_div['text'] or None,
+        )
+        assert paper['date'].startswith('2020')
+        papers.append(paper)
+    return papers
+
+
+if __name__ == "__main__":
+    with open(sys.argv[1], "r") as f:
+        res = parse_wanfang_html(f.read())
+    for paper in res:
+        print(json.dumps(paper, sort_keys=True))
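
The `file` tally in the README shows that 144 of the Wanfang downloads came
back as HTML (likely error or paywall pages) rather than PDFs. A small
hypothetical helper — not in this commit — that quarantines the non-PDF
files by checking the `%PDF` magic bytes instead of shelling out to `file`:

    #!/usr/bin/env python3
    # Hypothetical cleanup helper, not from this commit: move files that
    # are not actually PDFs out of fulltext_wanfang/ into rejects/.

    import os
    import shutil

    def quarantine_non_pdfs(dirpath="fulltext_wanfang", rejects="rejects"):
        os.makedirs(os.path.join(dirpath, rejects), exist_ok=True)
        moved = 0
        for name in os.listdir(dirpath):
            path = os.path.join(dirpath, name)
            if not os.path.isfile(path):
                continue
            with open(path, "rb") as f:
                magic = f.read(4)
            if magic != b"%PDF":
                shutil.move(path, os.path.join(dirpath, rejects, name))
                moved += 1
        return moved

    if __name__ == "__main__":
        print("moved %d non-PDF files" % quarantine_non_pdfs())

The quarantined HTML files could then be retried with different headers, or
their URLs re-checked against the JSON metadata.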