From 9d81f6e3f8a4b300c18a831e80880a8e181f812f Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Sat, 8 Aug 2020 16:55:27 -0700 Subject: add more HTML extraction tricks --- python/sandcrawler/html.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index 0e64c45..85d32c0 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -54,6 +54,8 @@ def extract_fulltext_url(html_url, html_body): meta = soup.find('meta', attrs={"name":"citation_pdf_url"}) if not meta: meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"}) + if not meta: + meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"}) if not meta: # researchgate does this; maybe others also? meta = soup.find('meta', attrs={"property":"citation_pdf_url"}) @@ -63,12 +65,19 @@ def extract_fulltext_url(html_url, html_body): # wiley has a weird almost-blank page we don't want to loop on if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url: url = meta['content'].strip() - if url.startswith('/'): + if '://doi.org/' in url: + print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr) + elif url.startswith('/'): return dict(pdf_url=host_prefix+url, technique='citation_pdf_url') elif url.startswith('http'): return dict(pdf_url=url, technique='citation_pdf_url') else: - print("malformed citation_pdf_url? {}".format(url), file=sys.stderr) + print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr) + + meta = soup.find('meta', attrs={"name":"generator"}) + meta_generator = None + if meta and meta.get('content'): + meta_generator = meta['content'].strip() # sage, and also utpjournals (see below) # https://journals.sagepub.com/doi/10.1177/2309499019888836 @@ -345,6 +354,24 @@ def extract_fulltext_url(html_url, html_body): if record_id.isdigit() and url.encode('utf-8') in html_body: return dict(pdf_url=url, technique='rwth-aachen-url') + # physchemaspects.ru + if '://physchemaspects.ru/' in html_url and soup: + for href in soup.find_all('a'): + if href.text == "download PDF file": + url = href['href'] + if url.startswith('/'): + url = host_prefix + url + return dict(pdf_url=url, technique='physchemaspects-href') + + # OJS 3 (some) + if meta_generator and meta_generator.startswith("Open Journal Systems"): + href = soup.find('a', attrs={"class":"obj_galley_link file"}) + if href and href.text and "pdf" in href.text.lower(): + url = href['href'].strip() + if url.startswith('/'): + url = host_prefix + url + return dict(pdf_url=url, technique='ojs-galley-href') + ### below here we are doing guesses # generic guess: try current URL plus .pdf, if it exists in the HTML body -- cgit v1.2.3