diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-08-08 16:55:27 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-08 17:06:02 -0700 | 
| commit | 9d81f6e3f8a4b300c18a831e80880a8e181f812f (patch) | |
| tree | cc71902c988a36c00f8a149364d456c9be4bfeb2 | |
| parent | c19b73f13b021a6d3026d0526b7dfa7a9fdda3a6 (diff) | |
| download | sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.tar.gz sandcrawler-9d81f6e3f8a4b300c18a831e80880a8e181f812f.zip | |
add more HTML extraction tricks
| -rw-r--r-- | python/sandcrawler/html.py | 31 | 
1 files changed, 29 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 0e64c45..85d32c0 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -55,6 +55,8 @@ def extract_fulltext_url(html_url, html_body):
     if not meta:
         meta = soup.find('meta', attrs={"name":"bepress_citation_pdf_url"})
     if not meta:
+        meta = soup.find('meta', attrs={"name":"wkhealth_pdf_url"})
+    if not meta:
         # researchgate does this; maybe others also?
         meta = soup.find('meta', attrs={"property":"citation_pdf_url"})
     # if tag is only partially populated
@@ -63,12 +65,19 @@ def extract_fulltext_url(html_url, html_body):
     # wiley has a weird almost-blank page we don't want to loop on
     if meta and not "://onlinelibrary.wiley.com/doi/pdf/" in html_url:
         url = meta['content'].strip()
-        if url.startswith('/'):
+        if '://doi.org/' in url:
+            print(f"\tdoi.org in citation_pdf_url (loop?): {url}", file=sys.stderr)
+        elif url.startswith('/'):
             return dict(pdf_url=host_prefix+url, technique='citation_pdf_url')
         elif url.startswith('http'):
             return dict(pdf_url=url, technique='citation_pdf_url')
         else:
-            print("malformed citation_pdf_url? {}".format(url), file=sys.stderr)
+            print("\tmalformed citation_pdf_url? {}".format(url), file=sys.stderr)
+
+    meta = soup.find('meta', attrs={"name":"generator"})
+    meta_generator = None
+    if meta and meta.get('content'):
+        meta_generator = meta['content'].strip()
 
     # sage, and also utpjournals (see below)
     # https://journals.sagepub.com/doi/10.1177/2309499019888836
@@ -345,6 +354,24 @@ def extract_fulltext_url(html_url, html_body):
         if record_id.isdigit() and url.encode('utf-8') in html_body:
             return dict(pdf_url=url, technique='rwth-aachen-url')
 
+    # physchemaspects.ru
+    if '://physchemaspects.ru/' in html_url and soup:
+        for href in soup.find_all('a'):
+            if href.text == "download PDF file":
+                url = href['href']
+                if url.startswith('/'):
+                    url = host_prefix + url
+                return dict(pdf_url=url, technique='physchemaspects-href')
+
+    # OJS 3 (some)
+    if meta_generator and meta_generator.startswith("Open Journal Systems"):
+        href = soup.find('a', attrs={"class":"obj_galley_link file"})
+        if href and href.text and "pdf" in href.text.lower():
+            url = href['href'].strip()
+            if url.startswith('/'):
+                url = host_prefix + url
+            return dict(pdf_url=url, technique='ojs-galley-href')
+
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body
```
