 python/sandcrawler/html.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 14561bf..d3f5cfe 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -138,12 +138,29 @@ def extract_fulltext_url(html_url, html_body):
     # https://osf.io/preprints/socarxiv/8phvx/
     # wow, they ship total javascript crud! going to just guess download URL
     # based on URL for now. Maybe content type header would help?
-    if '://osf.io/' in html_url and not '/download' in html_url:
-        if not html_url.endswith("/"):
-            next_url = html_url+"/download"
-        else:
-            next_url = html_url+"download"
-        return dict(next_url=next_url, technique='osf-by-url')
+    OSF_DOMAINS = [
+        '://osf.io/',
+        '://biohackrxiv.org/',
+        '://psyarxiv.com/',
+        '://arabixiv.org/',
+        '://engrxiv.org/',
+        '://edarxiv.org/',
+        '://ecsarxiv.org/',
+        '://ecoevorxiv.org/',
+        '://frenxiv.org/',
+        '://indiarxiv.org/',
+        '://mindrxiv.org/',
+        '://mediarxiv.org/',
+        '://paleorxiv.org/',
+        '://thesiscommons.org/',
+    ]
+    for domain in OSF_DOMAINS:
+        if domain in html_url and (len(html_url.split('/')) in [4,5] or '/preprints/' in html_url) and '/download' not in html_url:
+            if not html_url.endswith("/"):
+                next_url = html_url+"/download"
+            else:
+                next_url = html_url+"download"
+            return dict(next_url=next_url, technique='osf-by-url')
 
     # wiley
     # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
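
For context, the new logic only guesses a direct "/download" URL for OSF-hosted preprint servers when the URL looks like a bare work page (4 or 5 slash-separated segments) or contains "/preprints/", and does not already point at a download. A minimal standalone sketch of that heuristic, assuming a hypothetical guess_osf_download_url() helper (the patch inlines this logic inside extract_fulltext_url() instead):

    # Subset of the OSF-hosted domains listed in the patch
    OSF_DOMAINS = [
        '://osf.io/',
        '://psyarxiv.com/',
        '://thesiscommons.org/',
        # ... remaining OSF-hosted domains from the patch
    ]

    def guess_osf_download_url(html_url):
        # hypothetical helper name, not part of the patch
        for domain in OSF_DOMAINS:
            if domain not in html_url or '/download' in html_url:
                continue
            # only bare work pages: "https://osf.io/8phvx/" splits into 5 parts,
            # while "/preprints/..." paths are accepted regardless of depth
            if len(html_url.split('/')) in [4, 5] or '/preprints/' in html_url:
                if html_url.endswith('/'):
                    return html_url + 'download'
                return html_url + '/download'
        return None

    # e.g. guess_osf_download_url("https://osf.io/preprints/socarxiv/8phvx/")
    #   -> "https://osf.io/preprints/socarxiv/8phvx/download"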
