diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-05-21 16:48:29 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-05-21 16:48:29 -0700 |
commit | 071af9a4832dcb24be417de9b658d678056b5bf2 (patch) | |
tree | 9dc510a65df81a3dc334e3e31623aec4ab9ca31a | |
parent | add030c447fbc2d1a91b0756c395737aa796fef0 (diff) | |
download | sandcrawler-071af9a4832dcb24be417de9b658d678056b5bf2.tar.gz sandcrawler-071af9a4832dcb24be417de9b658d678056b5bf2.zip |
better OSF preprint download re-writing
-rw-r--r-- | python/sandcrawler/html.py | 29 |
1 files changed, 23 insertions, 6 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 14561bf..d3f5cfe 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -138,12 +138,29 @@ def extract_fulltext_url(html_url, html_body):
     # https://osf.io/preprints/socarxiv/8phvx/
     # wow, they ship total javascript crud! going to just guess download URL
     # based on URL for now. Maybe content type header would help?
-    if '://osf.io/' in html_url and not '/download' in html_url:
-        if not html_url.endswith("/"):
-            next_url = html_url+"/download"
-        else:
-            next_url = html_url+"download"
-        return dict(next_url=next_url, technique='osf-by-url')
+    OSF_DOMAINS = [
+        '://osf.io/',
+        '://biohackrxiv.org/',
+        '://psyarxiv.com/',
+        '://arabixiv.org/',
+        '://engrxiv.org/',
+        '://edarxiv.org//',
+        '://ecsarxiv.org/',
+        '://ecoevorxiv.org/',
+        '://frenxiv.org/',
+        '://indiarxiv.org/',
+        '://mindrxiv.org/',
+        '://mediarxiv.org/',
+        '://paleorxiv.org/',
+        '://thesiscommons.org/',
+    ]
+    for domain in OSF_DOMAINS:
+        if domain in html_url and (len(html_url.split('/')) in [4,5] or '/preprints/' in html_url) and '/download' not in html_url:
+            if not html_url.endswith("/"):
+                next_url = html_url+"/download"
+            else:
+                next_url = html_url+"download"
+            return dict(next_url=next_url, technique='osf-by-url')
 
     # wiley
     # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787