author    Bryan Newbold <bnewbold@archive.org>  2021-05-21 16:48:29 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-05-21 16:48:29 -0700
commit    071af9a4832dcb24be417de9b658d678056b5bf2 (patch)
tree      9dc510a65df81a3dc334e3e31623aec4ab9ca31a
parent    add030c447fbc2d1a91b0756c395737aa796fef0 (diff)
download  sandcrawler-071af9a4832dcb24be417de9b658d678056b5bf2.tar.gz
          sandcrawler-071af9a4832dcb24be417de9b658d678056b5bf2.zip
better OSF preprint download re-writing
 python/sandcrawler/html.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 14561bf..d3f5cfe 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -138,12 +138,29 @@ def extract_fulltext_url(html_url, html_body):
# https://osf.io/preprints/socarxiv/8phvx/
# wow, they ship total javascript crud! going to just guess download URL
# based on URL for now. Maybe content type header would help?
- if '://osf.io/' in html_url and not '/download' in html_url:
- if not html_url.endswith("/"):
- next_url = html_url+"/download"
- else:
- next_url = html_url+"download"
- return dict(next_url=next_url, technique='osf-by-url')
+ OSF_DOMAINS = [
+ '://osf.io/',
+ '://biohackrxiv.org/',
+ '://psyarxiv.com/',
+ '://arabixiv.org/',
+ '://engrxiv.org/',
+ '://edarxiv.org/',
+ '://ecsarxiv.org/',
+ '://ecoevorxiv.org/',
+ '://frenxiv.org/',
+ '://indiarxiv.org/',
+ '://mindrxiv.org/',
+ '://mediarxiv.org/',
+ '://paleorxiv.org/',
+ '://thesiscommons.org/',
+ ]
+ for domain in OSF_DOMAINS:
+ if domain in html_url and (len(html_url.split('/')) in [4,5] or '/preprints/' in html_url) and '/download' not in html_url:
+ if not html_url.endswith("/"):
+ next_url = html_url+"/download"
+ else:
+ next_url = html_url+"download"
+ return dict(next_url=next_url, technique='osf-by-url')
# wiley
# https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
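For readers skimming the patch, here is a minimal standalone sketch of the URL-guessing logic it introduces, using a hypothetical helper name guess_osf_download_url and only a subset of the domain list; in sandcrawler itself this check runs inline inside extract_fulltext_url() rather than as a separate function.

from typing import Optional

# Subset of the OSF-hosted domains from the patch above; the full list lives
# in python/sandcrawler/html.py.
OSF_DOMAINS = [
    '://osf.io/',
    '://psyarxiv.com/',
    '://thesiscommons.org/',
]

def guess_osf_download_url(html_url: str) -> Optional[str]:
    """Hypothetical helper: guess the direct '/download' URL for an
    OSF-hosted preprint landing page, or None if the URL does not look
    like one."""
    for domain in OSF_DOMAINS:
        # Match either a short landing URL (4-5 path segments after split)
        # or a '/preprints/' path, and skip URLs already pointing at /download.
        if (domain in html_url
                and (len(html_url.split('/')) in [4, 5] or '/preprints/' in html_url)
                and '/download' not in html_url):
            if html_url.endswith('/'):
                return html_url + 'download'
            return html_url + '/download'
    return None

# Example, using the URL from the comment in the patch:
#   guess_osf_download_url('https://osf.io/preprints/socarxiv/8phvx/')
#   -> 'https://osf.io/preprints/socarxiv/8phvx/download'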