From 071af9a4832dcb24be417de9b658d678056b5bf2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 21 May 2021 16:48:29 -0700
Subject: better OSF preprint download re-writing

---
 python/sandcrawler/html.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

(limited to 'python/sandcrawler/html.py')

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 14561bf..d3f5cfe 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -138,12 +138,29 @@ def extract_fulltext_url(html_url, html_body):
     # https://osf.io/preprints/socarxiv/8phvx/
     # wow, they ship total javascript crud! going to just guess download URL
     # based on URL for now. Maybe content type header would help?
-    if '://osf.io/' in html_url and not '/download' in html_url:
-        if not html_url.endswith("/"):
-            next_url = html_url+"/download"
-        else:
-            next_url = html_url+"download"
-        return dict(next_url=next_url, technique='osf-by-url')
+    OSF_DOMAINS = [
+        '://osf.io/',
+        '://biohackrxiv.org/',
+        '://psyarxiv.com/',
+        '://arabixiv.org/',
+        '://engrxiv.org/',
+        '://edarxiv.org/',
+        '://ecsarxiv.org/',
+        '://ecoevorxiv.org/',
+        '://frenxiv.org/',
+        '://indiarxiv.org/',
+        '://mindrxiv.org/',
+        '://mediarxiv.org/',
+        '://paleorxiv.org/',
+        '://thesiscommons.org/',
+    ]
+    for domain in OSF_DOMAINS:
+        if domain in html_url and (len(html_url.split('/')) in [4,5] or '/preprints/' in html_url) and '/download' not in html_url:
+            if not html_url.endswith("/"):
+                next_url = html_url+"/download"
+            else:
+                next_url = html_url+"download"
+            return dict(next_url=next_url, technique='osf-by-url')
 
     # wiley
     # https://onlinelibrary.wiley.com/doi/pdf/10.1111/1467-923X.12787
-- 
cgit v1.2.3