aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/html.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 6e5d2d6..f73b579 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -346,10 +346,13 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
# Google Drive
# this is assuming it is a PDF
if "drive.google.com/file/d/" in html_url and "/view" in html_url:
- gdrive_id = html_url.split('/')[5]
+ gdrive_id = html_url.split("/")[5]
if len(gdrive_id) > 10:
# https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
- return dict(pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}", technique="google-drive")
+ return dict(
+ pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+ technique="google-drive",
+ )
### below here we are doing guesses