make fmt

author: Bryan Newbold <bnewbold@archive.org> 2022-04-26 15:16:45 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-04-26 15:16:45 -0700
commit: f8f41070adfd4dc4856d96f618ee96fe8c411458 (patch)
tree: e1640e62d4ff5df3f871c61e062a2576340f57ea
parent: ddcab9f3f44fd921bb70021674fb4052efd604f0 (diff)
download: sandcrawler-f8f41070adfd4dc4856d96f618ee96fe8c411458.tar.gz sandcrawler-f8f41070adfd4dc4856d96f618ee96fe8c411458.zip
1 file changed, 5 insertions, 2 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 6e5d2d6..f73b579 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -346,10 +346,13 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
     # Google Drive
     # this is assuming it is a PDF
     if "drive.google.com/file/d/" in html_url and "/view" in html_url:
-        gdrive_id = html_url.split('/')[5]
+        gdrive_id = html_url.split("/")[5]
         if len(gdrive_id) > 10:
             # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
-            return dict(pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}", technique="google-drive")
+            return dict(
+                pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+                technique="google-drive",
+            )
 
     ### below here we are doing guesses
author	Bryan Newbold <bnewbold@archive.org>	2022-04-26 15:16:45 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2022-04-26 15:16:45 -0700
commit	f8f41070adfd4dc4856d96f618ee96fe8c411458 (patch)
tree	e1640e62d4ff5df3f871c61e062a2576340f57ea
parent	ddcab9f3f44fd921bb70021674fb4052efd604f0 (diff)
download	sandcrawler-f8f41070adfd4dc4856d96f618ee96fe8c411458.tar.gz sandcrawler-f8f41070adfd4dc4856d96f618ee96fe8c411458.zip