diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:16:45 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-04-26 15:16:45 -0700 |
commit | f8f41070adfd4dc4856d96f618ee96fe8c411458 (patch) | |
tree | e1640e62d4ff5df3f871c61e062a2576340f57ea | |
parent | ddcab9f3f44fd921bb70021674fb4052efd604f0 (diff) | |
download | sandcrawler-f8f41070adfd4dc4856d96f618ee96fe8c411458.tar.gz sandcrawler-f8f41070adfd4dc4856d96f618ee96fe8c411458.zip |
make fmt
-rw-r--r-- | python/sandcrawler/html.py | 7 |
1 file changed, 5 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 6e5d2d6..f73b579 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -346,10 +346,13 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
     # Google Drive
     # this is assuming it is a PDF
     if "drive.google.com/file/d/" in html_url and "/view" in html_url:
-        gdrive_id = html_url.split('/')[5]
+        gdrive_id = html_url.split("/")[5]
         if len(gdrive_id) > 10:
             # https://drive.google.com/uc?export=download&id=15DnbNMZTbRHHqKj8nFaikGSd1-OyoJ24
-            return dict(pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}", technique="google-drive")
+            return dict(
+                pdf_url=f"https://drive.google.com/uc?export=download&id={gdrive_id}",
+                technique="google-drive",
+            )
 
     ### below here we are doing guesses
 
```