From 600ad67925a748200ddf21d5aeabd157d2bb3664 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 26 Oct 2021 13:35:36 -0700
Subject: start handling trivial lint cleanups: unused imports, 'is None', etc

---
 python/sandcrawler/ingest_fileset.py | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)

(limited to 'python/sandcrawler/ingest_fileset.py')

diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 5cbb908..4376c89 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,30 +1,19 @@
-import gzip
 import json
 import sys
 import time
-from collections import namedtuple
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional
 
 import requests
 from selectolax.parser import HTMLParser
 
-from sandcrawler.db import SandcrawlerPostgrestClient
-from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
-from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE
 from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_metadata import (BiblioMetadata, html_extract_biblio,
-                                       html_extract_resources, load_adblock_rules)
-from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError,
-                            ResourceResult, SavePageNowClient, SavePageNowError, WaybackClient,
-                            WaybackContentError, WaybackError, cdx_to_dict,
-                            fix_transfer_encoding)
+from sandcrawler.html_metadata import html_extract_biblio
+from sandcrawler.ia import (CdxApiError, PetaboxError, SavePageNowError, WaybackContentError,
+                            WaybackError, cdx_to_dict, fix_transfer_encoding)
 from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.ingest_html import (WebResource, fetch_html_resources,
-                                     html_extract_body_teixml, html_guess_platform,
-                                     html_guess_scope, quick_fetch_html_resources)
-from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
-from sandcrawler.workers import SandcrawlerWorker
+from sandcrawler.misc import clean_url, gen_file_metadata
 
 MAX_BODY_SIZE_BYTES = 128 * 1024 * 1024
 
@@ -61,7 +50,7 @@ class IngestFilesetWorker(IngestFileWorker):
             return None
         existing = self.pgrest_client.get_ingest_fileset_result(ingest_type, base_url)
         # TODO: filter on more flags?
-        if existing and existing['hit'] == True:
+        if existing and existing['hit'] is True:
             return existing
         else:
             return None
@@ -196,7 +185,7 @@ class IngestFilesetWorker(IngestFileWorker):
 
             # fetch must be a hit if we got this far (though not necessarily an ingest hit!)
             assert resource
-            assert resource.hit == True
+            assert resource.hit is True
             assert resource.terminal_status_code in (200, 226)
 
             if resource.terminal_url:
-- 
cgit v1.2.3