aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_fileset.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-26 12:22:38 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-26 12:22:38 -0700
commit3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (patch)
treeb7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python/sandcrawler/ingest_fileset.py
parentf67d870ba4ca9cecd0b75f106335997c813e9df4 (diff)
downloadsandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.tar.gz
sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.zip
python: isort all imports
Diffstat (limited to 'python/sandcrawler/ingest_fileset.py')
-rw-r--r--python/sandcrawler/ingest_fileset.py31
1 files changed, 15 insertions, 16 deletions
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 7c0dfbd..11386df 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,29 +1,28 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
-
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.workers import SandcrawlerWorker
MAX_BODY_SIZE_BYTES = 128*1024*1024