From 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 26 Oct 2021 12:22:38 -0700 Subject: python: isort all imports --- python/sandcrawler/ingest_fileset.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'python/sandcrawler/ingest_fileset.py') diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py index 7c0dfbd..11386df 100644 --- a/python/sandcrawler/ingest_fileset.py +++ b/python/sandcrawler/ingest_fileset.py @@ -1,29 +1,28 @@ -import sys -import json import gzip +import json +import sys import time from collections import namedtuple -from typing import Optional, Tuple, Any, Dict, List +from typing import Any, Dict, List, Optional, Tuple import requests from selectolax.parser import HTMLParser -from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError -from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime -from sandcrawler.html import extract_fulltext_url -from sandcrawler.ingest_html import fetch_html_resources, \ - quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \ - WebResource, html_guess_platform - -from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules -from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient +from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper +from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy +from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError +from sandcrawler.html import extract_fulltext_url +from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules +from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient, + SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict, + fix_transfer_encoding) from sandcrawler.ingest_file import IngestFileWorker -from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE -from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE -from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError - +from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform, + html_guess_scope, quick_fetch_html_resources) +from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime +from sandcrawler.workers import SandcrawlerWorker MAX_BODY_SIZE_BYTES = 128*1024*1024 -- cgit v1.2.3