author    Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:22:38 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:22:38 -0700
commit    3cdf4af9be4c762ff2ed79a57b5ad30637909f1e
tree      b7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python/sandcrawler
parent    f67d870ba4ca9cecd0b75f106335997c813e9df4
python: isort all imports
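
isort sorts imports alphabetically and separates them into sections: standard library first, then third-party packages, then local (first-party) imports, with a blank line between sections. As a minimal sketch of the transformation applied throughout this commit (the module names below are illustrative, not taken from this repository):

    # before: unsorted, sections interleaved
    import sys
    import requests
    from .misc import gen_file_metadata
    import json

    # after running `isort .`: grouped stdlib / third-party / local, alphabetized
    import json
    import sys

    import requests

    from .misc import gen_file_metadata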
Diffstat (limited to 'python/sandcrawler')
 python/sandcrawler/__init__.py           | 19
 python/sandcrawler/db.py                 |  3
 python/sandcrawler/fileset_platforms.py  | 10
 python/sandcrawler/fileset_strategies.py | 13
 python/sandcrawler/fileset_types.py      |  3
 python/sandcrawler/grobid.py             |  4
 python/sandcrawler/html.py               |  2
 python/sandcrawler/html_metadata.py      |  9
 python/sandcrawler/ia.py                 | 22
 python/sandcrawler/ingest_file.py        | 26
 python/sandcrawler/ingest_fileset.py     | 31
 python/sandcrawler/ingest_html.py        | 18
 python/sandcrawler/minio.py              |  2
 python/sandcrawler/misc.py               | 10
 python/sandcrawler/pdfextract.py         | 11
 python/sandcrawler/pdftrio.py            |  3
 python/sandcrawler/persist.py            |  8
 python/sandcrawler/workers.py            | 13
 18 files changed, 108 insertions(+), 99 deletions(-)
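
The re-wrapped long imports in the diff below (continuation lines aligned under the opening parenthesis) are consistent with isort's default GRID multi-line mode at a line length of roughly 110-115 characters; the exact settings presumably live in the repository's setup.cfg or pyproject.toml and are not shown in this commit. An invocation along these lines (the line-length value is an assumption, not taken from the repo) would reproduce the style:

    isort --line-length 115 python/sandcrawler/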
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 4e004be..bf2d92d 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,11 +1,14 @@
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError)
from .ingest_file import IngestFileWorker
from .ingest_fileset import IngestFilesetWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker,
+ PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker)
+from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink,
+ MultiprocessWrapper, ZipfilePusher)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 9b55c0c..4dcdb0e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,12 +1,13 @@
-import json
import datetime
+import json
from typing import Optional
import psycopg2
import psycopg2.extras
import requests
+
class SandcrawlerPostgrestClient:
def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs):
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 134ae7c..92fed37 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,18 +1,18 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
import urllib.parse
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
-import requests
import internetarchive
+import requests
+from sandcrawler.fileset_types import *
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import *
class FilesetPlatformHelper():
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d12fc15..c9f182c 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,18 +1,19 @@
+import gzip
+import json
import os
+import shutil
import sys
-import json
-import gzip
import time
-import shutil
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import internetarchive
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+ PlatformScopeError)
from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError
+from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index d7e9d6d..8ea136e 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,9 +1,10 @@
from enum import Enum
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel
+
class IngestStrategy(str, Enum):
WebFile = "web-file"
WebFileset = "web-fileset"
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b4215dc..5242b3a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,8 +2,10 @@
import requests
from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+
from .misc import gen_file_metadata
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
class GrobidClient(object):
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index cd0a8e8..6bdebdd 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,7 +1,7 @@
+import json
import re
import sys
-import json
import urllib.parse
from bs4 import BeautifulSoup
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 93c7269..c6725dc 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,16 @@
-import sys
import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+import braveblock
import dateparser
-from selectolax.parser import HTMLParser
import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
from sandcrawler.misc import url_fuzzy_equal
-
# this is a map of metadata keys to CSS selectors
# sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a2ca346..ca1182f 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,29 +3,31 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import datetime
+import gzip
+import http.client
+import json
import os
import sys
import time
-import gzip
-import json
-import requests
-import datetime
import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
from collections import namedtuple
+from typing import Tuple
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
-import wayback.exception
from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
+
+import wayback.exception
from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
import gzip
+import json
+import sys
import time
-import base64
import xml.etree.ElementTree
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
-
MAX_BODY_SIZE_BYTES = 128*1024*1024
class IngestFileWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 7c0dfbd..11386df 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,29 +1,28 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
-
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.workers import SandcrawlerWorker
MAX_BODY_SIZE_BYTES = 128*1024*1024
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 56a726d..7e6e5e3 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,20 +1,20 @@
+import argparse
+import datetime
import io
-import sys
import json
-import datetime
-import argparse
+import sys
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
-import trafilatura
import pydantic
+import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
+ cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..b617178 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,7 +1,7 @@
+import hashlib
import io
import os
-import hashlib
import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 37a2a82..cf8c4bd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,15 +1,15 @@
-import os
import base64
-import magic
-import hashlib
import datetime
+import hashlib
+import os
from typing import Optional
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def clean_url(s: str) -> str:
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9b4e834..2fb34b8 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,17 +1,16 @@
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, Optional
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..7d03357 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,9 +1,10 @@
import time
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index af702ca..66a36bc 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,15 +20,15 @@ grobid
"""
import os
-from typing import Optional, AnyStr
import xml.etree.ElementTree
+from typing import AnyStr, Optional
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import PdfExtractResult
from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..d8a4016 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,17 @@
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+import requests
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import PetaboxError, SandcrawlerBackoffError, WaybackContentError, WaybackError
from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
class SandcrawlerWorker(object):
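
One common follow-up to a sweep like this (not part of this commit) is to enforce the ordering in CI with isort's check mode, which exits non-zero and prints a diff when any file is mis-sorted:

    isort --check-only --diff python/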