author     Bryan Newbold <bnewbold@archive.org>    2021-10-26 12:22:38 -0700
committer  Bryan Newbold <bnewbold@archive.org>    2021-10-26 12:22:38 -0700
commit     3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (patch)
tree       b7e7e27ff2032c99fd782b3ea40daf1d12f9164e
parent     f67d870ba4ca9cecd0b75f106335997c813e9df4 (diff)
python: isort all imports
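
This commit is a mechanical reformatting pass: imports in every touched module are regrouped and alphabetized by isort, with standard-library, third-party, and first-party (sandcrawler) imports in separate blocks. A minimal sketch of the resulting style is below; the module names are only illustrative, and the exact isort invocation and configuration are assumptions inferred from the diff (the wrapped parenthesized imports run to roughly 110-115 characters, which suggests a non-default line-length setting):

    # standard library, alphabetized
    import json
    import sys
    from typing import Any, Dict, Optional

    # third-party packages
    import requests

    # first-party (this repository)
    from sandcrawler.misc import clean_url, gen_file_metadata

A pass like this is typically reproduced with something close to `isort --line-length 110 python/` run from the repository root; long import lists are then wrapped in parentheses with continuation lines aligned under the opening parenthesis, as seen in python/sandcrawler/__init__.py below.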
-rwxr-xr-x  python/grobid2json.py | 4
-rwxr-xr-x  python/grobid_tool.py | 4
-rwxr-xr-x  python/ia_pdf_match.py | 3
-rwxr-xr-x  python/ingest_tool.py | 6
-rwxr-xr-x  python/pdfextract_tool.py | 4
-rwxr-xr-x  python/pdftrio_tool.py | 4
-rwxr-xr-x  python/persist_tool.py | 2
-rw-r--r--  python/sandcrawler/__init__.py | 19
-rw-r--r--  python/sandcrawler/db.py | 3
-rw-r--r--  python/sandcrawler/fileset_platforms.py | 10
-rw-r--r--  python/sandcrawler/fileset_strategies.py | 13
-rw-r--r--  python/sandcrawler/fileset_types.py | 3
-rw-r--r--  python/sandcrawler/grobid.py | 4
-rw-r--r--  python/sandcrawler/html.py | 2
-rw-r--r--  python/sandcrawler/html_metadata.py | 9
-rw-r--r--  python/sandcrawler/ia.py | 22
-rw-r--r--  python/sandcrawler/ingest_file.py | 26
-rw-r--r--  python/sandcrawler/ingest_fileset.py | 31
-rw-r--r--  python/sandcrawler/ingest_html.py | 18
-rw-r--r--  python/sandcrawler/minio.py | 2
-rw-r--r--  python/sandcrawler/misc.py | 10
-rw-r--r--  python/sandcrawler/pdfextract.py | 11
-rw-r--r--  python/sandcrawler/pdftrio.py | 3
-rw-r--r--  python/sandcrawler/persist.py | 8
-rw-r--r--  python/sandcrawler/workers.py | 13
-rwxr-xr-x  python/sandcrawler_worker.py | 7
-rwxr-xr-x  python/scripts/arabesque2ingestrequest.py | 4
-rwxr-xr-x  python/scripts/archiveorg_fileset.py | 3
-rwxr-xr-x  python/scripts/cdx_collection.py | 8
-rwxr-xr-x  python/scripts/covid2ingestrequest.py | 5
-rwxr-xr-x  python/scripts/deliver_dumpgrobid_to_s3.py | 8
-rwxr-xr-x  python/scripts/deliver_gwb_to_disk.py | 12
-rwxr-xr-x  python/scripts/deliver_gwb_to_s3.py | 12
-rwxr-xr-x  python/scripts/doaj2ingestrequest.py | 7
-rwxr-xr-x  python/scripts/enrich_scored_matches.py | 5
-rwxr-xr-x  python/scripts/filter_grobid_metadata.py | 2
-rwxr-xr-x  python/scripts/filter_groupworks.py | 2
-rwxr-xr-x  python/scripts/filter_scored_matches.py | 2
-rwxr-xr-x  python/scripts/grobid_affiliations.py | 3
-rwxr-xr-x  python/scripts/import_grobid_metadata.py | 4
-rwxr-xr-x  python/scripts/ingestrequest_row2json.py | 4
-rwxr-xr-x  python/scripts/manifest_converter.py | 2
-rwxr-xr-x  python/scripts/oai2ingestrequest.py | 5
-rwxr-xr-x  python/scripts/pdf_thumbnail.py | 1
-rwxr-xr-x  python/scripts/unpaywall2ingestrequest.py | 5
-rw-r--r--  python/tests/test_grobid.py | 8
-rw-r--r--  python/tests/test_grobid2json.py | 4
-rw-r--r--  python/tests/test_html.py | 2
-rw-r--r--  python/tests/test_html_ingest.py | 1
-rw-r--r--  python/tests/test_html_metadata.py | 1
-rw-r--r--  python/tests/test_ingest.py | 7
-rw-r--r--  python/tests/test_live_wayback.py | 4
-rw-r--r--  python/tests/test_misc.py | 3
-rw-r--r--  python/tests/test_pdfextract.py | 10
-rw-r--r--  python/tests/test_pushers.py | 2
-rw-r--r--  python/tests/test_savepagenow.py | 4
-rw-r--r--  python/tests/test_wayback.py | 4
57 files changed, 207 insertions, 178 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index a22d47d..b4bfe2b 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -23,11 +23,11 @@ A flag can be specified to disable copyright encumbered bits (--no-emcumbered):
Prints JSON to stdout, errors to stderr
"""
+import argparse
import io
import json
-import argparse
import xml.etree.ElementTree as ET
-from typing import List, Any, Dict, AnyStr, Optional
+from typing import Any, AnyStr, Dict, List, Optional
xml_ns = "http://www.w3.org/XML/1998/namespace"
ns = "http://www.tei-c.org/ns/1.0"
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 2a1d8b5..0084330 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -9,10 +9,10 @@ Example of large parallel run, locally:
cat /srv/sandcrawler/tasks/ungrobided.2019-09-23.json | pv -l | parallel -j30 --pipe ./grobid_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --grobid-host http://localhost:8070 -j0 extract-json -
"""
-import sys
-import json
import argparse
import datetime
+import json
+import sys
from grobid2json import teixml2json
from sandcrawler import *
diff --git a/python/ia_pdf_match.py b/python/ia_pdf_match.py
index 20c65bb..137110c 100755
--- a/python/ia_pdf_match.py
+++ b/python/ia_pdf_match.py
@@ -22,8 +22,9 @@ When invoking import matched, be sure to:
--default-mimetype application/pdf
"""
-import sys
import json
+import sys
+
def parse(obj):
if obj['metadata']['identifier'].endswith('-test') or obj['metadata'].get('test'):
diff --git a/python/ingest_tool.py b/python/ingest_tool.py
index fdb5b48..c0ef5aa 100755
--- a/python/ingest_tool.py
+++ b/python/ingest_tool.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
-import sys
-import json
import argparse
-
+import json
+import sys
from http.server import HTTPServer
+
from sandcrawler.ingest_file import IngestFileRequestHandler, IngestFileWorker
from sandcrawler.ingest_fileset import IngestFilesetWorker
diff --git a/python/pdfextract_tool.py b/python/pdfextract_tool.py
index 10a0f48..89ecf1c 100755
--- a/python/pdfextract_tool.py
+++ b/python/pdfextract_tool.py
@@ -4,10 +4,10 @@
KNOWN ISSUE: thumbnails are not published to kafka in multi-processing mode
"""
-import sys
-import json
import argparse
import datetime
+import json
+import sys
from grobid2json import teixml2json
from sandcrawler import *
diff --git a/python/pdftrio_tool.py b/python/pdftrio_tool.py
index 5cffa8c..e195bc7 100755
--- a/python/pdftrio_tool.py
+++ b/python/pdftrio_tool.py
@@ -9,10 +9,10 @@ Example of large parallel run, locally:
cat /srv/sandcrawler/tasks/something.cdx | pv -l | parallel -j30 --pipe ./pdftrio_tool.py --kafka-env prod --kafka-hosts wbgrp-svc263.us.archive.org:9092,wbgrp-svc284.us.archive.org:9092,wbgrp-svc285.us.archive.org:9092 --kafka-mode --pdftrio-host http://localhost:3939 -j0 classify-pdf-json -
"""
-import sys
-import json
import argparse
import datetime
+import json
+import sys
from sandcrawler import *
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 69e9374..d52f7c1 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -7,9 +7,9 @@ Normally this is done by workers (in sandcrawler_worker.py) consuming from
Kafka feeds, but sometimes we have bulk processing output we want to backfill.
"""
+import argparse
import os
import sys
-import argparse
from sandcrawler import *
from sandcrawler.persist import *
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 4e004be..bf2d92d 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,11 +1,14 @@
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError)
from .ingest_file import IngestFileWorker
from .ingest_fileset import IngestFilesetWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker,
+ PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker)
+from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink,
+ MultiprocessWrapper, ZipfilePusher)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 9b55c0c..4dcdb0e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,12 +1,13 @@
-import json
import datetime
+import json
from typing import Optional
import psycopg2
import psycopg2.extras
import requests
+
class SandcrawlerPostgrestClient:
def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs):
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 134ae7c..92fed37 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,18 +1,18 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
import urllib.parse
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
-import requests
import internetarchive
+import requests
+from sandcrawler.fileset_types import *
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import *
class FilesetPlatformHelper():
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d12fc15..c9f182c 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,18 +1,19 @@
+import gzip
+import json
import os
+import shutil
import sys
-import json
-import gzip
import time
-import shutil
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import internetarchive
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+ PlatformScopeError)
from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError
+from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index d7e9d6d..8ea136e 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,9 +1,10 @@
from enum import Enum
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel
+
class IngestStrategy(str, Enum):
WebFile = "web-file"
WebFileset = "web-fileset"
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b4215dc..5242b3a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,8 +2,10 @@
import requests
from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+
from .misc import gen_file_metadata
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
class GrobidClient(object):
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index cd0a8e8..6bdebdd 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,7 +1,7 @@
+import json
import re
import sys
-import json
import urllib.parse
from bs4 import BeautifulSoup
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 93c7269..c6725dc 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,16 @@
-import sys
import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+import braveblock
import dateparser
-from selectolax.parser import HTMLParser
import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
from sandcrawler.misc import url_fuzzy_equal
-
# this is a map of metadata keys to CSS selectors
# sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a2ca346..ca1182f 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,29 +3,31 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import datetime
+import gzip
+import http.client
+import json
import os
import sys
import time
-import gzip
-import json
-import requests
-import datetime
import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
from collections import namedtuple
+from typing import Tuple
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
-import wayback.exception
from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
+
+import wayback.exception
from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
import gzip
+import json
+import sys
import time
-import base64
import xml.etree.ElementTree
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
-
MAX_BODY_SIZE_BYTES = 128*1024*1024
class IngestFileWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 7c0dfbd..11386df 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,29 +1,28 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
-
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.workers import SandcrawlerWorker
MAX_BODY_SIZE_BYTES = 128*1024*1024
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 56a726d..7e6e5e3 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,20 +1,20 @@
+import argparse
+import datetime
import io
-import sys
import json
-import datetime
-import argparse
+import sys
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
-import trafilatura
import pydantic
+import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
+ cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..b617178 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,7 +1,7 @@
+import hashlib
import io
import os
-import hashlib
import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 37a2a82..cf8c4bd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,15 +1,15 @@
-import os
import base64
-import magic
-import hashlib
import datetime
+import hashlib
+import os
from typing import Optional
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def clean_url(s: str) -> str:
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9b4e834..2fb34b8 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,17 +1,16 @@
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, Optional
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..7d03357 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,9 +1,10 @@
import time
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index af702ca..66a36bc 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,15 +20,15 @@ grobid
"""
import os
-from typing import Optional, AnyStr
import xml.etree.ElementTree
+from typing import AnyStr, Optional
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import PdfExtractResult
from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..d8a4016 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,17 @@
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+import requests
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import PetaboxError, SandcrawlerBackoffError, WaybackContentError, WaybackError
from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
class SandcrawlerWorker(object):
diff --git a/python/sandcrawler_worker.py b/python/sandcrawler_worker.py
index 8e275cf..e185fad 100755
--- a/python/sandcrawler_worker.py
+++ b/python/sandcrawler_worker.py
@@ -6,14 +6,15 @@ Outputs might either be pushed back into Kafka, or directly into sandcrawler-db
or S3 (SeaweedFS).
"""
-import os
-import sys
import argparse
import datetime
+import os
+import sys
+
import raven
from sandcrawler import *
-from sandcrawler.persist import PersistXmlDocWorker, PersistHtmlTeiXmlWorker
+from sandcrawler.persist import PersistHtmlTeiXmlWorker, PersistXmlDocWorker
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
try:
diff --git a/python/scripts/arabesque2ingestrequest.py b/python/scripts/arabesque2ingestrequest.py
index 03a1f29..69fe320 100755
--- a/python/scripts/arabesque2ingestrequest.py
+++ b/python/scripts/arabesque2ingestrequest.py
@@ -12,9 +12,9 @@ Run like:
Can then run through requests using that tool, or dump into kafka queue.
"""
-import sys
-import json
import argparse
+import json
+import sys
def run(args):
diff --git a/python/scripts/archiveorg_fileset.py b/python/scripts/archiveorg_fileset.py
index 0e507eb..86ca062 100755
--- a/python/scripts/archiveorg_fileset.py
+++ b/python/scripts/archiveorg_fileset.py
@@ -9,13 +9,12 @@ TODO:
- should this check the item type?
"""
-import sys
import json
+import sys
from typing import Any
import internetarchive
-
FORMAT_TO_MIMETYPE = {
'BZIP': 'application/x-bzip',
'BZIP2': 'application/x-bzip2',
diff --git a/python/scripts/cdx_collection.py b/python/scripts/cdx_collection.py
index e867b21..5e33def 100755
--- a/python/scripts/cdx_collection.py
+++ b/python/scripts/cdx_collection.py
@@ -11,12 +11,14 @@ Call with a collection name:
"""
import os
-import sys
import shutil
-import tempfile
-import requests
import subprocess
+import sys
+import tempfile
+
import internetarchive as ia
+import requests
+
def run():
diff --git a/python/scripts/covid2ingestrequest.py b/python/scripts/covid2ingestrequest.py
index 33c425d..1b7c85c 100755
--- a/python/scripts/covid2ingestrequest.py
+++ b/python/scripts/covid2ingestrequest.py
@@ -4,9 +4,10 @@
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
diff --git a/python/scripts/deliver_dumpgrobid_to_s3.py b/python/scripts/deliver_dumpgrobid_to_s3.py
index 86b3b35..62a85e6 100755
--- a/python/scripts/deliver_dumpgrobid_to_s3.py
+++ b/python/scripts/deliver_dumpgrobid_to_s3.py
@@ -23,12 +23,12 @@ Requires:
- boto3 (AWS S3 client library)
"""
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
import boto3
diff --git a/python/scripts/deliver_gwb_to_disk.py b/python/scripts/deliver_gwb_to_disk.py
index 3dcf962..ab1906a 100755
--- a/python/scripts/deliver_gwb_to_disk.py
+++ b/python/scripts/deliver_gwb_to_disk.py
@@ -7,19 +7,19 @@ Tool for bulk copying of PDFs (or other files) from GWB to local disk.
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import raven
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+from wayback.resourcestore import ResourceStore
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
diff --git a/python/scripts/deliver_gwb_to_s3.py b/python/scripts/deliver_gwb_to_s3.py
index 39ac000..f103205 100755
--- a/python/scripts/deliver_gwb_to_s3.py
+++ b/python/scripts/deliver_gwb_to_s3.py
@@ -33,20 +33,20 @@ Requires:
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
-import os
-import sys
-import json
+import argparse
import base64
import hashlib
-import argparse
+import json
+import os
+import sys
from collections import Counter
+from http.client import IncompleteRead
import boto3
import raven
import wayback.exception
-from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
from gwb.loader import CDXLoaderFactory
+from wayback.resourcestore import ResourceStore
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index a7214d0..15b30a0 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -9,11 +9,12 @@ in the HTML headers and adds an ingest request on that basis. Or even just run
the re-ingest in-process and publish a second result.
"""
-import sys
-import json
import argparse
+import json
+import sys
+from typing import List, Optional
+
import urlcanon
-from typing import Optional, List
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
diff --git a/python/scripts/enrich_scored_matches.py b/python/scripts/enrich_scored_matches.py
index 9fe1499..3085346 100755
--- a/python/scripts/enrich_scored_matches.py
+++ b/python/scripts/enrich_scored_matches.py
@@ -17,9 +17,10 @@ And outputs JSON objects that are can be imported into fatcat with the
No dependencies (only python3 stdlib)
"""
-import sys
-import json
import base64
+import json
+import sys
+
def run():
for line in sys.stdin:
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index dc4bea7..d0666ce 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
-import sys
import json
+import sys
with open('title_slug_denylist.txt', 'r') as f:
TITLE_DENYLIST = [l.strip() for l in f]
diff --git a/python/scripts/filter_groupworks.py b/python/scripts/filter_groupworks.py
index bbba770..494da71 100755
--- a/python/scripts/filter_groupworks.py
+++ b/python/scripts/filter_groupworks.py
@@ -18,8 +18,8 @@ Note: the actual importer/merger should filter the following patterns out:
- dates differ (not just year)
"""
-import sys
import json
+import sys
# out of 1000
SCORE_THRESHOLD = 900
diff --git a/python/scripts/filter_scored_matches.py b/python/scripts/filter_scored_matches.py
index 3654b87..abf81bd 100755
--- a/python/scripts/filter_scored_matches.py
+++ b/python/scripts/filter_scored_matches.py
@@ -10,8 +10,8 @@ matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
-import sys
import json
+import sys
# out of 1000
score_threshold = 900
diff --git a/python/scripts/grobid_affiliations.py b/python/scripts/grobid_affiliations.py
index 79feac1..d391f60 100755
--- a/python/scripts/grobid_affiliations.py
+++ b/python/scripts/grobid_affiliations.py
@@ -10,11 +10,12 @@ Run in bulk like:
ls /bigger/unpaywall-transfer/2019-07-17-1741.30-dumpgrobidxml/part*gz | parallel --progress -j8 'zcat {} | ./grobid_affiliations.py > {}.affiliations'
"""
-import sys
import json
+import sys
from grobid2json import teixml2json
+
def parse_hbase(line):
line = line.split('\t')
assert len(line) == 2
diff --git a/python/scripts/import_grobid_metadata.py b/python/scripts/import_grobid_metadata.py
index d01b526..8aee0be 100755
--- a/python/scripts/import_grobid_metadata.py
+++ b/python/scripts/import_grobid_metadata.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python3
-import sys
-import json
import datetime
+import json
+import sys
MAX_ABSTRACT_BYTES=4096
diff --git a/python/scripts/ingestrequest_row2json.py b/python/scripts/ingestrequest_row2json.py
index 494ec7a..acba2a8 100755
--- a/python/scripts/ingestrequest_row2json.py
+++ b/python/scripts/ingestrequest_row2json.py
@@ -7,9 +7,9 @@ format) back in to regular ingest request JSON.
The only difference is the name and location of some optional keys.
"""
-import sys
-import json
import argparse
+import json
+import sys
def transform(row):
diff --git a/python/scripts/manifest_converter.py b/python/scripts/manifest_converter.py
index 35cee5b..8267003 100755
--- a/python/scripts/manifest_converter.py
+++ b/python/scripts/manifest_converter.py
@@ -10,9 +10,9 @@ This was used to convert this manifest:
to JSON format for fast fatcat importing.
"""
-import sys
import json
import sqlite3
+import sys
# iterate over rows in files metadata...
# 1. select all identified DOIs
diff --git a/python/scripts/oai2ingestrequest.py b/python/scripts/oai2ingestrequest.py
index 916f41c..315b8d2 100755
--- a/python/scripts/oai2ingestrequest.py
+++ b/python/scripts/oai2ingestrequest.py
@@ -6,9 +6,10 @@ Transform an OAI-PMH bulk dump (JSON) into ingest requests.
Eg: https://archive.org/details/oai_harvest_20200215
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
diff --git a/python/scripts/pdf_thumbnail.py b/python/scripts/pdf_thumbnail.py
index af08db6..71fbe54 100755
--- a/python/scripts/pdf_thumbnail.py
+++ b/python/scripts/pdf_thumbnail.py
@@ -7,6 +7,7 @@ Originally used to benchmark and compare file size/quality.
"""
import sys
+
import poppler
from PIL import Image
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 5536e6c..590b429 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -4,9 +4,10 @@
Transform an unpaywall dump (JSON) into ingest requests.
"""
-import sys
-import json
import argparse
+import json
+import sys
+
import urlcanon
DOMAIN_BLOCKLIST = [
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 36d90ef..7d950df 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -1,11 +1,11 @@
-import pytest
import struct
-import responses
-from sandcrawler import GrobidClient, GrobidWorker, CdxLinePusher, BlackholeSink, WaybackClient
-from test_wayback import wayback_client, cdx_client
+import pytest
+import responses
+from test_wayback import cdx_client, wayback_client
+from sandcrawler import BlackholeSink, CdxLinePusher, GrobidClient, GrobidWorker, WaybackClient
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
diff --git a/python/tests/test_grobid2json.py b/python/tests/test_grobid2json.py
index 8497b10..b8999b1 100644
--- a/python/tests/test_grobid2json.py
+++ b/python/tests/test_grobid2json.py
@@ -1,7 +1,9 @@
-import xml
import json
+import xml
+
import pytest
+
from grobid2json import *
diff --git a/python/tests/test_html.py b/python/tests/test_html.py
index 9a81852..d4bffc1 100644
--- a/python/tests/test_html.py
+++ b/python/tests/test_html.py
@@ -1,10 +1,12 @@
import json
+
import pytest
import responses
from sandcrawler.html import extract_fulltext_url
+
def test_extract_fulltext_url():
resp = extract_fulltext_url("asdf", b"asdf")
diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py
index efd1ddf..943e5da 100644
--- a/python/tests/test_html_ingest.py
+++ b/python/tests/test_html_ingest.py
@@ -1,5 +1,6 @@
import datetime
+
import pytest
from sandcrawler.ingest_html import *
diff --git a/python/tests/test_html_metadata.py b/python/tests/test_html_metadata.py
index bf26a98..7f35d55 100644
--- a/python/tests/test_html_metadata.py
+++ b/python/tests/test_html_metadata.py
@@ -1,5 +1,6 @@
import datetime
+
import pytest
from sandcrawler.html_metadata import *
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index b51f721..0965fcb 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -1,12 +1,13 @@
import json
+
import pytest
import responses
+from test_grobid import REAL_TEI_XML
+from test_savepagenow import *
+from test_wayback import *
from sandcrawler import *
-from test_wayback import *
-from test_savepagenow import *
-from test_grobid import REAL_TEI_XML
@pytest.fixture
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index 429c6b0..b501dc3 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -8,9 +8,11 @@ Simply uncomment lines to run.
"""
import json
+
import pytest
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata
+from sandcrawler import (CdxApiClient, CdxApiError, CdxPartial, PetaboxError, SavePageNowClient, SavePageNowError,
+ WaybackClient, WaybackError, gen_file_metadata)
@pytest.fixture
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index bd18e5c..0788c38 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,8 @@
import pytest
-from sandcrawler import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, clean_url
+from sandcrawler import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_line
+
def test_gen_file_metadata():
diff --git a/python/tests/test_pdfextract.py b/python/tests/test_pdfextract.py
index 255e3fb..1d334d6 100644
--- a/python/tests/test_pdfextract.py
+++ b/python/tests/test_pdfextract.py
@@ -1,13 +1,13 @@
-import pytest
import struct
-import responses
+
import poppler
+import pytest
+import responses
+from test_wayback import cdx_client, wayback_client
-from sandcrawler import PdfExtractWorker, PdfExtractBlobWorker, CdxLinePusher, BlackholeSink, WaybackClient
+from sandcrawler import BlackholeSink, CdxLinePusher, PdfExtractBlobWorker, PdfExtractWorker, WaybackClient
from sandcrawler.pdfextract import process_pdf
-from test_wayback import wayback_client, cdx_client
-
FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index 52f26c0..62fa515 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler.workers import CdxLinePusher, BlackholeSink
+from sandcrawler.workers import BlackholeSink, CdxLinePusher
def test_cdx_line_pusher():
diff --git a/python/tests/test_savepagenow.py b/python/tests/test_savepagenow.py
index 63dd887..f3fbfda 100644
--- a/python/tests/test_savepagenow.py
+++ b/python/tests/test_savepagenow.py
@@ -1,11 +1,11 @@
import json
+
import pytest
import responses
-
-from sandcrawler import SavePageNowClient, SavePageNowError, CdxPartial
from test_wayback import *
+from sandcrawler import CdxPartial, SavePageNowClient, SavePageNowError
TARGET = "http://dummy-target.dummy"
JOB_ID = "e70f33c7-9eca-4c88-826d-26930564d7c8"
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 6bc1ca4..83311b9 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,10 +1,10 @@
import json
+
import pytest
import responses
-from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
-
+from sandcrawler import CdxApiClient, CdxApiError, PetaboxError, WaybackClient, WaybackError
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"