author    Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:22:38 -0700
committer Bryan Newbold <bnewbold@archive.org>  2021-10-26 12:22:38 -0700
commit    3cdf4af9be4c762ff2ed79a57b5ad30637909f1e
tree      b7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python/sandcrawler
parent    f67d870ba4ca9cecd0b75f106335997c813e9df4
python: isort all imports
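
isort sorts imports alphabetically and separates them into sections: standard library first, then third-party packages, then local (first-party) imports, with a blank line between sections. As a minimal sketch of the transformation applied throughout this commit (the module names below are illustrative, not taken from this repository):

    # before: unsorted, sections interleaved
    import sys
    import requests
    from .misc import gen_file_metadata
    import json

    # after running `isort .`: grouped stdlib / third-party / local, alphabetized
    import json
    import sys

    import requests

    from .misc import gen_file_metadata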
Diffstat (limited to 'python/sandcrawler')
 python/sandcrawler/__init__.py           | 19
 python/sandcrawler/db.py                 |  3
 python/sandcrawler/fileset_platforms.py  | 10
 python/sandcrawler/fileset_strategies.py | 13
 python/sandcrawler/fileset_types.py      |  3
 python/sandcrawler/grobid.py             |  4
 python/sandcrawler/html.py               |  2
 python/sandcrawler/html_metadata.py      |  9
 python/sandcrawler/ia.py                 | 22
 python/sandcrawler/ingest_file.py        | 26
 python/sandcrawler/ingest_fileset.py     | 31
 python/sandcrawler/ingest_html.py        | 18
 python/sandcrawler/minio.py              |  2
 python/sandcrawler/misc.py               | 10
 python/sandcrawler/pdfextract.py         | 11
 python/sandcrawler/pdftrio.py            |  3
 python/sandcrawler/persist.py            |  8
 python/sandcrawler/workers.py            | 13
 18 files changed, 108 insertions(+), 99 deletions(-)
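
The re-wrapped long imports in the diff below (continuation lines aligned under the opening parenthesis) are consistent with isort's default GRID multi-line mode at a line length of roughly 110-115 characters; the exact settings presumably live in the repository's setup.cfg or pyproject.toml and are not shown in this commit. An invocation along these lines (the line-length value is an assumption, not taken from the repo) would reproduce the style:

    isort --line-length 115 python/sandcrawler/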
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 4e004be..bf2d92d 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,11 +1,14 @@
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError)
from .ingest_file import IngestFileWorker
from .ingest_fileset import IngestFilesetWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker,
+ PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker)
+from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink,
+ MultiprocessWrapper, ZipfilePusher)
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 9b55c0c..4dcdb0e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,12 +1,13 @@
-import json
import datetime
+import json
from typing import Optional
import psycopg2
import psycopg2.extras
import requests
+
class SandcrawlerPostgrestClient:
def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs):
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 134ae7c..92fed37 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,18 +1,18 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
import urllib.parse
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
-import requests
import internetarchive
+import requests
+from sandcrawler.fileset_types import *
from sandcrawler.html_metadata import BiblioMetadata
from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import *
class FilesetPlatformHelper():
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d12fc15..c9f182c 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,18 +1,19 @@
+import gzip
+import json
import os
+import shutil
import sys
-import json
-import gzip
import time
-import shutil
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import internetarchive
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+ PlatformScopeError)
from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError
+from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding
from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index d7e9d6d..8ea136e 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,9 +1,10 @@
from enum import Enum
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
from pydantic import BaseModel
+
class IngestStrategy(str, Enum):
WebFile = "web-file"
WebFileset = "web-fileset"
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b4215dc..5242b3a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,8 +2,10 @@
import requests
from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+
from .misc import gen_file_metadata
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
class GrobidClient(object):
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index cd0a8e8..6bdebdd 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,7 +1,7 @@
+import json
import re
import sys
-import json
import urllib.parse
from bs4 import BeautifulSoup
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 93c7269..c6725dc 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,16 @@
-import sys
import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
+import braveblock
import dateparser
-from selectolax.parser import HTMLParser
import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
from sandcrawler.misc import url_fuzzy_equal
-
# this is a map of metadata keys to CSS selectors
# sources for this list include:
# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a2ca346..ca1182f 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,29 +3,31 @@
# in `wayback` library. Means we can't run pylint.
# pylint: skip-file
+import datetime
+import gzip
+import http.client
+import json
import os
import sys
import time
-import gzip
-import json
-import requests
-import datetime
import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
from collections import namedtuple
+from typing import Tuple
-import http.client
+import requests
+import urllib3.exceptions
# not sure this will really work. Should go before wayback imports.
http.client._MAXHEADERS = 1000 # type: ignore
-import wayback.exception
from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
+
+import wayback.exception
from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
class SandcrawlerBackoffError(Exception):
"""
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
import gzip
+import json
+import sys
import time
-import base64
import xml.etree.ElementTree
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
-
MAX_BODY_SIZE_BYTES = 128*1024*1024
class IngestFileWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 7c0dfbd..11386df 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,29 +1,28 @@
-import sys
-import json
import gzip
+import json
+import sys
import time
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
-
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.workers import SandcrawlerWorker
MAX_BODY_SIZE_BYTES = 128*1024*1024
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 56a726d..7e6e5e3 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,20 +1,20 @@
+import argparse
+import datetime
import io
-import sys
import json
-import datetime
-import argparse
+import sys
import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
-import trafilatura
import pydantic
+import trafilatura
from selectolax.parser import HTMLParser
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
+ cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..b617178 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,7 +1,7 @@
+import hashlib
import io
import os
-import hashlib
import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 37a2a82..cf8c4bd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,15 +1,15 @@
-import os
import base64
-import magic
-import hashlib
import datetime
+import hashlib
+import os
from typing import Optional
+import magic
import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
def clean_url(s: str) -> str:
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9b4e834..2fb34b8 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,17 +1,16 @@
-import sys
-import json
import datetime
-from io import BytesIO
+import json
+import sys
from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, Optional
import poppler
from PIL import Image
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
# This is a hack to work around timeouts when processing certain PDFs with
# poppler. For some reason, the usual Kafka timeout catcher isn't working on
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..7d03357 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,9 +1,10 @@
import time
+
import requests
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index af702ca..66a36bc 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,15 +20,15 @@ grobid
"""
import os
-from typing import Optional, AnyStr
import xml.etree.ElementTree
+from typing import AnyStr, Optional
-from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import PdfExtractResult
from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
class PersistCdxWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..d8a4016 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,17 @@
-import sys
import json
-import time
+import multiprocessing.pool
import signal
+import sys
+import time
import zipfile
-import requests
-import multiprocessing.pool
from collections import Counter
-from confluent_kafka import Consumer, Producer, KafkaException
+import requests
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import PetaboxError, SandcrawlerBackoffError, WaybackContentError, WaybackError
from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
class SandcrawlerWorker(object):
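
One common follow-up to a sweep like this (not part of this commit) is to enforce the ordering in CI with isort's check mode, which exits non-zero and prints a diff when any file is mis-sorted:

    isort --check-only --diff python/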