From 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 26 Oct 2021 12:22:38 -0700
Subject: python: isort all imports

---
 python/sandcrawler/__init__.py           | 19 +++++++++++--------
 python/sandcrawler/db.py                 |  3 ++-
 python/sandcrawler/fileset_platforms.py  | 10 +++++-----
 python/sandcrawler/fileset_strategies.py | 13 +++++++------
 python/sandcrawler/fileset_types.py      |  3 ++-
 python/sandcrawler/grobid.py             |  4 +++-
 python/sandcrawler/html.py               |  2 +-
 python/sandcrawler/html_metadata.py      |  9 ++++-----
 python/sandcrawler/ia.py                 | 22 ++++++++++++----------
 python/sandcrawler/ingest_file.py        | 26 +++++++++++++-------------
 python/sandcrawler/ingest_fileset.py     | 31 +++++++++++++++----------------
 python/sandcrawler/ingest_html.py        | 18 +++++++++---------
 python/sandcrawler/minio.py              |  2 +-
 python/sandcrawler/misc.py               | 10 +++++-----
 python/sandcrawler/pdfextract.py         | 11 +++++------
 python/sandcrawler/pdftrio.py            |  3 ++-
 python/sandcrawler/persist.py            |  8 ++++----
 python/sandcrawler/workers.py            | 13 +++++++------
 18 files changed, 108 insertions(+), 99 deletions(-)

(limited to 'python/sandcrawler')

diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 4e004be..bf2d92d 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,11 +1,14 @@
-from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
-from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, gen_file_metadata_path, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
-from .workers import KafkaSink, KafkaCompressSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
-from .ia import WaybackClient, WaybackError, WaybackContentError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
+from .db import SandcrawlerPostgresClient, SandcrawlerPostgrestClient
+from .grobid import GrobidBlobWorker, GrobidClient, GrobidWorker
+from .ia import (CdxApiClient, CdxApiError, CdxPartial, CdxRow, PetaboxError, ResourceResult, SavePageNowClient,
+                 SavePageNowError, WarcResource, WaybackClient, WaybackContentError, WaybackError)
 from .ingest_file import IngestFileWorker
 from .ingest_fileset import IngestFilesetWorker
-from .persist import PersistCdxWorker, PersistIngestFileResultWorker, PersistGrobidWorker, PersistGrobidDiskWorker, PersistPdfTrioWorker, PersistIngestRequestWorker, PersistPdfTextWorker, PersistThumbnailWorker
-from .db import SandcrawlerPostgrestClient, SandcrawlerPostgresClient
-from .pdfextract import PdfExtractWorker, PdfExtractBlobWorker
+from .misc import b32_hex, clean_url, gen_file_metadata, gen_file_metadata_path, parse_cdx_datetime, parse_cdx_line
+from .pdfextract import PdfExtractBlobWorker, PdfExtractWorker
+from .pdftrio import PdfTrioBlobWorker, PdfTrioClient, PdfTrioWorker
+from .persist import (PersistCdxWorker, PersistGrobidDiskWorker, PersistGrobidWorker, PersistIngestFileResultWorker,
+                      PersistIngestRequestWorker, PersistPdfTextWorker, PersistPdfTrioWorker, PersistThumbnailWorker)
+from .workers import (BlackholeSink, CdxLinePusher, JsonLinePusher, KafkaCompressSink, KafkaJsonPusher, KafkaSink,
+                      MultiprocessWrapper, ZipfilePusher)
 
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 9b55c0c..4dcdb0e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -1,12 +1,13 @@
-import json
 import datetime
+import json
 from typing import Optional
 
 import psycopg2
 import psycopg2.extras
 import requests
 
+
 class SandcrawlerPostgrestClient:
 
     def __init__(self, api_url="http://wbgrp-svc506.us.archive.org:3030", **kwargs):
diff --git a/python/sandcrawler/fileset_platforms.py b/python/sandcrawler/fileset_platforms.py
index 134ae7c..92fed37 100644
--- a/python/sandcrawler/fileset_platforms.py
+++ b/python/sandcrawler/fileset_platforms.py
@@ -1,18 +1,18 @@
-import sys
-import json
 import gzip
+import json
+import sys
 import time
 import urllib.parse
 from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
-import requests
 import internetarchive
+import requests
 
+from sandcrawler.fileset_types import *
 from sandcrawler.html_metadata import BiblioMetadata
 from sandcrawler.ia import ResourceResult
-from sandcrawler.fileset_types import *
 
 
 class FilesetPlatformHelper():
diff --git a/python/sandcrawler/fileset_strategies.py b/python/sandcrawler/fileset_strategies.py
index d12fc15..c9f182c 100644
--- a/python/sandcrawler/fileset_strategies.py
+++ b/python/sandcrawler/fileset_strategies.py
@@ -1,18 +1,19 @@
+import gzip
+import json
 import os
+import shutil
 import sys
-import json
-import gzip
 import time
-import shutil
 from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 import internetarchive
 
+from sandcrawler.fileset_types import (ArchiveStrategyResult, FilesetManifestFile, FilesetPlatformItem, IngestStrategy,
+                                       PlatformScopeError)
 from sandcrawler.html_metadata import BiblioMetadata
-from sandcrawler.ia import ResourceResult, WaybackClient, SavePageNowClient, fix_transfer_encoding
-from sandcrawler.fileset_types import IngestStrategy, FilesetManifestFile, FilesetPlatformItem, ArchiveStrategyResult, PlatformScopeError
+from sandcrawler.ia import ResourceResult, SavePageNowClient, WaybackClient, fix_transfer_encoding
 from sandcrawler.misc import gen_file_metadata, gen_file_metadata_path, sanitize_fs_path
diff --git a/python/sandcrawler/fileset_types.py b/python/sandcrawler/fileset_types.py
index d7e9d6d..8ea136e 100644
--- a/python/sandcrawler/fileset_types.py
+++ b/python/sandcrawler/fileset_types.py
@@ -1,9 +1,10 @@
 from enum import Enum
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 from pydantic import BaseModel
 
+
 class IngestStrategy(str, Enum):
     WebFile = "web-file"
     WebFileset = "web-fileset"
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index b4215dc..5242b3a 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -2,8 +2,10 @@
 import requests
 
 from grobid2json import teixml2json
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+
 from .misc import gen_file_metadata
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
+
 
 class GrobidClient(object):
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index cd0a8e8..6bdebdd 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -1,7 +1,7 @@
+import json
 import re
 import sys
-import json
 import urllib.parse
 
 from bs4 import BeautifulSoup
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 93c7269..c6725dc 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,16 @@
-import sys
 import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
 import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
 
+import braveblock
 import dateparser
-from selectolax.parser import HTMLParser
 import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
 
 from sandcrawler.misc import url_fuzzy_equal
 
-
 # this is a map of metadata keys to CSS selectors
 # sources for this list include:
 # - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index a2ca346..ca1182f 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -3,29 +3,31 @@
 # in `wayback` library. Means we can't run pylint.
 # pylint: skip-file
 
+import datetime
+import gzip
+import http.client
+import json
 import os
 import sys
 import time
-import gzip
-import json
-import requests
-import datetime
 import urllib.parse
-import urllib3.exceptions
-from typing import Tuple
 from collections import namedtuple
+from typing import Tuple
 
-import http.client
+import requests
+import urllib3.exceptions
 
 # not sure this will really work. Should go before wayback imports.
 http.client._MAXHEADERS = 1000  # type: ignore
 
-import wayback.exception
 from http.client import IncompleteRead
-from wayback.resourcestore import ResourceStore
+
+import wayback.exception
 from gwb.loader import CDXLoaderFactory3
+from wayback.resourcestore import ResourceStore
+
+from .misc import b32_hex, clean_url, gen_file_metadata, requests_retry_session
 
-from .misc import b32_hex, requests_retry_session, gen_file_metadata, clean_url
 
 class SandcrawlerBackoffError(Exception):
     """
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
 import gzip
+import json
+import sys
 import time
-import base64
 import xml.etree.ElementTree
 from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
 from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
 
 import requests
 from selectolax.parser import HTMLParser
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
 from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
-    quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
-    WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+                            SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+                            fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+                                     html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
 from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.xml import xml_reserialize
 
-
 MAX_BODY_SIZE_BYTES = 128*1024*1024
 
 class IngestFileWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/ingest_fileset.py b/python/sandcrawler/ingest_fileset.py
index 7c0dfbd..11386df 100644
--- a/python/sandcrawler/ingest_fileset.py
+++ b/python/sandcrawler/ingest_fileset.py
@@ -1,29 +1,28 @@
-import sys
-import json
 import gzip
+import json
+import sys
 import time
 from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple
 
 import requests
 from selectolax.parser import HTMLParser
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
-from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
-    quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
-    WebResource, html_guess_platform
-
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
+from sandcrawler.fileset_platforms import DATASET_PLATFORM_HELPER_TABLE, FilesetPlatformHelper
+from sandcrawler.fileset_strategies import FILESET_STRATEGY_HELPER_TABLE, FilesetIngestStrategy
+from sandcrawler.fileset_types import PlatformRestrictedError, PlatformScopeError
+from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+                            SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+                            fix_transfer_encoding)
 from sandcrawler.ingest_file import IngestFileWorker
-from sandcrawler.fileset_platforms import FilesetPlatformHelper, DATASET_PLATFORM_HELPER_TABLE
-from sandcrawler.fileset_strategies import FilesetIngestStrategy, FILESET_STRATEGY_HELPER_TABLE
-from sandcrawler.fileset_types import PlatformScopeError, PlatformRestrictedError
-
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+                                     html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.workers import SandcrawlerWorker
 
 MAX_BODY_SIZE_BYTES = 128*1024*1024
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 56a726d..7e6e5e3 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,20 +1,20 @@
+import argparse
+import datetime
 import io
-import sys
 import json
-import datetime
-import argparse
+import sys
 import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
 
-import trafilatura
 import pydantic
+import trafilatura
 from selectolax.parser import HTMLParser
 
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
+                            cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
 
 TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..b617178 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,7 +1,7 @@
+import hashlib
 import io
 import os
-import hashlib
 
 import minio
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 37a2a82..cf8c4bd 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -1,15 +1,15 @@
-import os
 import base64
-import magic
-import hashlib
 import datetime
+import hashlib
+import os
 from typing import Optional
 
+import magic
 import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
 import urlcanon
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry  # pylint: disable=import-error
 
 
 def clean_url(s: str) -> str:
diff --git a/python/sandcrawler/pdfextract.py b/python/sandcrawler/pdfextract.py
index 9b4e834..2fb34b8 100644
--- a/python/sandcrawler/pdfextract.py
+++ b/python/sandcrawler/pdfextract.py
@@ -1,17 +1,16 @@
-import sys
-import json
 import datetime
-from io import BytesIO
+import json
+import sys
 from dataclasses import dataclass
-from typing import Optional, Dict, Any
+from io import BytesIO
+from typing import Any, Dict, Optional
 
 import poppler
 from PIL import Image
 
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
 from .misc import gen_file_metadata
-
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 # This is a hack to work around timeouts when processing certain PDFs with
 # poppler. For some reason, the usual Kafka timeout catcher isn't working on
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 161dc9c..7d03357 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,9 +1,10 @@
 import time
+
 import requests
 
-from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
 from .misc import gen_file_metadata, requests_retry_session
+from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 
 
 class PdfTrioClient(object):
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index af702ca..66a36bc 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,15 +20,15 @@ grobid
 """
 
 import os
-from typing import Optional, AnyStr
 import xml.etree.ElementTree
+from typing import AnyStr, Optional
 
-from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgresClient
-from sandcrawler.minio import SandcrawlerMinioClient
 from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import PdfExtractResult
 from sandcrawler.ingest_html import HtmlMetaRow
+from sandcrawler.minio import SandcrawlerMinioClient
+from sandcrawler.pdfextract import PdfExtractResult
+from sandcrawler.workers import SandcrawlerWorker
 
 
 class PersistCdxWorker(SandcrawlerWorker):
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index 37e3d7a..d8a4016 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -1,16 +1,17 @@
-import sys
 import json
-import time
+import multiprocessing.pool
 import signal
+import sys
+import time
 import zipfile
-import requests
-import multiprocessing.pool
 from collections import Counter
 
-from confluent_kafka import Consumer, Producer, KafkaException
+import requests
+from confluent_kafka import Consumer, KafkaException, Producer
+
+from .ia import PetaboxError, SandcrawlerBackoffError, WaybackContentError, WaybackError
 from .misc import parse_cdx_line
-from .ia import SandcrawlerBackoffError, WaybackError, WaybackContentError, PetaboxError
 
 
 class SandcrawlerWorker(object):
-- 
cgit v1.2.3
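
A note on reproducing this formatting: the patch itself does not include the isort configuration that produced it. Judging from the output (imports grouped into stdlib / third-party / first-party sections separated by blank lines, names sorted alphabetically within each import, and long "from ... import (...)" lines wrapped at roughly 110-115 columns with continuation lines aligned under the opening parenthesis), a config along the following lines would give similar results. The values below are inferred assumptions, not the project's actual settings:

    # .isort.cfg -- hypothetical settings, inferred from the diff output above
    [settings]
    # wrap width; the wrapped import lines in this patch suggest something near 114
    line_length = 114
    # 0 = GRID mode: parenthesized wrapping, continuations aligned under the open paren
    multi_line_output = 0

With such a config in place, re-running the formatter (e.g. `isort python/sandcrawler/*.py`) should be a no-op against the tree as of this commit, which is a useful check that the inferred settings match.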