diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:22:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:22:38 -0700 |
commit | 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (patch) | |
tree | b7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python/sandcrawler/ingest_file.py | |
parent | f67d870ba4ca9cecd0b75f106335997c813e9df4 (diff) | |
download | sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.tar.gz sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.zip |
python: isort all imports
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 26 |
1 files changed, 13 insertions, 13 deletions
```diff
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
 import gzip
+import json
+import sys
 import time
-import base64
 import xml.etree.ElementTree
 from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
 from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
 
 import requests
 from selectolax.parser import HTMLParser
 
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
 from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
-    quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
-    WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+                            SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+                            fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+                                     html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
 from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
 from sandcrawler.xml import xml_reserialize
 
-
 MAX_BODY_SIZE_BYTES = 128*1024*1024
 
 
 class IngestFileWorker(SandcrawlerWorker):
```