aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_file.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:22:38 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-26 12:22:38 -0700
commit 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (patch)
tree b7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python/sandcrawler/ingest_file.py
parent f67d870ba4ca9cecd0b75f106335997c813e9df4 (diff)
download sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.tar.gz
sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.zip
python: isort all imports
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- python/sandcrawler/ingest_file.py 26
1 files changed, 13 insertions, 13 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 72d4e14..137a793 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -1,31 +1,31 @@
-import sys
-import json
+import base64
import gzip
+import json
+import sys
import time
-import base64
import xml.etree.ElementTree
from collections import namedtuple
-from typing import Optional, Tuple, Any, Dict, List
from http.server import BaseHTTPRequestHandler, HTTPServer
+from typing import Any, Dict, List, Optional, Tuple
import requests
from selectolax.parser import HTMLParser
-from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding, NoCaptureError
+from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.grobid import GrobidClient
-from sandcrawler.pdfextract import process_pdf, PdfExtractResult
-from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.ingest_html import fetch_html_resources, \
- quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
- WebResource, html_guess_platform
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, CdxApiError, NoCaptureError, PetaboxError, ResourceResult, SavePageNowClient,
+ SavePageNowError, WaybackClient, WaybackContentError, WaybackError, cdx_to_dict,
+ fix_transfer_encoding)
+from sandcrawler.ingest_html import (WebResource, fetch_html_resources, html_extract_body_teixml, html_guess_platform,
+ html_guess_scope, quick_fetch_html_resources)
+from sandcrawler.misc import clean_url, gen_file_metadata, parse_cdx_datetime
+from sandcrawler.pdfextract import PdfExtractResult, process_pdf
from sandcrawler.workers import SandcrawlerWorker
-from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
-
MAX_BODY_SIZE_BYTES = 128*1024*1024
class IngestFileWorker(SandcrawlerWorker):