diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:22:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:22:38 -0700 |
commit | 3cdf4af9be4c762ff2ed79a57b5ad30637909f1e (patch) | |
tree | b7e7e27ff2032c99fd782b3ea40daf1d12f9164e /python/sandcrawler/ingest_html.py | |
parent | f67d870ba4ca9cecd0b75f106335997c813e9df4 (diff) | |
download | sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.tar.gz sandcrawler-3cdf4af9be4c762ff2ed79a57b5ad30637909f1e.zip |
python: isort all imports
Diffstat (limited to 'python/sandcrawler/ingest_html.py')
-rw-r--r-- | python/sandcrawler/ingest_html.py | 18 |
1 files changed, 9 insertions, 9 deletions
```diff
diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py
index 56a726d..7e6e5e3 100644
--- a/python/sandcrawler/ingest_html.py
+++ b/python/sandcrawler/ingest_html.py
@@ -1,20 +1,20 @@
+import argparse
+import datetime
 import io
-import sys
 import json
-import datetime
-import argparse
+import sys
 import xml.etree.ElementTree as ET
-from typing import List, Optional, Any, Tuple
+from typing import Any, List, Optional, Tuple
 
-import trafilatura
 import pydantic
+import trafilatura
 from selectolax.parser import HTMLParser
 
-from sandcrawler.ia import WaybackClient, CdxApiClient, ResourceResult, cdx_to_dict, fix_transfer_encoding, NoCaptureError, WaybackContentError
-from sandcrawler.misc import gen_file_metadata, parse_cdx_datetime, datetime_to_cdx, clean_url, url_fuzzy_equal
-from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
-
+from sandcrawler.html_metadata import BiblioMetadata, html_extract_biblio, html_extract_resources, load_adblock_rules
+from sandcrawler.ia import (CdxApiClient, NoCaptureError, ResourceResult, WaybackClient, WaybackContentError,
+                            cdx_to_dict, fix_transfer_encoding)
+from sandcrawler.misc import clean_url, datetime_to_cdx, gen_file_metadata, parse_cdx_datetime, url_fuzzy_equal
 
 TRAFILATURA_AGENT = f"trafilatura/{trafilatura.__version__}"
```