aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest_file.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-04 16:12:19 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-15 18:15:25 -0700
commit 4b3d6cb79a7182be4976aab34db251ecbcbd2665 (patch)
tree cfdd6a5223b38a288af8806e08410365022be8ea /python/sandcrawler/ingest_file.py
parent e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e (diff)
download sandcrawler-4b3d6cb79a7182be4976aab34db251ecbcbd2665.tar.gz
sandcrawler-4b3d6cb79a7182be4976aab34db251ecbcbd2665.zip
wrap up previous renaming work
Diffstat (limited to 'python/sandcrawler/ingest_file.py')
-rw-r--r-- python/sandcrawler/ingest_file.py 4
1 files changed, 1 insertions, 3 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index a02e923..305a5d1 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -17,7 +17,7 @@ from sandcrawler.grobid import GrobidClient
from sandcrawler.pdfextract import process_pdf, PdfExtractResult
from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime
from sandcrawler.html import extract_fulltext_url
-from sandcrawler.html_ingest import fetch_html_resources, \
+from sandcrawler.ingest_html import fetch_html_resources, \
quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \
WebResource, html_guess_platform
from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules
@@ -25,8 +25,6 @@ from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
from sandcrawler.xml import xml_reserialize
-from sandcrawler.platforms.generic import DirectFileHelper
-
MAX_BODY_SIZE_BYTES = 128*1024*1024