diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-04 16:12:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-15 18:15:25 -0700 |
commit | 4b3d6cb79a7182be4976aab34db251ecbcbd2665 (patch) | |
tree | cfdd6a5223b38a288af8806e08410365022be8ea /python | |
parent | e2e0602114ccdf142b3ef0f30c67d2cb7a58ef7e (diff) | |
download | sandcrawler-4b3d6cb79a7182be4976aab34db251ecbcbd2665.tar.gz sandcrawler-4b3d6cb79a7182be4976aab34db251ecbcbd2665.zip |
wrap up previous renaming work
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest_file.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/ingest_html.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 2 | ||||
-rw-r--r-- | python/tests/test_html_ingest.py | 2 |
4 files changed, 4 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py index a02e923..305a5d1 100644 --- a/python/sandcrawler/ingest_file.py +++ b/python/sandcrawler/ingest_file.py @@ -17,7 +17,7 @@ from sandcrawler.grobid import GrobidClient from sandcrawler.pdfextract import process_pdf, PdfExtractResult from sandcrawler.misc import gen_file_metadata, clean_url, parse_cdx_datetime from sandcrawler.html import extract_fulltext_url -from sandcrawler.html_ingest import fetch_html_resources, \ +from sandcrawler.ingest_html import fetch_html_resources, \ quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \ WebResource, html_guess_platform from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules @@ -25,8 +25,6 @@ from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.xml import xml_reserialize -from sandcrawler.platforms.generic import DirectFileHelper - MAX_BODY_SIZE_BYTES = 128*1024*1024 diff --git a/python/sandcrawler/ingest_html.py b/python/sandcrawler/ingest_html.py index f11cac4..56a726d 100644 --- a/python/sandcrawler/ingest_html.py +++ b/python/sandcrawler/ingest_html.py @@ -396,7 +396,7 @@ def main() -> None: """ Run this command like: - python -m sandcrawler.html_ingest + python -m sandcrawler.ingest_html """ parser = argparse.ArgumentParser( diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index a388b90..ee153ab 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -28,7 +28,7 @@ from sandcrawler.db import SandcrawlerPostgresClient from sandcrawler.minio import SandcrawlerMinioClient from sandcrawler.grobid import GrobidClient from sandcrawler.pdfextract import PdfExtractResult -from sandcrawler.html_ingest import HtmlMetaRow +from sandcrawler.ingest_html import HtmlMetaRow class PersistCdxWorker(SandcrawlerWorker): diff --git a/python/tests/test_html_ingest.py b/python/tests/test_html_ingest.py index e6e48ac..efd1ddf 100644 --- a/python/tests/test_html_ingest.py +++ b/python/tests/test_html_ingest.py @@ -2,7 +2,7 @@ import datetime import pytest -from sandcrawler.html_ingest import * +from sandcrawler.ingest_html import * def test_html_extract_ojs3() -> None: |