aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 8
1 files changed, 6 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..5dc5b55 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
from sandcrawler.html import extract_fulltext_url
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
@@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker):
request['ingest_type'] = "pdf"
assert request.get('ingest_type') == "pdf"
ingest_type = request.get('ingest_type')
- base_url = request['base_url']
+
+ # parse/clean URL
+ # note that we pass through the original/raw URL, and that is what gets
+ # persisted in database table
+ base_url = clean_url(request['base_url'])
force_recrawl = bool(request.get('force_recrawl', False))