author     Bryan Newbold <bnewbold@archive.org>  2020-03-10 22:40:00 -0700
committer  Bryan Newbold <bnewbold@archive.org>  2020-03-10 23:01:20 -0700
commit     8837977d2892beac6cf412f58dafcdbf06f323ac (patch)
tree       40aef4358308348b4ef17d6913946711828b0eec /python/sandcrawler/ingest.py
parent     e7ba648fce4b8359358c6661b6ecb34576efc70d (diff)
url cleaning (canonicalization) for ingest base_url
As mentioned in the comment, this first version does not re-write the URL in the `base_url` field. If we did so, then ingest_request rows would not SQL JOIN to ingest_file_result rows, which we wouldn't want.

In the future, the behaviour should perhaps be to refuse to process URLs that aren't clean (eg, if base_url != clean_url(base_url)) and return a 'bad-url' status or something. Then we would only accept clean URLs in both tables, and could clear out all old/bad URLs with a cleanup script.
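For illustration, a minimal sketch of the rejection behaviour proposed above. The `check_base_url` helper and the shape of the returned result dict are assumptions for this sketch, not the actual sandcrawler response schema; only the 'bad-url' status name comes from the commit message:

```python
from typing import Optional

from sandcrawler.misc import clean_url

def check_base_url(request: dict) -> Optional[dict]:
    """Hypothetical guard: refuse requests whose base_url is not already
    in canonical form, instead of silently rewriting it.

    Returns an error result dict to short-circuit processing, or None if
    the request may proceed.
    """
    base_url = request['base_url']
    if base_url != clean_url(base_url):
        # 'bad-url' status per the commit message; the dict shape here is
        # a simplified assumption.
        return {
            'hit': False,
            'status': 'bad-url',
            'request': request,
        }
    return None
```

With a guard like this in both the request and result paths, only clean URLs would ever land in either table, and the SQL JOIN between ingest_request and ingest_file_result would stay intact.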
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r--  python/sandcrawler/ingest.py  8
1 file changed, 6 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..5dc5b55 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
from sandcrawler.html import extract_fulltext_url
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
@@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker):
            request['ingest_type'] = "pdf"

        assert request.get('ingest_type') == "pdf"
        ingest_type = request.get('ingest_type')
-        base_url = request['base_url']
+
+        # parse/clean URL
+        # note that we pass through the original/raw URL, and that is what gets
+        # persisted in database table
+        base_url = clean_url(request['base_url'])

        force_recrawl = bool(request.get('force_recrawl', False))
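The `clean_url` helper itself is not shown in this diff. As a rough sketch, semantic URL canonicalization can be done with the urlcanon library; whether sandcrawler.misc implements it exactly this way is an assumption:

```python
import urlcanon

def clean_url(s: str) -> str:
    # "Semantic" canonicalization: lowercase the scheme and host, drop
    # default ports, resolve path dot-segments, normalize encoding, etc.
    # NOTE: sketch only; sandcrawler.misc.clean_url may differ in details.
    parsed = urlcanon.parse_url(s.strip())
    return str(urlcanon.semantic(parsed))

# e.g. clean_url("HTTP://Example.COM:80/a/../b") -> "http://example.com/b"
```

Canonicalizing at ingest time like this makes the `base_url != clean_url(base_url)` check from the commit message cheap to apply, since both sides use the same normalization rules.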