From 8837977d2892beac6cf412f58dafcdbf06f323ac Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 10 Mar 2020 22:40:00 -0700 Subject: url cleaning (canonicalization) for ingest base_url As mentioned in comment, this first version does not re-write the URL in the `base_url` field. If we did so, then ingest_request rows would not SQL JOIN to ingest_file_result rows, which we wouldn't want. In the future, behaviour should maybe be to refuse to process URLs that aren't clean (eg, if base_url != clean_url(base_url)) and return a 'bad-url' status or something. Then we would only accept clean URLs in both tables, and clear out all old/bad URLs with a cleanup script. --- python/sandcrawler/ingest.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'python/sandcrawler/ingest.py') diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 7211ee0..5dc5b55 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -9,7 +9,7 @@ from collections import namedtuple from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult from sandcrawler.grobid import GrobidClient -from sandcrawler.misc import gen_file_metadata +from sandcrawler.misc import gen_file_metadata, clean_url from sandcrawler.html import extract_fulltext_url from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient @@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker): request['ingest_type'] = "pdf" assert request.get('ingest_type') == "pdf" ingest_type = request.get('ingest_type') - base_url = request['base_url'] + + # parse/clean URL + # note that we pass through the original/raw URL, and that is what gets + # persisted in database table + base_url = clean_url(request['base_url']) force_recrawl = bool(request.get('force_recrawl', False)) -- cgit v1.2.3