From 8837977d2892beac6cf412f58dafcdbf06f323ac Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 10 Mar 2020 22:40:00 -0700
Subject: url cleaning (canonicalization) for ingest base_url

As mentioned in comment, this first version does not re-write the URL in
the `base_url` field. If we did so, then ingest_request rows would not
SQL JOIN to ingest_file_result rows, which we wouldn't want.

In the future, behaviour should maybe be to refuse to process URLs that
aren't clean (eg, if base_url != clean_url(base_url)) and return a
'bad-url' status or something. Then we would only accept clean URLs in
both tables, and clear out all old/bad URLs with a cleanup script.
---
 python/sandcrawler/__init__.py | 2 +-
 python/sandcrawler/ingest.py   | 8 ++++++--
 python/sandcrawler/misc.py     | 7 +++++++
 python/tests/test_misc.py      | 8 +++++++-
 4 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 3d49096..492b558 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
 
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
 from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
 from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..5dc5b55 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
 
 from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
 from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
 from sandcrawler.html import extract_fulltext_url
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
@@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker):
             request['ingest_type'] = "pdf"
         assert request.get('ingest_type') == "pdf"
         ingest_type = request.get('ingest_type')
-        base_url = request['base_url']
+
+        # parse/clean URL
+        # note that we pass through the original/raw URL, and that is what gets
+        # persisted in database table
+        base_url = clean_url(request['base_url'])
 
         force_recrawl = bool(request.get('force_recrawl', False))
 
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
 import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
 
 
+def clean_url(s):
+    parsed = urlcanon.parse_url(s)
+    if not parsed.port and parsed.colon_before_port:
+        parsed.colon_before_port = b''
+    return str(urlcanon.whatwg(parsed))
+
 def gen_file_metadata(blob):
     """
     Takes a file blob (bytestream) and returns hashes and other metadata.
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 420bc07..29f9e9f 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
 
 import pytest
 
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
 
 def test_gen_file_metadata():
     
@@ -69,3 +69,9 @@ def test_invalid_cdx():
     print("bad datetime")
     raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz" 
     assert parse_cdx_line(raw) == None
+
+def test_clean_url():
+    assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+    assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
+        "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+
-- 
cgit v1.2.3