author    Bryan Newbold <bnewbold@archive.org>  2020-03-10 22:40:00 -0700
committer Bryan Newbold <bnewbold@archive.org>  2020-03-10 23:01:20 -0700
commit    8837977d2892beac6cf412f58dafcdbf06f323ac (patch)
tree      40aef4358308348b4ef17d6913946711828b0eec /python
parent    e7ba648fce4b8359358c6661b6ecb34576efc70d (diff)
url cleaning (canonicalization) for ingest base_url
As mentioned in the code comment, this first version does not re-write the URL in the `base_url` field. If we did so, then ingest_request rows would not SQL JOIN to ingest_file_result rows, which we wouldn't want.

In the future, the behaviour should maybe be to refuse to process URLs that aren't clean (e.g., if base_url != clean_url(base_url)) and return a 'bad-url' status or something. Then we would only accept clean URLs in both tables, and clear out all old/bad URLs with a cleanup script.
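A minimal sketch of what that possible future check could look like at the top of request processing (clean_url is the new helper added in this commit's diff below; the 'bad-url' status string comes from this message, but the result-dict shape here is an assumption for illustration):

    # hypothetical future check, not part of this commit
    base_url = request['base_url']
    if base_url != clean_url(base_url):
        return dict(
            hit=False,
            status="bad-url",
            request=request,
        )
    # ... otherwise proceed with the normal ingest flow ...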
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/__init__.py  2
-rw-r--r--  python/sandcrawler/ingest.py    8
-rw-r--r--  python/sandcrawler/misc.py      7
-rw-r--r--  python/tests/test_misc.py       8
4 files changed, 21 insertions, 4 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 3d49096..492b558 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..5dc5b55 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
from sandcrawler.html import extract_fulltext_url
from sandcrawler.workers import SandcrawlerWorker
from sandcrawler.db import SandcrawlerPostgrestClient
@@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker):
             request['ingest_type'] = "pdf"
         assert request.get('ingest_type') == "pdf"
         ingest_type = request.get('ingest_type')
-        base_url = request['base_url']
+
+        # parse/clean URL
+        # note that we pass through the original/raw URL, and that is what gets
+        # persisted in database table
+        base_url = clean_url(request['base_url'])
         force_recrawl = bool(request.get('force_recrawl', False))
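To make the pass-through concrete (hypothetical values, mirroring the comment above): the cleaned URL is only bound to the local base_url variable, while the raw URL stays in the request dict, which is what gets persisted, so ingest_request rows still JOIN to ingest_file_result rows:

    request = {'ingest_type': 'pdf', 'base_url': 'http://BLAH.COM/file.pdf'}
    base_url = clean_url(request['base_url'])  # 'http://blah.com/file.pdf'
    request['base_url']                        # still 'http://BLAH.COM/file.pdf'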
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
+def clean_url(s):
+    parsed = urlcanon.parse_url(s)
+    if not parsed.port and parsed.colon_before_port:
+        parsed.colon_before_port = b''
+    return str(urlcanon.whatwg(parsed))
+
def gen_file_metadata(blob):
"""
Takes a file blob (bytestream) and returns hashes and other metadata.
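The colon_before_port handling covers URLs with a stray port colon but no port number (e.g. "https://host:/path"), presumably because urlcanon's whatwg() rules don't strip the empty colon on their own. Expected REPL behaviour, mirroring the new tests below:

    >>> from sandcrawler.misc import clean_url
    >>> clean_url("http://BLAH.COM/file.pdf")
    'http://blah.com/file.pdf'
    >>> clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view")
    'https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view'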
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 420bc07..29f9e9f 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
import pytest
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
def test_gen_file_metadata():
@@ -69,3 +69,9 @@ def test_invalid_cdx():
print("bad datetime")
raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
assert parse_cdx_line(raw) == None
+
+def test_clean_url():
+    assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+    assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
+        "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+
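The new test can be run on its own with standard pytest node-id selection, from the python/ directory:

    pytest tests/test_misc.py::test_clean_url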