author      Bryan Newbold <bnewbold@archive.org>    2020-03-10 22:40:00 -0700
committer   Bryan Newbold <bnewbold@archive.org>    2020-03-10 23:01:20 -0700
commit      8837977d2892beac6cf412f58dafcdbf06f323ac (patch)
tree        40aef4358308348b4ef17d6913946711828b0eec /python
parent      e7ba648fce4b8359358c6661b6ecb34576efc70d (diff)
download    sandcrawler-8837977d2892beac6cf412f58dafcdbf06f323ac.tar.gz
            sandcrawler-8837977d2892beac6cf412f58dafcdbf06f323ac.zip
url cleaning (canonicalization) for ingest base_url
As mentioned in a code comment, this first version does not re-write the URL
in the `base_url` field. If we did so, existing ingest_request rows would no
longer SQL JOIN to ingest_file_result rows, which we wouldn't want.

In the future, the behaviour should perhaps be to refuse to process URLs that
aren't already clean (e.g., if base_url != clean_url(base_url)) and return a
'bad-url' status or similar. Then we would only accept clean URLs into both
tables, and could clear out all old/bad URLs with a cleanup script.
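A minimal sketch of what that future guard might look like. This is hypothetical: the `check_base_url` helper name and the exact result-dict shape are illustrative, not part of this commit.

```python
from sandcrawler.misc import clean_url

def check_base_url(request):
    # hypothetical guard: refuse requests whose base_url is not already
    # clean, instead of silently canonicalizing during processing
    base_url = request['base_url']
    if base_url != clean_url(base_url):
        # result-dict shape assumed here for illustration
        return dict(hit=False, status="bad-url", request=request)
    return None  # URL is clean; proceed with normal ingest
```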
Diffstat (limited to 'python')
-rw-r--r--   python/sandcrawler/__init__.py   2
-rw-r--r--   python/sandcrawler/ingest.py     8
-rw-r--r--   python/sandcrawler/misc.py       7
-rw-r--r--   python/tests/test_misc.py        8
4 files changed, 21 insertions, 4 deletions
diff --git a/python/sandcrawler/__init__.py b/python/sandcrawler/__init__.py
index 3d49096..492b558 100644
--- a/python/sandcrawler/__init__.py
+++ b/python/sandcrawler/__init__.py
@@ -1,7 +1,7 @@
 from .grobid import GrobidClient, GrobidWorker, GrobidBlobWorker
 from .pdftrio import PdfTrioClient, PdfTrioWorker, PdfTrioBlobWorker
-from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime
+from .misc import gen_file_metadata, b32_hex, parse_cdx_line, parse_cdx_datetime, clean_url
 from .workers import KafkaSink, KafkaGrobidSink, JsonLinePusher, CdxLinePusher, CdxLinePusher, KafkaJsonPusher, BlackholeSink, ZipfilePusher, MultiprocessWrapper
 from .ia import WaybackClient, WaybackError, CdxApiClient, CdxApiError, SavePageNowClient, SavePageNowError, PetaboxError, ResourceResult, WarcResource, CdxPartial, CdxRow
 from .ingest import IngestFileWorker
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 7211ee0..5dc5b55 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -9,7 +9,7 @@ from collections import namedtuple
 from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult
 from sandcrawler.grobid import GrobidClient
-from sandcrawler.misc import gen_file_metadata
+from sandcrawler.misc import gen_file_metadata, clean_url
 from sandcrawler.html import extract_fulltext_url
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
@@ -224,7 +224,11 @@ class IngestFileWorker(SandcrawlerWorker):
         request['ingest_type'] = "pdf"
         assert request.get('ingest_type') == "pdf"
         ingest_type = request.get('ingest_type')
-        base_url = request['base_url']
+
+        # parse/clean URL
+        # note that we pass through the original/raw URL, and that is what gets
+        # persisted in database table
+        base_url = clean_url(request['base_url'])
 
         force_recrawl = bool(request.get('force_recrawl', False))
diff --git a/python/sandcrawler/misc.py b/python/sandcrawler/misc.py
index 88669e6..d9c9d55 100644
--- a/python/sandcrawler/misc.py
+++ b/python/sandcrawler/misc.py
@@ -6,8 +6,15 @@ import datetime
 import requests
 from requests.adapters import HTTPAdapter
 from requests.packages.urllib3.util.retry import Retry # pylint: disable=import-error
+import urlcanon
+
+def clean_url(s):
+    parsed = urlcanon.parse_url(s)
+    if not parsed.port and parsed.colon_before_port:
+        parsed.colon_before_port = b''
+    return str(urlcanon.whatwg(parsed))
 
 def gen_file_metadata(blob):
     """
     Takes a file blob (bytestream) and returns hashes and other metadata.
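For reference, this is roughly how the new `clean_url` helper behaves in practice; the expected outputs below are taken directly from the test cases added in this commit:

```python
from sandcrawler.misc import clean_url

# WHATWG canonicalization lowercases the hostname
assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"

# the colon_before_port fix-up drops a stray ':' with no port number after it
assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
    "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
```

The explicit `colon_before_port` handling in `clean_url` suggests that the WHATWG rules alone do not drop a bare trailing colon on the host, so the helper clears that field before canonicalizing.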
diff --git a/python/tests/test_misc.py b/python/tests/test_misc.py
index 420bc07..29f9e9f 100644
--- a/python/tests/test_misc.py
+++ b/python/tests/test_misc.py
@@ -1,7 +1,7 @@
 import pytest
 
-from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line
+from sandcrawler import gen_file_metadata, b32_hex, parse_cdx_line, clean_url
 
 
 def test_gen_file_metadata():
@@ -69,3 +69,9 @@ def test_invalid_cdx():
     print("bad datetime")
     raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
     assert parse_cdx_line(raw) == None
+
+def test_clean_url():
+    assert clean_url("http://BLAH.COM/file.pdf") == "http://blah.com/file.pdf"
+    assert clean_url("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view") == \
+        "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"
+
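If more canonicalization cases accumulate, the new test could be made table-driven. A possible refactor sketch (not part of this commit) using pytest's parametrize, with the same two cases:

```python
import pytest
from sandcrawler import clean_url

@pytest.mark.parametrize("raw,expected", [
    ("http://BLAH.COM/file.pdf",
     "http://blah.com/file.pdf"),
    ("https://opensky.ucar.edu:/islandora/object/articles%3A10809/datastream/PDF/view",
     "https://opensky.ucar.edu/islandora/object/articles%3A10809/datastream/PDF/view"),
])
def test_clean_url(raw, expected):
    # each (raw, expected) pair exercises one canonicalization rule
    assert clean_url(raw) == expected
```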