From 833487810b2e72ed6e22ce68dd1655bad1e87be0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 7 Apr 2020 12:38:01 -0700 Subject: unpaywall2ingestrequest: canonicalize URL --- python/scripts/unpaywall2ingestrequest.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'python/scripts') diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py index 2999574..5536e6c 100755 --- a/python/scripts/unpaywall2ingestrequest.py +++ b/python/scripts/unpaywall2ingestrequest.py @@ -7,6 +7,7 @@ Transform an unpaywall dump (JSON) into ingest requests. import sys import json import argparse +import urlcanon DOMAIN_BLOCKLIST = [ # large OA publishers (we get via DOI) @@ -31,6 +32,9 @@ RELEASE_STAGE_MAP = { 'updatedVersion': 'updated', } +def canon(s): + parsed = urlcanon.parse_url(s) + return str(urlcanon.whatwg(parsed)) def transform(obj): """ @@ -53,9 +57,13 @@ def transform(obj): skip = True if skip: continue + try: + base_url = canon(location['url_for_pdf']) + except UnicodeEncodeError: + continue request = { - 'base_url': location['url_for_pdf'], + 'base_url': base_url, 'ingest_type': 'pdf', 'link_source': 'unpaywall', 'link_source_id': obj['doi'].lower(), -- cgit v1.2.3