about | summary | refs | log | tree | commit | diff | stats
path: root/python/scripts
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-04-07 12:38:01 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-04-07 12:38:01 -0700
commit 833487810b2e72ed6e22ce68dd1655bad1e87be0 (patch)
tree fec5c33c0a9cffa96a698b907c7867c6716f84d0 /python/scripts
parent 5dd9e8f6790de403376811a966496b8f612f192e (diff)
download sandcrawler-833487810b2e72ed6e22ce68dd1655bad1e87be0.tar.gz
sandcrawler-833487810b2e72ed6e22ce68dd1655bad1e87be0.zip
unpaywall2ingestrequest: canonicalize URL
Diffstat (limited to 'python/scripts')
-rwxr-xr-x python/scripts/unpaywall2ingestrequest.py 10
1 files changed, 9 insertions, 1 deletions
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 2999574..5536e6c 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -7,6 +7,7 @@ Transform an unpaywall dump (JSON) into ingest requests.
import sys
import json
import argparse
+import urlcanon
DOMAIN_BLOCKLIST = [
# large OA publishers (we get via DOI)
@@ -31,6 +32,9 @@ RELEASE_STAGE_MAP = {
'updatedVersion': 'updated',
}
+def canon(s):
+ parsed = urlcanon.parse_url(s)
+ return str(urlcanon.whatwg(parsed))
def transform(obj):
"""
@@ -53,9 +57,13 @@ def transform(obj):
skip = True
if skip:
continue
+ try:
+ base_url = canon(location['url_for_pdf'])
+ except UnicodeEncodeError:
+ continue
request = {
- 'base_url': location['url_for_pdf'],
+ 'base_url': base_url,
'ingest_type': 'pdf',
'link_source': 'unpaywall',
'link_source_id': obj['doi'].lower(),