diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-07 12:38:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-07 12:38:01 -0700 |
commit | 833487810b2e72ed6e22ce68dd1655bad1e87be0 (patch) | |
tree | fec5c33c0a9cffa96a698b907c7867c6716f84d0 /python/scripts | |
parent | 5dd9e8f6790de403376811a966496b8f612f192e (diff) | |
download | sandcrawler-833487810b2e72ed6e22ce68dd1655bad1e87be0.tar.gz sandcrawler-833487810b2e72ed6e22ce68dd1655bad1e87be0.zip |
unpaywall2ingestrequest: canonicalize URL
Diffstat (limited to 'python/scripts')
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 10 |
1 files changed, 9 insertions, 1 deletions
```diff
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 2999574..5536e6c 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -7,6 +7,7 @@ Transform an unpaywall dump (JSON) into ingest requests.
 import sys
 import json
 import argparse
+import urlcanon
 
 DOMAIN_BLOCKLIST = [
     # large OA publishers (we get via DOI)
@@ -31,6 +32,9 @@ RELEASE_STAGE_MAP = {
     'updatedVersion': 'updated',
 }
 
+def canon(s):
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
 
 def transform(obj):
     """
@@ -53,9 +57,13 @@ def transform(obj):
                 skip = True
         if skip:
             continue
+        try:
+            base_url = canon(location['url_for_pdf'])
+        except UnicodeEncodeError:
+            continue
 
         request = {
-            'base_url': location['url_for_pdf'],
+            'base_url': base_url,
             'ingest_type': 'pdf',
             'link_source': 'unpaywall',
             'link_source_id': obj['doi'].lower(),
```