diff options
-rwxr-xr-x | python/scripts/unpaywall2ingestrequest.py | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/python/scripts/unpaywall2ingestrequest.py b/python/scripts/unpaywall2ingestrequest.py
index 2999574..5536e6c 100755
--- a/python/scripts/unpaywall2ingestrequest.py
+++ b/python/scripts/unpaywall2ingestrequest.py
@@ -7,6 +7,7 @@ Transform an unpaywall dump (JSON) into ingest requests.
 import sys
 import json
 import argparse
+import urlcanon
 
 DOMAIN_BLOCKLIST = [
     # large OA publishers (we get via DOI)
@@ -31,6 +32,9 @@ RELEASE_STAGE_MAP = {
     'updatedVersion': 'updated',
 }
 
+def canon(s):
+    parsed = urlcanon.parse_url(s)
+    return str(urlcanon.whatwg(parsed))
 
 def transform(obj):
     """
@@ -53,9 +57,13 @@ def transform(obj):
                 skip = True
         if skip:
             continue
+        try:
+            base_url = canon(location['url_for_pdf'])
+        except UnicodeEncodeError:
+            continue
 
         request = {
-            'base_url': location['url_for_pdf'],
+            'base_url': base_url,
             'ingest_type': 'pdf',
             'link_source': 'unpaywall',
             'link_source_id': obj['doi'].lower(),