diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 15:46:15 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 15:46:15 -0800 |
commit | b3c56dc23a9a4c33ba6a4f381a760f34ad1ac361 (patch) | |
tree | 4c338ca3f5978898b31a940af1907fcd18ac1c1d | |
parent | 6431b2f6b4bd9bd4dea4b373b89eb3f89648cc4c (diff) | |
download | sandcrawler-b3c56dc23a9a4c33ba6a4f381a760f34ad1ac361.tar.gz sandcrawler-b3c56dc23a9a4c33ba6a4f381a760f34ad1ac361.zip |
ingest: better non-full URL fixup
-rw-r--r-- | python/sandcrawler/ia.py | 7 |
1 file changed, 3 insertions, 4 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index e2e97a7..f6eee72 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -10,6 +10,7 @@ import gzip
 import json
 import requests
 import datetime
+import urllib.parse
 from typing import Tuple
 from collections import namedtuple
 
@@ -694,10 +695,8 @@ class WaybackClient:
                     cdx=cdx_row,
                     revisit_cdx=None,
                 )
-            if resource.location.startswith('/'):
-                # redirect location does not include hostname
-                domain_prefix = '/'.join(next_url.split('/')[:3])
-                next_url = domain_prefix + resource.location
+            if not "://" in resource.location:
+                next_url = urllib.parse.urljoin(next_url, resource.location)
             else:
                 next_url = resource.location
             if next_url:
```