diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-05 19:56:29 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-05 19:56:29 -0800 |
commit | 819a68b72c5aa330a3f7a6b91e1581163a62d9f3 (patch) | |
tree | 8234da4e8dfcff4406dc5d7b3e63be765c752cce | |
parent | 4961aef2832bc2dd3f33980ab768f7ca11e7d9a4 (diff) | |
download | sandcrawler-819a68b72c5aa330a3f7a6b91e1581163a62d9f3.tar.gz sandcrawler-819a68b72c5aa330a3f7a6b91e1581163a62d9f3.zip |
doaj ingest request updates (from prod)
-rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index b981ab6..a7214d0 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -83,6 +83,7 @@ def transform(obj: dict) -> List[dict]:
         ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
         if not ingest_types:
             continue
+
         skip = False
         for domain in DOMAIN_BLOCKLIST:
             if domain in link['url'].lower():
@@ -90,10 +91,13 @@ def transform(obj: dict) -> List[dict]:
         if skip:
             continue
         try:
-            base_url = canon(link['url'])
+            base_url = canon(link['url'].strip())
         except UnicodeEncodeError:
             continue
 
+        if not base_url or len(base_url) > 1000:
+            continue
+
         for ingest_type in ingest_types:
             request = {
                 'base_url': base_url,
```