about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-01-05 19:56:29 -0800
committer Bryan Newbold <bnewbold@archive.org> 2021-01-05 19:56:29 -0800
commit 819a68b72c5aa330a3f7a6b91e1581163a62d9f3 (patch)
tree 8234da4e8dfcff4406dc5d7b3e63be765c752cce
parent 4961aef2832bc2dd3f33980ab768f7ca11e7d9a4 (diff)
download sandcrawler-819a68b72c5aa330a3f7a6b91e1581163a62d9f3.tar.gz
sandcrawler-819a68b72c5aa330a3f7a6b91e1581163a62d9f3.zip
doaj ingest request updates (from prod)
-rwxr-xr-x python/scripts/doaj2ingestrequest.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index b981ab6..a7214d0 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -83,6 +83,7 @@ def transform(obj: dict) -> List[dict]:
ingest_types = CONTENT_TYPE_MAP.get((link.get('content_type') or '').lower())
if not ingest_types:
continue
+
skip = False
for domain in DOMAIN_BLOCKLIST:
if domain in link['url'].lower():
@@ -90,10 +91,13 @@ def transform(obj: dict) -> List[dict]:
if skip:
continue
try:
- base_url = canon(link['url'])
+ base_url = canon(link['url'].strip())
except UnicodeEncodeError:
continue
+ if not base_url or len(base_url) > 1000:
+ continue
+
for ingest_type in ingest_types:
request = {
'base_url': base_url,