aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-09-13 23:30:12 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-13 23:30:12 -0700
commit: 7a8518adae2997a507e21eae6d6a99b25b03c52d (patch)
tree: 7d0ff2966a2cb87603210fee57dd8ae100cc6003
parent: 54a641129317372a2fcdb3c4c44d319ecb0bc0fc (diff)
download: fatcat-scholar-7a8518adae2997a507e21eae6d6a99b25b03c52d.tar.gz
fatcat-scholar-7a8518adae2997a507e21eae6d6a99b25b03c52d.zip
URL cleanup helper
-rw-r--r-- fatcat_scholar/schema.py 28
1 files changed, 28 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index f171716..d75dae8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -248,6 +248,34 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
return None
def clean_url_conservative(url: Optional[str]) -> Optional[str]:
    """
    Takes a string which is expected to be a URL, and does some light cleanups.
    If the string looks messy, passes it through anyways for downstream
    processing.

    Cleanups performed:

    - a leading "<" is dropped and, when one was present, everything from the
      first ">" onwards is dropped as well (eg, "<http://example.com>.junk"
      becomes "http://example.com")
    - empty values, including values which end up empty after the cleanup
      above (eg, "<>"), are returned as None

    Strings which do not start with "<" are returned unchanged, even if they
    contain a ">" somewhere.

    TODO: attempt URL decoding
    """
    if not url:
        return None
    if url.startswith("<"):
        # angle-bracket wrapped URL: keep only what sits between the brackets.
        # partition() returns the whole remainder when there is no ">".
        url = url[1:].partition(">")[0]
    # the cleanup may have left nothing behind; treat that the same way as
    # empty input instead of handing an empty string downstream
    return url or None
+
+
def test_clean_url_conservative() -> None:
    """
    Checks clean_url_conservative() against empty values, angle-bracket
    wrapped URLs, and a messy string which is expected to pass through
    unchanged.
    """
    # empty values come back as None (identity check, not equality)
    assert clean_url_conservative("") is None
    assert clean_url_conservative(None) is None

    # surrounding angle brackets are removed
    assert (
        clean_url_conservative("<http://en.wikipedia.org/wiki/Rumpelstiltskin>")
        == "http://en.wikipedia.org/wiki/Rumpelstiltskin"
    )
    # anything trailing the closing bracket is dropped along with it
    assert (
        clean_url_conservative("<http://en.wikipedia.org/wiki/Baiji>.Acessoem")
        == "http://en.wikipedia.org/wiki/Baiji"
    )

    # messy, un-bracketed strings are passed through as-is
    assert (
        clean_url_conservative("Available:en.m.wikipedia.org/wiki/Jigawa_State")
        == "Available:en.m.wikipedia.org/wiki/Jigawa_State"
    )
+
+
UNWANTED_ABSTRACT_PREFIXES = [
# roughly sort this long to short
"Abstract No Abstract ",