# Extracted from a git patch (rendered by cgit v1.2.3):
#   Commit:  7a8518adae2997a507e21eae6d6a99b25b03c52d
#   Author:  Bryan Newbold
#   Date:    Sun, 13 Sep 2020 23:30:12 -0700
#   Subject: URL cleanup helper
#   File:    fatcat_scholar/schema.py (index f171716..d75dae8)
#   Hunk:    @@ -248,6 +248,34 @@ -- inserted after release_doi_registrar()
#            and before the UNWANTED_ABSTRACT_PREFIXES list.


def clean_url_conservative(url: Optional[str]) -> Optional[str]:
    """
    Takes a string which is expected to be a URL, and does some light cleanups.
    If the string looks messy, passes it through anyways for downstream
    processing.

    The only cleanup performed is unwrapping an angle-bracket delimited value:
    a leading "<" is dropped and, when a ">" follows, everything from that ">"
    onwards is discarded. Strings which do not start with "<" are returned
    unchanged.

    Returns None when the input is None or empty, or when nothing is left
    after the cleanup (eg, "<>").

    TODO: attempt URL decoding
    """
    if not url:
        return None
    if url.startswith("<"):
        # partition() yields the whole remainder when no ">" is present, so
        # an unterminated "<http://..." only loses its leading bracket
        url = url[1:].partition(">")[0]
    # normalize an empty result to None, consistent with empty input
    return url or None


def test_clean_url_conservative() -> None:
    assert clean_url_conservative("") is None
    assert clean_url_conservative(None) is None
    # angle-bracket wrapped URL
    assert (
        clean_url_conservative("<http://en.wikipedia.org/wiki/Rumpelstiltskin>")
        == "http://en.wikipedia.org/wiki/Rumpelstiltskin"
    )
    # angle-bracket wrapped URL followed by trailing junk
    assert (
        clean_url_conservative("<http://en.wikipedia.org/wiki/Baiji>.Acessoem")
        == "http://en.wikipedia.org/wiki/Baiji"
    )
    # messy value without a leading bracket is passed through untouched
    assert (
        clean_url_conservative("Available:en.m.wikipedia.org/wiki/Jigawa_State")
        == "Available:en.m.wikipedia.org/wiki/Jigawa_State"
    )
    # nothing left after cleanup
    assert clean_url_conservative("<>") is None