| author    | Bryan Newbold <bnewbold@archive.org>             | 2020-09-13 23:30:12 -0700 |
|-----------|--------------------------------------------------|---------------------------|
| committer | Bryan Newbold <bnewbold@archive.org>             | 2020-09-13 23:30:12 -0700 |
| commit    | 7a8518adae2997a507e21eae6d6a99b25b03c52d (patch)  |                           |
| tree      | 7d0ff2966a2cb87603210fee57dd8ae100cc6003          |                           |
| parent    | 54a641129317372a2fcdb3c4c44d319ecb0bc0fc (diff)   |                           |
URL cleanup helper
-rw-r--r-- | fatcat_scholar/schema.py | 28 |
1 file changed, 28 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index f171716..d75dae8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -248,6 +248,34 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
     return None
 
 
+def clean_url_conservative(url: Optional[str]) -> Optional[str]:
+    """
+    Takes a string which is expected to be a URL, and does some light cleanups.
+    If the string looks messy, passes it through anyways for downstream
+    processing.
+
+    TODO: attempt URL decoding
+    """
+    if not url:
+        return None
+    if url.startswith("<"):
+        url = url[1:]
+    if ">" in url:
+        url = url.split(">")[0]
+    return url
+
+
+def test_clean_url_conservative() -> None:
+    assert clean_url_conservative("") == None
+    assert clean_url_conservative(None) == None
+    assert clean_url_conservative("<http://en.wikipedia.org/wiki/Rumpelstiltskin>") == \
+        "http://en.wikipedia.org/wiki/Rumpelstiltskin"
+    assert clean_url_conservative("<http://en.wikipedia.org/wiki/Baiji>.Acessoem") == \
+        "http://en.wikipedia.org/wiki/Baiji"
+    assert clean_url_conservative("Available:en.m.wikipedia.org/wiki/Jigawa_State") == \
+        "Available:en.m.wikipedia.org/wiki/Jigawa_State"
+
+
 UNWANTED_ABSTRACT_PREFIXES = [
     # roughly sort this long to short
     "Abstract No Abstract ",
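For reference, a minimal sketch of how the new helper behaves on the kinds of strings exercised by the test. The import path matches the file touched by this commit, but the snippet itself (sample inputs, loop, printing) is illustrative only and not part of the change; it assumes the fatcat-scholar package is importable.

```python
# Illustrative usage of the helper added in this commit; not part of the diff.
from fatcat_scholar.schema import clean_url_conservative

raw_urls = [
    "<http://en.wikipedia.org/wiki/Rumpelstiltskin>",   # angle-bracket wrapped -> brackets stripped
    "<http://en.wikipedia.org/wiki/Baiji>.Acessoem",    # trailing junk after '>' -> truncated at '>'
    "Available:en.m.wikipedia.org/wiki/Jigawa_State",   # messy but no brackets -> passed through as-is
    "",                                                 # empty string -> None
]

for raw in raw_urls:
    print(repr(raw), "->", repr(clean_url_conservative(raw)))
```

As the docstring notes, the cleanup is deliberately conservative: only angle-bracket wrapping and text after a closing ">" are removed, and anything else is passed through untouched for downstream processing.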