summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fatcat_scholar/schema.py 28
1 files changed, 28 insertions, 0 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index f171716..d75dae8 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -248,6 +248,34 @@ def release_doi_registrar(release: ReleaseEntity) -> Optional[str]:
return None
+def clean_url_conservative(url: Optional[str]) -> Optional[str]:
+ """
+ Lightly clean a string that is expected to be a URL: empty or None input
+ returns None, a leading "<" is dropped, and text from the first ">" on is
+ cut. Strings that still look messy are passed through as-is for
+ downstream processing.
+ TODO: attempt URL decoding
+ """
+ if not url:  # covers both None and the empty string
+ return None
+ if url.startswith("<"):  # e.g. "<http://...>" style bracketed URLs
+ url = url[1:]
+ if ">" in url:  # NOTE(review): nesting under the "<" check is not visible in this view; confirm
+ url = url.split(">")[0]
+ return url
+
+
+def test_clean_url_conservative() -> None:  # unit test for clean_url_conservative()
+    assert clean_url_conservative("") is None  # empty string counts as "no URL"
+    assert clean_url_conservative(None) is None  # identity check, per PEP 8 singleton comparison
+    assert clean_url_conservative("<http://en.wikipedia.org/wiki/Rumpelstiltskin>") == \
+        "http://en.wikipedia.org/wiki/Rumpelstiltskin"  # surrounding angle brackets stripped
+    assert clean_url_conservative("<http://en.wikipedia.org/wiki/Baiji>.Acessoem") == \
+        "http://en.wikipedia.org/wiki/Baiji"  # trailing junk after ">" dropped
+    assert clean_url_conservative("Available:en.m.wikipedia.org/wiki/Jigawa_State") == \
+        "Available:en.m.wikipedia.org/wiki/Jigawa_State"  # messy input passes through untouched
+
+
UNWANTED_ABSTRACT_PREFIXES = [
# roughly sort this long to short
"Abstract No Abstract ",