summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@robocracy.org> 2021-10-13 15:53:01 -0700
committer: Bryan Newbold <bnewbold@robocracy.org> 2021-10-13 16:21:31 -0700
commit: a2799486ed1c43b95bef036375023d225c482bab (patch)
tree: ebf920be2ad045e9789eb3cbabef13b0bfd0510a
parent: e3f892877222309db1c98009d766c658bcb913bb (diff)
download: fatcat-a2799486ed1c43b95bef036375023d225c482bab.tar.gz
download: fatcat-a2799486ed1c43b95bef036375023d225c482bab.zip
python: normalization/validation support for handle identifiers (hdl)
-rw-r--r-- python/fatcat_tools/normal.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
index 342edeef..e37cace8 100644
--- a/python/fatcat_tools/normal.py
+++ b/python/fatcat_tools/normal.py
@@ -276,6 +276,39 @@ def test_clean_orcid():
assert clean_orcid("0x23-4567-3456-6780") == None
# A bare handle: a numeric, dot-separated prefix, a slash, then a non-empty
# suffix that contains no whitespace.
HDL_REGEX = re.compile(r"^\d+(\.\d+)*/\S+$")


def clean_hdl(raw: Optional[str]) -> Optional[str]:
    """
    Normalize and validate a handle (hdl) identifier.

    The input is stripped of surrounding whitespace and lower-cased, then any
    of the following leading wrappers are removed, in this order: "hdl:",
    "http://", "https://", "hdl.handle.net/". Whitespace directly following a
    removed wrapper (as in "hdl: 20.500.123/abc") is dropped as well.

    Returns the bare, lower-cased handle (eg, "20.500.23456/abc/dummy"), or
    None if the input is empty, does not match HDL_REGEX, or starts with "10."
    (such identifiers look like DOIs and are not accepted as handles here).
    """
    if not raw:
        return None
    raw = raw.strip().lower()
    # Order matters: a "hdl:" scheme may wrap a full proxy URL, and the proxy
    # hostname can only be recognized once the URL scheme has been removed.
    for prefix in ("hdl:", "http://", "https://", "hdl.handle.net/"):
        if raw.startswith(prefix):
            # lstrip() so that whitespace after the wrapper does not cause an
            # otherwise valid handle to fail the regex below
            raw = raw[len(prefix):].lstrip()
    if not HDL_REGEX.fullmatch(raw):
        return None
    # "10." prefixes are rejected; these are expected to be DOIs
    if raw.startswith("10."):
        return None
    return raw
+
def test_clean_hdl():
    """
    Check clean_hdl() against bare handles, "hdl:" and hdl.handle.net URL
    forms, and inputs that must be rejected.
    """
    # accepted forms all normalize to the bare, lower-cased handle
    assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh"
    assert clean_hdl("2381/12775") == "2381/12775"
    # rejected: "10." prefix, missing or empty suffix, malformed prefix
    assert clean_hdl("10.1234/aksjdfh") is None
    assert clean_hdl("20.1234") is None
    assert clean_hdl("20.1234/") is None
    assert clean_hdl("20./asdf") is None
+
+
def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
"""
This function is appropriate to be called on any random, non-markup string,