diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-13 15:53:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-10-13 16:21:31 -0700 |
commit | a2799486ed1c43b95bef036375023d225c482bab (patch) | |
tree | ebf920be2ad045e9789eb3cbabef13b0bfd0510a /python/fatcat_tools | |
parent | e3f892877222309db1c98009d766c658bcb913bb (diff) | |
download | fatcat-a2799486ed1c43b95bef036375023d225c482bab.tar.gz fatcat-a2799486ed1c43b95bef036375023d225c482bab.zip |
python: normalization/validation support for handle identifiers (hdl)
Diffstat (limited to 'python/fatcat_tools')
-rw-r--r-- | python/fatcat_tools/normal.py | 33 |
1 files changed, 33 insertions, 0 deletions
# diff --git a/python/fatcat_tools/normal.py b/python/fatcat_tools/normal.py
# index 342edeef..e37cace8 100644
# --- a/python/fatcat_tools/normal.py
# +++ b/python/fatcat_tools/normal.py
# @@ -276,6 +276,39 @@ def test_clean_orcid():
#
# Unchanged leading diff context (tail of test_clean_orcid):
#     assert clean_orcid("0x23-4567-3456-6780") == None

# Handle syntax accepted here: a dotted, all-numeric prefix, a slash, and a
# non-empty suffix without whitespace. ASCII digits are spelled out explicitly
# because `\d` in a str pattern also matches non-ASCII Unicode decimal digits.
HDL_REGEX = re.compile(r"^[0-9]+(\.[0-9]+)*/\S+$")

# Scheme/resolver prefixes removed (in this order, each at most once) before
# validation. Input is lower-cased first, so these must be lower-case.
_HDL_STRIP_PREFIXES = ("hdl:", "http://", "https://", "hdl.handle.net/")


def clean_hdl(raw: Optional[str]) -> Optional[str]:
    """
    Normalize and validate a handle identifier (hdl).

    The value is stripped of surrounding whitespace and lower-cased, then any
    leading "hdl:", "http://", "https://" and "hdl.handle.net/" prefixes are
    removed. What remains must look like "<numeric prefix>/<suffix>".

    Returns the bare, lower-cased handle, or None if the input is empty, does
    not match the handle syntax, or starts with "10." (the DOI prefix;
    presumably such values are meant to be handled as DOIs instead).
    """
    if not raw:
        return None
    raw = raw.strip().lower()
    for prefix in _HDL_STRIP_PREFIXES:
        if raw.startswith(prefix):
            # slice by the prefix's own length rather than a hand-counted offset
            raw = raw[len(prefix):]
    if not HDL_REGEX.fullmatch(raw):
        return None
    if raw.startswith("10."):
        return None
    return raw


def test_clean_hdl():
    assert clean_hdl("20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("hdl:20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("https://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("http://hdl.handle.net/20.500.23456/ABC/DUMMY") == "20.500.23456/abc/dummy"
    assert clean_hdl("  20.500.23456/ABC/DUMMY\n") == "20.500.23456/abc/dummy"
    assert clean_hdl("21.1234/aksjdfh") == "21.1234/aksjdfh"
    assert clean_hdl("2381/12775") == "2381/12775"
    assert clean_hdl(None) is None
    assert clean_hdl("") is None
    assert clean_hdl("10.1234/aksjdfh") is None
    assert clean_hdl("20.1234") is None
    assert clean_hdl("20.1234/") is None
    assert clean_hdl("20./asdf") is None
    assert clean_hdl("20.1234/with space") is None
    # non-ASCII decimal digits are not valid handle prefixes
    assert clean_hdl("\u0662\u0660.1234/abc") is None
    assert clean_hdl("\u0661\u0660.1234/abc") is None


# Unchanged trailing diff context (start of the next definition; the rest of
# its body lies beyond this hunk):
#     def clean_str(thing: Optional[str], force_xml: bool = False) -> Optional[str]:
#         """
#         This function is appropriate to be called on any random, non-markup string,