aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:26:13 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:26:13 -0700
commit: 839c4e7f3187b4eed4b5adbb9212a9a9456bf16f (patch)
tree: 0dd014ce688fd70a36be602582d4ac54e2b8b3af /fuzzycat/utils.py
parent: b625155d565367141f7fbe0d5e507b9dc98ce4df (diff)
download: fuzzycat-839c4e7f3187b4eed4b5adbb9212a9a9456bf16f.tar.gz
download: fuzzycat-839c4e7f3187b4eed4b5adbb9212a9a9456bf16f.zip
DOI clean/normalize helper; and use in verification etc
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r--fuzzycat/utils.py14
1 files changed, 14 insertions, 0 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index bdca7b6..a1c5124 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -6,6 +6,7 @@ import re
import string
import subprocess
import tempfile
+from typing import Optional
import requests
from glom import PathAccessError, glom
@@ -80,6 +81,19 @@ def dict_key_exists(doc, path):
else:
return True
def clean_doi(raw: Optional[str]) -> Optional[str]:
    """
    Normalize a raw DOI string into a bare, lowercase DOI.

    Steps, in order:

    * surrounding whitespace is stripped and the value is lowercased
    * a leading "doi:" scheme prefix is dropped
    * everything before the first occurrence of "10." is discarded (this
      removes resolver URL prefixes and any whitespace left after "doi:")
    * a doubled slash sitting directly after a seven-character prefix
      (e.g. "10.1234//abc") is collapsed to a single slash

    Returns None if the input is None or empty, or if it does not contain
    the substring "10." at all.

    >>> clean_doi("doi:10.1234/ABC ")
    '10.1234/abc'
    >>> clean_doi("https://doi.org/10.1234//abc")
    '10.1234/abc'
    >>> clean_doi("not a doi") is None
    True
    """
    if not raw:
        return None
    raw = raw.strip().lower()
    if raw.startswith("doi:"):
        raw = raw[4:]
    # A single scan both detects a missing "10." marker and tells us how much
    # leading junk to cut off; start == 0 means the value is already bare.
    start = raw.find("10.")
    if start == -1:
        return None
    raw = raw[start:]
    # NOTE: the fixed offsets only match a doubled slash after a prefix of
    # exactly seven characters ("10." plus four more); longer prefixes such
    # as "10.12345//x" are returned unchanged.
    if raw[7:9] == "//":
        raw = raw[:8] + raw[9:]
    return raw
def doi_prefix(v):
"""