aboutsummaryrefslogtreecommitdiffstats
path: root/skate/unstructured.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/unstructured.go')
-rw-r--r--skate/unstructured.go66
1 files changed, 66 insertions, 0 deletions
diff --git a/skate/unstructured.go b/skate/unstructured.go
new file mode 100644
index 0000000..082c685
--- /dev/null
+++ b/skate/unstructured.go
@@ -0,0 +1,66 @@
+package skate
+
+import (
+ "regexp"
+ "strings"
+)
+
+var (
+	// PatDOI matches a DOI embedded in free text: it runs up to the next space
+	// and always ends on a word character, so trailing punctuation is dropped.
+	PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	// PatDOINoHyphen is like PatDOI, but the match also stops at a hyphen.
+	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
+	// PatArxivPDF matches an arxiv.org PDF link. Group 1 is the arXiv id and
+	// group 2 an optional version suffix such as "v2". Dots are written as
+	// [.] so that they only match a literal dot.
+	PatArxivPDF = regexp.MustCompile(`https?://arxiv[.]org/pdf/([0-9]{4}[.][0-9]{1,8})(v[0-9]{1,2})?([.]pdf)?`)
+	// PatArxivAbs matches an arxiv.org abstract link, with the same groups as
+	// PatArxivPDF.
+	PatArxivAbs = regexp.MustCompile(`https?://arxiv[.]org/abs/([0-9]{4}[.][0-9]{1,8})(v[0-9]{1,2})?([.]pdf)?`)
+
+	// urlPrefixes lists the URL prefixes that may precede a DOI in a link.
+	urlPrefixes = []string{
+		"http://doi.org/",
+		"https://doi.org/",
+		"http://dx.doi.org/",
+		"https://dx.doi.org/",
+	}
+)
+
+// ParseUnstructured augments ref in place with a DOI and an arXiv id when
+// these are missing; fields that are already set are left untouched.
+//
+// DOI candidates are tried in order: the lowercased part of ref.Key before
+// "-bib", a DOI in the unstructured string, a DOI in the key, and the rest of
+// ref.Biblio.Url after a DOI resolver prefix. The arXiv id comes from an
+// arxiv.org PDF or abstract link in the unstructured string. The returned
+// error is always nil.
+func ParseUnstructured(ref *Ref) error {
+	uns := ref.Biblio.Unstructured
+	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
+	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
+	if ref.Biblio.DOI == "" {
+		key := strings.ToLower(ref.Key)
+		if i := strings.Index(key, "-bib"); i >= 0 {
+			ref.Biblio.DOI = key[:i]
+		}
+	}
+	// DOI in the unstructured string; FindString yields "" when nothing matches.
+	if ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = PatDOI.FindString(uns)
+	}
+	// DOI in the key.
+	if ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = PatDOINoHyphen.FindString(ref.Key)
+	}
+	// DOI in the URL: consulted only while the DOI is still missing, so that an
+	// existing value is never overwritten or erased. Only the leading prefix
+	// is stripped.
+	if ref.Biblio.DOI == "" {
+		for _, prefix := range urlPrefixes {
+			if strings.HasPrefix(ref.Biblio.Url, prefix) {
+				ref.Biblio.DOI = strings.TrimPrefix(ref.Biblio.Url, prefix)
+				break
+			}
+		}
+	}
+	// arXiv id: prefer a PDF link and fall back to an abstract link.
+	if ref.Biblio.ArxivId == "" {
+		if vs := PatArxivPDF.FindStringSubmatch(uns); len(vs) != 0 {
+			ref.Biblio.ArxivId = vs[1]
+		} else if vs := PatArxivAbs.FindStringSubmatch(uns); len(vs) != 0 {
+			ref.Biblio.ArxivId = vs[1]
+		}
+	}
+	return nil
+}