diff options
Diffstat (limited to 'skate/unstructured.go')
-rw-r--r-- | skate/unstructured.go | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/skate/unstructured.go b/skate/unstructured.go new file mode 100644 index 0000000..082c685 --- /dev/null +++ b/skate/unstructured.go @@ -0,0 +1,66 @@ +package skate + +import ( + "regexp" + "strings" +) + +var ( + PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) + PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) + PatArxivPDF = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivAbs = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + + urlPrefixes = []string{ + "http://doi.org/", + "https://doi.org/", + "http://dx.doi.org/", + "https://dx.doi.org/", + } +) + +// ParseUnstructured will in-place augment missing DOI, arxiv id and so on. +func ParseUnstructured(ref *Ref) error { + var ( + uns = ref.Biblio.Unstructured + v string + vs []string + ) + // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5, + // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ... + if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" { + parts := strings.Split(strings.ToLower(ref.Key), "-bib") + ref.Biblio.DOI = parts[0] + } + // DOI + v = PatDOI.FindString(uns) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // DOI in Key + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // DOI in URL + for _, prefix := range urlPrefixes { + if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { + ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) + } + } + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // Arxiv + vs = PatArxivPDF.FindStringSubmatch(uns) + if len(vs) != 0 && ref.Biblio.ArxivId == "" { + ref.Biblio.ArxivId = vs[1] + } else { + vs = PatArxivAbs.FindStringSubmatch(uns) + if len(vs) != 0 && ref.Biblio.ArxivId == "" { + ref.Biblio.ArxivId = vs[1] + } + } + return nil +} |