From 338d6a6c4554a9f7afcb2f572943f276f7912995 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 1 Apr 2021 00:57:17 +0200 Subject: find doi in url --- skate/README.md | 3 +++ skate/cmd/skate-from-unstructured/main.go | 19 ++++++++++++------- skate/verify.go | 6 +++--- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'skate') diff --git a/skate/README.md b/skate/README.md index 1962dc6..cc7e238 100644 --- a/skate/README.md +++ b/skate/README.md @@ -56,6 +56,9 @@ After this step: * cluster, e.g. `skate-cluster ...` ### skate-from-unstructured + + + ### skate-ref-to-release ### skate-to-doi ### skate-verify diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index 0208d91..8ebc613 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -33,16 +33,10 @@ func main() { if err := json.Unmarshal(p, &ref); err != nil { return nil, err } - // TODO: ref if err := parseUnstructured(&ref); err != nil { return nil, err } - b, err := json.Marshal(ref) - if err != nil { - return nil, err - } - b = append(b, bytesNewline...) - return b, nil + return skate.JsonMarshalLine(&ref) }) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize @@ -74,6 +68,17 @@ func parseUnstructured(ref *skate.Ref) error { if v != "" && ref.Biblio.DOI == "" { ref.Biblio.DOI = v } + // DOI in URL + prefixes := []string{"http://doi.org/", "https://doi.org/", "http://dx.doi.org/", "https://dx.doi.org/"} + for _, prefix := range prefixes { + if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { + ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) + } + } + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } // Arxiv vs = PatArxivPDF.FindStringSubmatch(uns) if len(vs) != 0 && ref.Biblio.ArxivId == "" { diff --git a/skate/verify.go b/skate/verify.go index cfe31ec..40ee4e2 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -123,8 +123,8 @@ var ( PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) ) -// jsonMarshalLine marshals a value as JSON and adds a newline. -func jsonMarshalLine(v interface{}) ([]byte, error) { +// JsonMarshalLine marshals a value as JSON and adds a newline. +func JsonMarshalLine(v interface{}) ([]byte, error) { b, err := json.Marshal(v) if err != nil { return nil, err @@ -230,7 +230,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { continue // Assume we already have the DOI matches. } br = generateBiblioRef(re, pivot, result, "fuzzy") - return jsonMarshalLine(br) + return JsonMarshalLine(br) default: continue } -- cgit v1.2.3