From 338d6a6c4554a9f7afcb2f572943f276f7912995 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 1 Apr 2021 00:57:17 +0200 Subject: find doi in url --- skate/cmd/skate-from-unstructured/main.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'skate/cmd/skate-from-unstructured') diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index 0208d91..8ebc613 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -33,16 +33,10 @@ func main() { if err := json.Unmarshal(p, &ref); err != nil { return nil, err } - // TODO: ref if err := parseUnstructured(&ref); err != nil { return nil, err } - b, err := json.Marshal(ref) - if err != nil { - return nil, err - } - b = append(b, bytesNewline...) - return b, nil + return skate.JsonMarshalLine(&ref) }) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize @@ -74,6 +68,17 @@ func parseUnstructured(ref *skate.Ref) error { if v != "" && ref.Biblio.DOI == "" { ref.Biblio.DOI = v } + // DOI in URL + prefixes := []string{"http://doi.org/", "https://doi.org/", "http://dx.doi.org/", "https://dx.doi.org/"} + for _, prefix := range prefixes { + if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { + ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) + } + } + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } // Arxiv vs = PatArxivPDF.FindStringSubmatch(uns) if len(vs) != 0 && ref.Biblio.ArxivId == "" { -- cgit v1.2.3