diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-04-01 00:57:17 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-04-01 00:57:17 +0200 |
commit | 338d6a6c4554a9f7afcb2f572943f276f7912995 (patch) | |
tree | 56e526cfc7a0892375fea9eeff54ac8e9922e0b5 /skate | |
parent | 304a994951daf3930d0951b80c7ba22103f3a7f0 (diff) | |
download | refcat-338d6a6c4554a9f7afcb2f572943f276f7912995.tar.gz refcat-338d6a6c4554a9f7afcb2f572943f276f7912995.zip |
find doi in url
Diffstat (limited to 'skate')
-rw-r--r-- | skate/README.md | 3 | ||||
-rw-r--r-- | skate/cmd/skate-from-unstructured/main.go | 19 | ||||
-rw-r--r-- | skate/verify.go | 6 |
3 files changed, 18 insertions, 10 deletions
diff --git a/skate/README.md b/skate/README.md index 1962dc6..cc7e238 100644 --- a/skate/README.md +++ b/skate/README.md @@ -56,6 +56,9 @@ After this step: * cluster, e.g. `skate-cluster ...` ### skate-from-unstructured + + + ### skate-ref-to-release ### skate-to-doi ### skate-verify diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index 0208d91..8ebc613 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -33,16 +33,10 @@ func main() { if err := json.Unmarshal(p, &ref); err != nil { return nil, err } - // TODO: ref if err := parseUnstructured(&ref); err != nil { return nil, err } - b, err := json.Marshal(ref) - if err != nil { - return nil, err - } - b = append(b, bytesNewline...) - return b, nil + return skate.JsonMarshalLine(&ref) }) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize @@ -74,6 +68,17 @@ func parseUnstructured(ref *skate.Ref) error { if v != "" && ref.Biblio.DOI == "" { ref.Biblio.DOI = v } + // DOI in URL + prefixes := []string{"http://doi.org/", "https://doi.org/", "http://dx.doi.org/", "https://dx.doi.org/"} + for _, prefix := range prefixes { + if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { + ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) + } + } + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } // Arxiv vs = PatArxivPDF.FindStringSubmatch(uns) if len(vs) != 0 && ref.Biblio.ArxivId == "" { diff --git a/skate/verify.go b/skate/verify.go index cfe31ec..40ee4e2 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -123,8 +123,8 @@ var ( PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) ) -// jsonMarshalLine marshals a value as JSON and adds a newline. -func jsonMarshalLine(v interface{}) ([]byte, error) { +// JsonMarshalLine marshals a value as JSON and adds a newline. +func JsonMarshalLine(v interface{}) ([]byte, error) { b, err := json.Marshal(v) if err != nil { return nil, err @@ -230,7 +230,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { continue // Assume we already have the DOI matches. } br = generateBiblioRef(re, pivot, result, "fuzzy") - return jsonMarshalLine(br) + return JsonMarshalLine(br) default: continue } |