about | summary | refs | log | tree | commit | diff | stats
path: root/skate
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-07-25 16:36:52 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-07-25 16:36:52 -0700
commit 0d4c3ca311b1057bdb07144b0ac8ba860be2de55 (patch)
tree 8efe9f4eb161a1959c5e3fc900329e2a05abf1a6 /skate
parent 9df1ff863eb9729faa9b46effb460c74203969f7 (diff)
download refcat-0d4c3ca311b1057bdb07144b0ac8ba860be2de55.tar.gz
refcat-0d4c3ca311b1057bdb07144b0ac8ba860be2de55.zip
skate: use SanitizeDOI in all inputs
Diffstat (limited to 'skate')
-rw-r--r-- skate/cmd/skate-wikipedia-doi/main.go 2
-rw-r--r-- skate/schema.go 4
-rw-r--r-- skate/unstructured.go 21
-rw-r--r-- skate/unstructured_test.go 4
4 files changed, 9 insertions, 22 deletions
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index a6d82c0..3f7afde 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -39,7 +39,7 @@ func main() {
return nil, nil
}
var (
- doi = wsReplacer.Replace(match[0])
+ doi = skate.SanitizeDOI(wsReplacer.Replace(match[0]))
pageTitle = strings.TrimSpace(w.PageTitle)
s = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p))
)
diff --git a/skate/schema.go b/skate/schema.go
index f36815f..d6b4ded 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -94,7 +94,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
release.Ident = ref.ReleaseIdent
release.WorkID = ref.WorkIdent
release.ExtIDs.Arxiv = b.ArxivId
- release.ExtIDs.DOI = b.DOI
+ release.ExtIDs.DOI = SanitizeDOI(b.DOI)
release.ExtIDs.PMID = b.PMID
release.ExtIDs.PMCID = b.PMCID
release.Title = b.Title
@@ -616,7 +616,7 @@ func (c *MinimalCitations) ParseIDList() (result IDList) {
case "ISBN":
result.ISBN = pair[1]
case "DOI":
- result.DOI = pair[1]
+ result.DOI = SanitizeDOI(pair[1])
case "PMID":
result.PMID = pair[1]
case "ISSN":
diff --git a/skate/unstructured.go b/skate/unstructured.go
index 39821a1..a172e8b 100644
--- a/skate/unstructured.go
+++ b/skate/unstructured.go
@@ -2,19 +2,12 @@ package skate
import (
"regexp"
- "strings"
)
var (
- PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
- PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
- PatArxiv = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
- DOILinkPrefixes = []string{
- "http://doi.org/",
- "http://dx.doi.org/",
- "https://doi.org/",
- "https://dx.doi.org/",
- }
+ PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+ PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
+ PatArxiv = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
)
// ParseUnstructured will in-place augment missing DOI, arxiv id and so on.
@@ -27,13 +20,7 @@ func ParseUnstructured(ref *Ref) error {
// DOI
v = PatDOI.FindString(uns)
if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // DOI in URL
- for _, prefix := range DOILinkPrefixes {
- if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
- ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
- }
+ ref.Biblio.DOI = SanitizeDOI(v)
}
// Arxiv
vs = PatArxiv.FindStringSubmatch(uns)
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
index 92f1d80..1727430 100644
--- a/skate/unstructured_test.go
+++ b/skate/unstructured_test.go
@@ -20,7 +20,7 @@ func TestParseUnstructured(t *testing.T) {
},
&Ref{
Biblio: Biblio{
- DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ DOI: "10.1111/j.1550-7408.1968.tb02138.x-bib5",
Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
},
},
@@ -35,7 +35,7 @@ func TestParseUnstructured(t *testing.T) {
&Ref{
Biblio: Biblio{
ArxivId: "0808.3320",
- DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ DOI: "10.1111/j.1550-7408.1968.tb02138.x-bib5",
Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
},
},