Merge branch 'bnewbold-skate-tweaks' into 'master'

proposed changes and fixes to skate matching. See merge request martin/cgraph!3
author: Martin Czygan <martin@archive.org> 2021-07-26 17:43:04 +0000
committer: Martin Czygan <martin@archive.org> 2021-07-26 17:43:04 +0000
commit: aeaa60211e33cb49da98770b3461cbca2c2a65cc (patch)
tree: 1def8dfcd4d2c035a8b5ee6d88507a1ad53a8b40 /skate
parent: befd7895262e2469367e2a4f71f78148b9986dee (diff)
parent: 0d4c3ca311b1057bdb07144b0ac8ba860be2de55 (diff)
download: refcat-aeaa60211e33cb49da98770b3461cbca2c2a65cc.tar.gz
refcat-aeaa60211e33cb49da98770b3461cbca2c2a65cc.zip
9 files changed, 116 insertions, 54 deletions
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index a6d82c0..3f7afde 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -39,7 +39,7 @@ func main() {
 			return nil, nil
 		}
 		var (
-			doi       = wsReplacer.Replace(match[0])
+			doi       = skate.SanitizeDOI(wsReplacer.Replace(match[0]))
 			pageTitle = strings.TrimSpace(w.PageTitle)
 			s         = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p))
 		)
diff --git a/skate/doi.go b/skate/doi.go
new file mode 100644
index 0000000..8f6049e
--- /dev/null
+++ b/skate/doi.go
@@ -0,0 +1,39 @@
+package skate
+
+import (
+	"strings"
+)
+
+// SanitizeDOI will both normalize and verify a raw DOI string. It is roughly a
+// re-implementation of the simple 'clean_doi()' python function.
+// It should handle DOI URLs, prefixes, and some forms of mangling, though it
+// does not (yet) handle some specific OCR or ML parsing errors (eg, common mangled
+// suffixes).
+// At a minimum, lower-cases all DOIs, for more permissive matching.
+// Does not validate or convert non-ASCII characters.
+// Intended to be performant and used liberally; does not execute any regexes.
+// Returns an empty string if the input is definitely not a DOI, though it is
+// relatively permissive and does little validation.
+func SanitizeDOI(raw string) string {
+	// short-circuits
+	if len(raw) < 8 || !strings.Contains(raw, "10.") {
+		return ""
+	}
+
+	// lower-case and trim whitespace
+	raw = strings.ToLower(strings.TrimSpace(raw))
+
+	// if it doesn't start with "10.", strip any prefix
+	start := strings.Index(raw, "10.")
+	if start == -1 {
+		return ""
+	} else if start > 0 {
+		raw = raw[start:len(raw)]
+	}
+
+	// final simple checks
+	if len(raw) < 8 || !strings.Contains(raw, "/") {
+		return ""
+	}
+	return raw
+}
diff --git a/skate/doi_test.go b/skate/doi_test.go
new file mode 100644
index 0000000..7a184d3
--- /dev/null
+++ b/skate/doi_test.go
@@ -0,0 +1,32 @@
+package skate
+
+import "testing"
+
+func TestSanitizeDOI(t *testing.T) {
+	var cases = []struct {
+		in  string
+		out string
+	}{
+		{"", ""},
+		{"a", ""},
+		{"???", ""},
+		{"10.1234", ""},
+		{"10.1234/asdf ", "10.1234/asdf"},
+		{"10.1234/ASDF", "10.1234/asdf"},
+		{"10.1037/0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
+		{"http://doi.org/10.1234/asdf ", "10.1234/asdf"},
+		{"http://doi.org/10.123", ""},
+		{"dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
+		{"21924DOI10.1234/asdf ", "10.1234/asdf"},
+		{"https://dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
+		{"doi:10.1234/asdf ", "10.1234/asdf"},
+		{"10.7326/M20-6817", "10.7326/m20-6817"},
+		// TODO: {"10.1037//0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
+	}
+	for _, c := range cases {
+		out := SanitizeDOI(c.in)
+		if out != c.out {
+			t.Fatalf("got %v, want %v", out, c.out)
+		}
+	}
+}
diff --git a/skate/reduce.go b/skate/reduce.go
index e2fa130..76b511e 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -408,6 +408,7 @@ func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
 						cdx.Summary.Ok, cdx.Line)
 				}
 			}
+			bref.MatchProvenance = ref.RefSource
 			bref.MatchStatus = StatusExact.Short()
 			bref.MatchReason = ReasonURLMatch.Short()
 			if err := enc.Encode(bref); err != nil {
@@ -574,16 +575,19 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
 			var (
 				authors []CSLAuthor
 				isbn    string
-				year    string
+				issued  *CSLDate
 			)
-			for _, name := range r.Biblio.ContribRawNames {
-				authors = append(authors, CSLAuthor{Name: name})
+			for _, raw_name := range r.Biblio.ContribRawNames {
+				authors = append(authors, CSLAuthor{RawName: raw_name})
 			}
 			if len(r.Biblio.Extra.ISBN) > 0 {
 				isbn = r.Biblio.Extra.ISBN[0]
 			}
-			if r.Biblio.Year > 1500 && r.Biblio.Year < 2022 {
-				year = fmt.Sprintf("%d", r.Biblio.Year)
+			// TODO: need to update this "max year" number frequently?
+			if r.Biblio.Year > 1500 && r.Biblio.Year <= 2025 {
+				issued = &CSLDate{Parts: [][]int{{int(r.Biblio.Year)}}}
+			} else {
+				issued = &CSLDate{}
 			}
 			bref.TargetCSL = &CSL{
 				Author:         authors,
@@ -598,12 +602,11 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
 				Title:          r.Biblio.Title,
 				URL:            r.Biblio.Url,
 				Volume:         r.Biblio.Volume,
-				Issued: CSLDate{
-					Raw: year,
-				},
+				Issued:         issued,
 			}
 		}
 		// Reuse fields for debugging, for now.
+		bref.MatchProvenance = r.RefSource
 		bref.MatchStatus = StatusUnmatched.Short()
 		bref.MatchReason = ReasonUnknown.Short()
 		matched = append(matched, &bref)
diff --git a/skate/reduce_test.go b/skate/reduce_test.go
index 9c134f8..7cde68f 100644
--- a/skate/reduce_test.go
+++ b/skate/reduce_test.go
@@ -199,7 +199,7 @@ func TestMatchedRefsExtend(t *testing.T) {
 					MatchReason:        ReasonUnknown.Short(),
 					SourceYear:         "0",
 					TargetCSL: &CSL{
-						Accessed:            CSLDate{},
+						Accessed:            nil,
 						Author:              nil,
 						CollectionTitle:     "",
 						ContainerTitle:      "",
@@ -209,7 +209,7 @@ func TestMatchedRefsExtend(t *testing.T) {
 						ISBN:                "",
 						ISSN:                "",
 						Issue:               "",
-						Issued:              CSLDate{},
+						Issued:              &CSLDate{},
 						JournalAbbreviation: "",
 						Language:            "",
 						NumberOfPages:       "",
diff --git a/skate/schema.go b/skate/schema.go
index 93c9680..d6b4ded 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -94,7 +94,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
 	release.Ident = ref.ReleaseIdent
 	release.WorkID = ref.WorkIdent
 	release.ExtIDs.Arxiv = b.ArxivId
-	release.ExtIDs.DOI = b.DOI
+	release.ExtIDs.DOI = SanitizeDOI(b.DOI)
 	release.ExtIDs.PMID = b.PMID
 	release.ExtIDs.PMCID = b.PMCID
 	release.Title = b.Title
@@ -431,7 +431,7 @@ type BiblioRef struct {
 // https://github.com/citation-style-language/schema,
 // https://navneethg.github.io/jsonschemaviewer/. This is a subset only.
 type CSL struct {
-	Accessed            CSLDate     `json:"accessed,omitempty"`
+	Accessed            *CSLDate    `json:"accessed,omitempty"`
 	Author              []CSLAuthor `json:"author,omitempty"`
 	CollectionTitle     string      `json:"collection-title,omitempty"`
 	ContainerTitle      string      `json:"container-title,omitempty"`
@@ -441,7 +441,7 @@ type CSL struct {
 	ISBN                string      `json:"ISBN,omitempty"`
 	ISSN                string      `json:"ISSN,omitempty"`
 	Issue               string      `json:"issue,omitempty"`
-	Issued              CSLDate     `json:"issued,omitempty"`
+	Issued              *CSLDate    `json:"issued,omitempty"`
 	JournalAbbreviation string      `json:"journalAbbreviation,omitempty"`
 	Language            string      `json:"language,omitempty"`
 	NumberOfPages       string      `json:"number-of-pages,omitempty"`
@@ -481,12 +481,12 @@ type CSLDate struct {
 // family element. Institutional names may be delivered in the same way, but it
 // is preferred to set them instead as a literal element.
 //
-// We include Name, for holding unparsed name, which is not a literal.
+// We include RawName, for holding unparsed name, which is not a literal.
 type CSLAuthor struct {
 	Family  string `json:"family,omitempty"`
 	Given   string `json:"given,omitempty"`
 	Literal string `json:"literal,omitempty"`
-	Name    string `json:"name,omitempty"`
+	RawName string `json:"raw_name,omitempty"`
 }
 
 func (b *BiblioRef) Reset() {
@@ -616,7 +616,7 @@ func (c *MinimalCitations) ParseIDList() (result IDList) {
 		case "ISBN":
 			result.ISBN = pair[1]
 		case "DOI":
-			result.DOI = pair[1]
+			result.DOI = SanitizeDOI(pair[1])
 		case "PMID":
 			result.PMID = pair[1]
 		case "ISSN":
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 3267072..59b1f58 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -1,6 +1,7 @@
 package skate
 
 import (
+	"bytes"
 	"encoding/json"
 	"fmt"
 	"reflect"
@@ -220,11 +221,11 @@ func TestLinkHash(t *testing.T) {
 	}{
 		{
 			bref:     BiblioRef{},
-			linkHash: "7cae9fc61f167bc26cc3839f15457fe87b2be4e1",
+			linkHash: "8b8c3f74dd1472aa8869ee3a58295b70c7064aa8",
 		},
 		{
 			bref:     BiblioRef{SourceReleaseIdent: "123"},
-			linkHash: "a0969f96c14cb42d298117e1927bd409873173a2",
+			linkHash: "23d0f9e279ec533f46a6b220f7a5758ec0c9d9af",
 		},
 		{
 			bref: BiblioRef{
@@ -256,6 +257,22 @@ func TestLinkHash(t *testing.T) {
 	}
 }
 
+func TestSchemaMarshal(t *testing.T) {
+
+	// CSL when issued is set, but accessed is not (accessed must be omitted)
+	var csl = CSL{
+		Title: "test-doc",
+		Issued: &CSLDate{
+			Parts: [][]int{{2012}},
+		},
+	}
+	var csl_json = []byte(`{"issued":{"date-parts":[[2012]]},"title":"test-doc"}`)
+	var csl_encoded, _ = json.Marshal(csl)
+	if bytes.Compare(csl_json, csl_encoded) != 0 {
+		t.Fatalf("got:\n%v\nwant:\n%v\n", string(csl_json[:]), string(csl_encoded[:]))
+	}
+}
+
 func TestReleaseToUnstructured(t *testing.T) {
 	var cases = []struct {
 		r *Release
diff --git a/skate/unstructured.go b/skate/unstructured.go
index f2c1d21..a172e8b 100644
--- a/skate/unstructured.go
+++ b/skate/unstructured.go
@@ -2,19 +2,12 @@ package skate
 
 import (
 	"regexp"
-	"strings"
 )
 
 var (
-	PatDOI          = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-	PatDOINoHyphen  = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
-	PatArxiv        = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
-	DOILinkPrefixes = []string{
-		"http://doi.org/",
-		"http://dx.doi.org/",
-		"https://doi.org/",
-		"https://dx.doi.org/",
-	}
+	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
+	PatArxiv       = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
 )
 
 // ParseUnstructured will in-place augment missing DOI, arxiv id and so on.
@@ -24,32 +17,10 @@ func ParseUnstructured(ref *Ref) error {
 		v   string
 		vs  []string
 	)
-	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
-	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
-	if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
-		parts := strings.Split(strings.ToLower(ref.Key), "-bib")
-		ref.Biblio.DOI = parts[0]
-	}
 	// DOI
 	v = PatDOI.FindString(uns)
 	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// DOI in Key
-	v = PatDOINoHyphen.FindString(ref.Key)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// DOI in URL
-	for _, prefix := range DOILinkPrefixes {
-		if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
-			ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
-		}
-	}
-	// Another DOI pattern.
-	v = PatDOINoHyphen.FindString(ref.Key)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
+		ref.Biblio.DOI = SanitizeDOI(v)
 	}
 	// Arxiv
 	vs = PatArxiv.FindStringSubmatch(uns)
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
index 92f1d80..1727430 100644
--- a/skate/unstructured_test.go
+++ b/skate/unstructured_test.go
@@ -20,7 +20,7 @@ func TestParseUnstructured(t *testing.T) {
 			},
 			&Ref{
 				Biblio: Biblio{
-					DOI:          "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+					DOI:          "10.1111/j.1550-7408.1968.tb02138.x-bib5",
 					Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
 				},
 			},
@@ -35,7 +35,7 @@ func TestParseUnstructured(t *testing.T) {
 			&Ref{
 				Biblio: Biblio{
 					ArxivId:      "0808.3320",
-					DOI:          "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+					DOI:          "10.1111/j.1550-7408.1968.tb02138.x-bib5",
 					Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
 				},
 			},
author	Martin Czygan <martin@archive.org>	2021-07-26 17:43:04 +0000
committer	Martin Czygan <martin@archive.org>	2021-07-26 17:43:04 +0000
commit	aeaa60211e33cb49da98770b3461cbca2c2a65cc (patch)
tree	1def8dfcd4d2c035a8b5ee6d88507a1ad53a8b40 /skate
parent	befd7895262e2469367e2a4f71f78148b9986dee (diff)
parent	0d4c3ca311b1057bdb07144b0ac8ba860be2de55 (diff)
download	refcat-aeaa60211e33cb49da98770b3461cbca2c2a65cc.tar.gz refcat-aeaa60211e33cb49da98770b3461cbca2c2a65cc.zip