From f5575f41f799be9bdfd9d0406710aeebe20d6350 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sat, 24 Jul 2021 18:12:53 -0700
Subject: add test for issued,accessed not being included in output JSON

---
 skate/schema_test.go | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
(limited to 'skate')

diff --git a/skate/schema_test.go b/skate/schema_test.go
index 3267072..7616001 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -1,6 +1,7 @@
 package skate

 import (
+    "bytes"
     "encoding/json"
     "fmt"
     "reflect"
@@ -256,6 +257,22 @@ func TestLinkHash(t *testing.T) {
     }
 }

+func TestSchemaMarshal(t *testing.T) {
+
+    // CSL where accessed is empty, but issued is not
+    var csl = CSL{
+        Title: "test-doc",
+        Issued: &CSLDate{
+            Raw: "2012",
+        },
+    }
+    var csl_json = []byte(`{"issued":{"raw":"2012"},"title":"test-doc"}`)
+    var csl_encoded, _ = json.Marshal(csl)
+    if bytes.Compare(csl_json, csl_encoded) != 0 {
+        t.Fatalf("got:\n%v\nwant:\n%v\n", string(csl_json[:]), string(csl_encoded[:]))
+    }
+}
+
 func TestReleaseToUnstructured(t *testing.T) {
     var cases = []struct {
         r *Release
-- cgit v1.2.3

From b2e4e4242c9d8d4fbdc026a80dfefa86697a5649 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sat, 24 Jul 2021 18:13:33 -0700
Subject: schema: have issued+accessed (CSLDate) actually omitempty

Similar to TargetCSL, these should be pointer types so they don't get
encoded as empty objects when not set.
---
 skate/reduce.go      | 2 +-
 skate/reduce_test.go | 4 ++--
 skate/schema.go      | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)
(limited to 'skate')

diff --git a/skate/reduce.go b/skate/reduce.go
index e2fa130..356ed25 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -598,7 +598,7 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
                 Title:  r.Biblio.Title,
                 URL:    r.Biblio.Url,
                 Volume: r.Biblio.Volume,
-                Issued: CSLDate{
+                Issued: &CSLDate{
                     Raw: year,
                 },
             }
diff --git a/skate/reduce_test.go b/skate/reduce_test.go
index 9c134f8..7cde68f 100644
--- a/skate/reduce_test.go
+++ b/skate/reduce_test.go
@@ -199,7 +199,7 @@ func TestMatchedRefsExtend(t *testing.T) {
             MatchReason: ReasonUnknown.Short(),
             SourceYear:  "0",
             TargetCSL: &CSL{
-                Accessed:            CSLDate{},
+                Accessed:            nil,
                 Author:              nil,
                 CollectionTitle:     "",
                 ContainerTitle:      "",
@@ -209,7 +209,7 @@
                 ISBN:                "",
                 ISSN:                "",
                 Issue:               "",
-                Issued:              CSLDate{},
+                Issued:              &CSLDate{},
                 JournalAbbreviation: "",
                 Language:            "",
                 NumberOfPages:       "",
diff --git a/skate/schema.go b/skate/schema.go
index 93c9680..50f52d6 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -431,7 +431,7 @@ type BiblioRef struct {
 // https://github.com/citation-style-language/schema,
 // https://navneethg.github.io/jsonschemaviewer/. This is a subset only.
 type CSL struct {
-    Accessed            CSLDate     `json:"accessed,omitempty"`
+    Accessed            *CSLDate    `json:"accessed,omitempty"`
     Author              []CSLAuthor `json:"author,omitempty"`
     CollectionTitle     string      `json:"collection-title,omitempty"`
     ContainerTitle      string      `json:"container-title,omitempty"`
@@ -441,7 +441,7 @@ type CSL struct {
     ISBN                string      `json:"ISBN,omitempty"`
     ISSN                string      `json:"ISSN,omitempty"`
     Issue               string      `json:"issue,omitempty"`
-    Issued              CSLDate     `json:"issued,omitempty"`
+    Issued              *CSLDate    `json:"issued,omitempty"`
     JournalAbbreviation string      `json:"journalAbbreviation,omitempty"`
     Language            string      `json:"language,omitempty"`
     NumberOfPages       string      `json:"number-of-pages,omitempty"`
-- cgit v1.2.3

From e038bfb6be24994ae9972d08f85c1cb1506a06b4 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 25 Jul 2021 12:46:00 -0700
Subject: skate: use date-parts for year, not 'raw'

---
 skate/reduce.go      | 13 +++++++------
 skate/schema_test.go |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)
(limited to 'skate')

diff --git a/skate/reduce.go b/skate/reduce.go
index 356ed25..4cd604a 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -574,7 +574,7 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
             var (
                 authors []CSLAuthor
                 isbn    string
-                year    string
+                issued  *CSLDate
             )
             for _, name := range r.Biblio.ContribRawNames {
                 authors = append(authors, CSLAuthor{Name: name})
@@ -582,8 +582,11 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
             if len(r.Biblio.Extra.ISBN) > 0 {
                 isbn = r.Biblio.Extra.ISBN[0]
             }
-            if r.Biblio.Year > 1500 && r.Biblio.Year < 2022 {
-                year = fmt.Sprintf("%d", r.Biblio.Year)
+            // TODO: need to update this "max year" number frequently?
+            if r.Biblio.Year > 1500 && r.Biblio.Year <= 2025 {
+                issued = &CSLDate{Parts: [][]int{{int(r.Biblio.Year)}}}
+            } else {
+                issued = &CSLDate{}
             }
             bref.TargetCSL = &CSL{
                 Author: authors,
@@ -598,9 +601,7 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
                 Title:  r.Biblio.Title,
                 URL:    r.Biblio.Url,
                 Volume: r.Biblio.Volume,
-                Issued: &CSLDate{
-                    Raw: year,
-                },
+                Issued: issued,
             }
         }
         // Reuse fields for debugging, for now.
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 7616001..4489bed 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -263,10 +263,10 @@ func TestSchemaMarshal(t *testing.T) {
     var csl = CSL{
         Title: "test-doc",
         Issued: &CSLDate{
-            Raw: "2012",
+            Parts: [][]int{{2012}},
         },
     }
-    var csl_json = []byte(`{"issued":{"raw":"2012"},"title":"test-doc"}`)
+    var csl_json = []byte(`{"issued":{"date-parts":[[2012]]},"title":"test-doc"}`)
     var csl_encoded, _ = json.Marshal(csl)
     if bytes.Compare(csl_json, csl_encoded) != 0 {
         t.Fatalf("got:\n%v\nwant:\n%v\n", string(csl_json[:]), string(csl_encoded[:]))
-- cgit v1.2.3

From a2cae69110df162b12b4a4aad33785fa35f6e5e8 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 25 Jul 2021 13:01:38 -0700
Subject: schema: switch from '.name' to '.raw_name' for un-parsed CSL name field

---
 skate/reduce.go      | 4 ++--
 skate/schema.go      | 4 ++--
 skate/schema_test.go | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)
(limited to 'skate')

diff --git a/skate/reduce.go b/skate/reduce.go
index 4cd604a..361d7ba 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -576,8 +576,8 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
                 isbn    string
                 issued  *CSLDate
             )
-            for _, name := range r.Biblio.ContribRawNames {
-                authors = append(authors, CSLAuthor{Name: name})
+            for _, raw_name := range r.Biblio.ContribRawNames {
+                authors = append(authors, CSLAuthor{RawName: raw_name})
             }
             if len(r.Biblio.Extra.ISBN) > 0 {
                 isbn = r.Biblio.Extra.ISBN[0]
diff --git a/skate/schema.go b/skate/schema.go
index 50f52d6..f36815f 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -481,12 +481,12 @@ type CSLDate struct {
 // family element. Institutional names may be delivered in the same way, but it
 // is preferred to set them instead as a literal element.
 //
-// We include Name, for holding unparsed name, which is not a literal.
+// We include RawName, for holding unparsed name, which is not a literal.
 type CSLAuthor struct {
     Family  string `json:"family,omitempty"`
     Given   string `json:"given,omitempty"`
     Literal string `json:"literal,omitempty"`
-    Name    string `json:"name,omitempty"`
+    RawName string `json:"raw_name,omitempty"`
 }

 func (b *BiblioRef) Reset() {
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 4489bed..59b1f58 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -221,11 +221,11 @@ func TestLinkHash(t *testing.T) {
     }{
         {
             bref:     BiblioRef{},
-            linkHash: "7cae9fc61f167bc26cc3839f15457fe87b2be4e1",
+            linkHash: "8b8c3f74dd1472aa8869ee3a58295b70c7064aa8",
         },
         {
             bref:     BiblioRef{SourceReleaseIdent: "123"},
-            linkHash: "a0969f96c14cb42d298117e1927bd409873173a2",
+            linkHash: "23d0f9e279ec533f46a6b220f7a5758ec0c9d9af",
         },
         {
             bref: BiblioRef{
-- cgit v1.2.3

From 3d6ea95540d1e5b225b2507808d5318ecea05b6b Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 25 Jul 2021 15:38:31 -0700
Subject: skate: pass-through match_provenance in more situations

---
 skate/reduce.go | 2 ++
 1 file changed, 2 insertions(+)
(limited to 'skate')

diff --git a/skate/reduce.go b/skate/reduce.go
index 361d7ba..76b511e 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -408,6 +408,7 @@ func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
                     cdx.Summary.Ok, cdx.Line)
             }
         }
+        bref.MatchProvenance = ref.RefSource
         bref.MatchStatus = StatusExact.Short()
         bref.MatchReason = ReasonURLMatch.Short()
         if err := enc.Encode(bref); err != nil {
@@ -605,6 +606,7 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [
             }
         }
         // Reuse fields for debugging, for now.
+        bref.MatchProvenance = r.RefSource
        bref.MatchStatus = StatusUnmatched.Short()
        bref.MatchReason = ReasonUnknown.Short()
        matched = append(matched, &bref)
-- cgit v1.2.3

From 1c1c2cb5eb983ae26a8a445aee081b147cf0f652 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 25 Jul 2021 15:54:38 -0700
Subject: skate unstructured: don't parse DOI out of key

DOIs in keys, usually from Crossref, are the DOI of the *source* of the
reference, not the *target* of the reference. Thus, they should not be
parsed and copied to the ref.biblio.doi field.
---
 skate/unstructured.go | 16 ----------------
 1 file changed, 16 deletions(-)
(limited to 'skate')

diff --git a/skate/unstructured.go b/skate/unstructured.go
index f2c1d21..39821a1 100644
--- a/skate/unstructured.go
+++ b/skate/unstructured.go
@@ -24,33 +24,17 @@ func ParseUnstructured(ref *Ref) error {
         v  string
         vs []string
     )
-    // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
-    // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
-    if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
-        parts := strings.Split(strings.ToLower(ref.Key), "-bib")
-        ref.Biblio.DOI = parts[0]
-    }
     // DOI
     v = PatDOI.FindString(uns)
     if v != "" && ref.Biblio.DOI == "" {
         ref.Biblio.DOI = v
     }
-    // DOI in Key
-    v = PatDOINoHyphen.FindString(ref.Key)
-    if v != "" && ref.Biblio.DOI == "" {
-        ref.Biblio.DOI = v
-    }
     // DOI in URL
     for _, prefix := range DOILinkPrefixes {
         if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
             ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
         }
     }
-    // Another DOI pattern.
-    v = PatDOINoHyphen.FindString(ref.Key)
-    if v != "" && ref.Biblio.DOI == "" {
-        ref.Biblio.DOI = v
-    }
     // Arxiv
     vs = PatArxiv.FindStringSubmatch(uns)
     if len(vs) != 0 && ref.Biblio.ArxivId == "" {
-- cgit v1.2.3

From 9df1ff863eb9729faa9b46effb460c74203969f7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 25 Jul 2021 16:29:22 -0700
Subject: skate: fast SanitizeDOI helper for normalizing DOIs

---
 skate/doi.go      | 39 +++++++++++++++++++++++++++++++++++++++
 skate/doi_test.go | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 skate/doi.go
 create mode 100644 skate/doi_test.go
(limited to 'skate')

diff --git a/skate/doi.go b/skate/doi.go
new file mode 100644
index 0000000..8f6049e
--- /dev/null
+++ b/skate/doi.go
@@ -0,0 +1,39 @@
+package skate
+
+import (
+    "strings"
+)
+
+// SanitizeDOI will both normalize and verify a raw DOI string. It is roughly a
+// re-implementation of the simple 'clean_doi()' python function.
+// It should handle DOI URLs, prefixes, and some forms of mangling, though it
+// does not (yet) handle some specific OCR or ML parsing errors (eg, common mangled
+// suffixes).
+// At least lower-cases all DOIs, for more permissive matching.
+// Does not validate or convert non-ASCII characters.
+// Intended to be performant and used liberally; does not execute any regexes.
+// Returns empty string if the input is definitely not a DOI, though is
+// relatively permissive and does little validation.
+func SanitizeDOI(raw string) string {
+    // short-circuits
+    if len(raw) < 8 || !strings.Contains(raw, "10.") {
+        return ""
+    }
+
+    // lower-case and trim whitespace
+    raw = strings.ToLower(strings.TrimSpace(raw))
+
+    // if doesn't start with 10., strip any prefix
+    start := strings.Index(raw, "10.")
+    if start == -1 {
+        return ""
+    } else if start > 0 {
+        raw = raw[start:len(raw)]
+    }
+
+    // final simple checks
+    if len(raw) < 8 || !strings.Contains(raw, "/") {
+        return ""
+    }
+    return raw
+}
diff --git a/skate/doi_test.go b/skate/doi_test.go
new file mode 100644
index 0000000..7a184d3
--- /dev/null
+++ b/skate/doi_test.go
@@ -0,0 +1,32 @@
+package skate
+
+import "testing"
+
+func TestSanitizeDOI(t *testing.T) {
+    var cases = []struct {
+        in  string
+        out string
+    }{
+        {"", ""},
+        {"a", ""},
+        {"???", ""},
+        {"10.1234", ""},
+        {"10.1234/asdf ", "10.1234/asdf"},
+        {"10.1234/ASDF", "10.1234/asdf"},
+        {"10.1037/0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
+        {"http://doi.org/10.1234/asdf ", "10.1234/asdf"},
+        {"http://doi.org/10.123", ""},
+        {"dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
+        {"21924DOI10.1234/asdf ", "10.1234/asdf"},
+        {"https://dx.doi.org/10.1234/asdf ", "10.1234/asdf"},
+        {"doi:10.1234/asdf ", "10.1234/asdf"},
+        {"10.7326/M20-6817", "10.7326/m20-6817"},
+        // TODO: {"10.1037//0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"},
+    }
+    for _, c := range cases {
+        out := SanitizeDOI(c.in)
+        if out != c.out {
+            t.Fatalf("got %v, want %v", out, c.out)
+        }
+    }
+}
-- cgit v1.2.3

From 0d4c3ca311b1057bdb07144b0ac8ba860be2de55 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 25 Jul 2021 16:36:52 -0700
Subject: skate: use SanitizeDOI in all inputs

---
 skate/cmd/skate-wikipedia-doi/main.go |  2 +-
 skate/schema.go                       |  4 ++--
 skate/unstructured.go                 | 21 ++++-----------------
 skate/unstructured_test.go            |  4 ++--
 4 files changed, 9 insertions(+), 22 deletions(-)
(limited to 'skate')

diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index a6d82c0..3f7afde 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -39,7 +39,7 @@ func main() {
             return nil, nil
         }
         var (
-            doi       = wsReplacer.Replace(match[0])
+            doi       = skate.SanitizeDOI(wsReplacer.Replace(match[0]))
             pageTitle = strings.TrimSpace(w.PageTitle)
             s         = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p))
         )
diff --git a/skate/schema.go b/skate/schema.go
index f36815f..d6b4ded 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -94,7 +94,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
     release.Ident = ref.ReleaseIdent
     release.WorkID = ref.WorkIdent
     release.ExtIDs.Arxiv = b.ArxivId
-    release.ExtIDs.DOI = b.DOI
+    release.ExtIDs.DOI = SanitizeDOI(b.DOI)
     release.ExtIDs.PMID = b.PMID
     release.ExtIDs.PMCID = b.PMCID
     release.Title = b.Title
@@ -616,7 +616,7 @@ func (c *MinimalCitations) ParseIDList() (result IDList) {
         case "ISBN":
             result.ISBN = pair[1]
         case "DOI":
-            result.DOI = pair[1]
+            result.DOI = SanitizeDOI(pair[1])
         case "PMID":
             result.PMID = pair[1]
         case "ISSN":
diff --git a/skate/unstructured.go b/skate/unstructured.go
index 39821a1..a172e8b 100644
--- a/skate/unstructured.go
+++ b/skate/unstructured.go
@@ -2,19 +2,12 @@ package skate

 import (
     "regexp"
-    "strings"
 )

 var (
-    PatDOI          = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-    PatDOINoHyphen  = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
-    PatArxiv        = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
-    DOILinkPrefixes = []string{
-        "http://doi.org/",
-        "http://dx.doi.org/",
-        "https://doi.org/",
-        "https://dx.doi.org/",
-    }
+    PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+    PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
+    PatArxiv       = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
 )

 // ParseUnstructured will in-place augment missing DOI, arxiv id and so on.
@@ -27,13 +20,7 @@ func ParseUnstructured(ref *Ref) error {
     // DOI
     v = PatDOI.FindString(uns)
     if v != "" && ref.Biblio.DOI == "" {
-        ref.Biblio.DOI = v
+        ref.Biblio.DOI = SanitizeDOI(v)
     }
-    // DOI in URL
-    for _, prefix := range DOILinkPrefixes {
-        if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
-            ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
-        }
-    }
     // Arxiv
     vs = PatArxiv.FindStringSubmatch(uns)
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
index 92f1d80..1727430 100644
--- a/skate/unstructured_test.go
+++ b/skate/unstructured_test.go
@@ -20,7 +20,7 @@ func TestParseUnstructured(t *testing.T) {
         },
         &Ref{
             Biblio: Biblio{
-                DOI:          "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+                DOI:          "10.1111/j.1550-7408.1968.tb02138.x-bib5",
                 Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
             },
         },
@@ -35,7 +35,7 @@ func TestParseUnstructured(t *testing.T) {
         &Ref{
             Biblio: Biblio{
                 ArxivId:      "0808.3320",
-                DOI:          "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+                DOI:          "10.1111/j.1550-7408.1968.tb02138.x-bib5",
                 Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
             },
         },
-- cgit v1.2.3
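
A note on the omitempty change above (commit b2e4e424): Go's encoding/json only honors the omitempty tag option for values it considers empty (false, 0, nil pointers and interfaces, and empty strings, slices, and maps). A struct value is never treated as empty, so a zero CSLDate still marshals as "issued":{}, while a nil *CSLDate field is dropped from the output entirely. Below is a minimal, self-contained sketch of that behavior; the CSLValueStyle and CSLPointerStyle type names are illustrative stand-ins, not types from the skate package, and the CSLDate here is a stripped-down copy.

package main

import (
    "encoding/json"
    "fmt"
)

// Stripped-down stand-in for the skate CSLDate type.
type CSLDate struct {
    Parts [][]int `json:"date-parts,omitempty"`
    Raw   string  `json:"raw,omitempty"`
}

// Issued as a plain struct field: omitempty never fires for struct values.
type CSLValueStyle struct {
    Issued CSLDate `json:"issued,omitempty"`
    Title  string  `json:"title,omitempty"`
}

// Issued as a pointer field: a nil pointer is omitted from the output.
type CSLPointerStyle struct {
    Issued *CSLDate `json:"issued,omitempty"`
    Title  string   `json:"title,omitempty"`
}

func main() {
    v, _ := json.Marshal(CSLValueStyle{Title: "test-doc"})
    p, _ := json.Marshal(CSLPointerStyle{Title: "test-doc"})
    fmt.Println(string(v)) // {"issued":{},"title":"test-doc"}
    fmt.Println(string(p)) // {"title":"test-doc"}
}

This is the same effect the TestSchemaMarshal test above asserts: with pointer fields, an unset accessed or issued date no longer shows up as an empty object in the encoded reference JSON.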