diff options
author | Martin Czygan <martin@archive.org> | 2021-07-26 17:43:04 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-07-26 17:43:04 +0000 |
commit | aeaa60211e33cb49da98770b3461cbca2c2a65cc (patch) | |
tree | 1def8dfcd4d2c035a8b5ee6d88507a1ad53a8b40 /skate | |
parent | befd7895262e2469367e2a4f71f78148b9986dee (diff) | |
parent | 0d4c3ca311b1057bdb07144b0ac8ba860be2de55 (diff) | |
download | refcat-aeaa60211e33cb49da98770b3461cbca2c2a65cc.tar.gz refcat-aeaa60211e33cb49da98770b3461cbca2c2a65cc.zip |
Merge branch 'bnewbold-skate-tweaks' into 'master'
proposed changes and fixes to skate matching
See merge request martin/cgraph!3
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-wikipedia-doi/main.go | 2 | ||||
-rw-r--r-- | skate/doi.go | 39 | ||||
-rw-r--r-- | skate/doi_test.go | 32 | ||||
-rw-r--r-- | skate/reduce.go | 19 | ||||
-rw-r--r-- | skate/reduce_test.go | 4 | ||||
-rw-r--r-- | skate/schema.go | 12 | ||||
-rw-r--r-- | skate/schema_test.go | 21 | ||||
-rw-r--r-- | skate/unstructured.go | 37 | ||||
-rw-r--r-- | skate/unstructured_test.go | 4 |
9 files changed, 116 insertions, 54 deletions
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go index a6d82c0..3f7afde 100644 --- a/skate/cmd/skate-wikipedia-doi/main.go +++ b/skate/cmd/skate-wikipedia-doi/main.go @@ -39,7 +39,7 @@ func main() { return nil, nil } var ( - doi = wsReplacer.Replace(match[0]) + doi = skate.SanitizeDOI(wsReplacer.Replace(match[0])) pageTitle = strings.TrimSpace(w.PageTitle) s = fmt.Sprintf("%s\t%s\t%s", doi, pageTitle, string(p)) ) diff --git a/skate/doi.go b/skate/doi.go new file mode 100644 index 0000000..8f6049e --- /dev/null +++ b/skate/doi.go @@ -0,0 +1,39 @@ +package skate + +import ( + "strings" +) + +// SanitizeDOI will both normalize and verify a raw DOI string. It is roughly a +// re-implementation of the simple 'clean_doi()' python function. +// It should handle DOI URLs, prefixes, and some forms of mangling, though it +// does not (yet) handle some specific OCR or ML parsing errors (eg, common mangled +// suffixes). +// At least lower-cases all DOIs, for more permissive matching. +// Does not validate or convert non-ASCII characters. +// Intended to be performant and used liberally; does not execute any regexes. +// Returns empty string if the input is definitely not a DOI, though is +// relatively permissive and does little validation. +func SanitizeDOI(raw string) string { + // short-circuits + if len(raw) < 8 || !strings.Contains(raw, "10.") { + return "" + } + + // lower-case and trim whitespace + raw = strings.ToLower(strings.TrimSpace(raw)) + + // if doesn't start with 10., strip any prefix + start := strings.Index(raw, "10.") + if start == -1 { + return "" + } else if start > 0 { + raw = raw[start:len(raw)] + } + + // final simple checks + if len(raw) < 8 || !strings.Contains(raw, "/") { + return "" + } + return raw +} diff --git a/skate/doi_test.go b/skate/doi_test.go new file mode 100644 index 0000000..7a184d3 --- /dev/null +++ b/skate/doi_test.go @@ -0,0 +1,32 @@ +package skate + +import "testing" + +func TestSanitizeDOI(t *testing.T) { + var cases = []struct { + in string + out string + }{ + {"", ""}, + {"a", ""}, + {"???", ""}, + {"10.1234", ""}, + {"10.1234/asdf ", "10.1234/asdf"}, + {"10.1234/ASDF", "10.1234/asdf"}, + {"10.1037/0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"}, + {"http://doi.org/10.1234/asdf ", "10.1234/asdf"}, + {"http://doi.org/10.123", ""}, + {"dx.doi.org/10.1234/asdf ", "10.1234/asdf"}, + {"21924DOI10.1234/asdf ", "10.1234/asdf"}, + {"https://dx.doi.org/10.1234/asdf ", "10.1234/asdf"}, + {"doi:10.1234/asdf ", "10.1234/asdf"}, + {"10.7326/M20-6817", "10.7326/m20-6817"}, + // TODO: {"10.1037//0002-9432.72.1.50", "10.1037/0002-9432.72.1.50"}, + } + for _, c := range cases { + out := SanitizeDOI(c.in) + if out != c.out { + t.Fatalf("got %v, want %v", out, c.out) + } + } +} diff --git a/skate/reduce.go b/skate/reduce.go index e2fa130..76b511e 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -408,6 +408,7 @@ func ZippyWayback(refs, cdx io.Reader, w io.Writer) error { cdx.Summary.Ok, cdx.Line) } } + bref.MatchProvenance = ref.RefSource bref.MatchStatus = StatusExact.Short() bref.MatchReason = ReasonURLMatch.Short() if err := enc.Encode(bref); err != nil { @@ -574,16 +575,19 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [ var ( authors []CSLAuthor isbn string - year string + issued *CSLDate ) - for _, name := range r.Biblio.ContribRawNames { - authors = append(authors, CSLAuthor{Name: name}) + for _, raw_name := range r.Biblio.ContribRawNames { + authors = append(authors, CSLAuthor{RawName: raw_name}) } if len(r.Biblio.Extra.ISBN) > 0 { isbn = r.Biblio.Extra.ISBN[0] } - if r.Biblio.Year > 1500 && r.Biblio.Year < 2022 { - year = fmt.Sprintf("%d", r.Biblio.Year) + // TODO: need to update this "max year" number frequently? + if r.Biblio.Year > 1500 && r.Biblio.Year <= 2025 { + issued = &CSLDate{Parts: [][]int{{int(r.Biblio.Year)}}} + } else { + issued = &CSLDate{} } bref.TargetCSL = &CSL{ Author: authors, @@ -598,12 +602,11 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) [ Title: r.Biblio.Title, URL: r.Biblio.Url, Volume: r.Biblio.Volume, - Issued: CSLDate{ - Raw: year, - }, + Issued: issued, } } // Reuse fields for debugging, for now. + bref.MatchProvenance = r.RefSource bref.MatchStatus = StatusUnmatched.Short() bref.MatchReason = ReasonUnknown.Short() matched = append(matched, &bref) diff --git a/skate/reduce_test.go b/skate/reduce_test.go index 9c134f8..7cde68f 100644 --- a/skate/reduce_test.go +++ b/skate/reduce_test.go @@ -199,7 +199,7 @@ func TestMatchedRefsExtend(t *testing.T) { MatchReason: ReasonUnknown.Short(), SourceYear: "0", TargetCSL: &CSL{ - Accessed: CSLDate{}, + Accessed: nil, Author: nil, CollectionTitle: "", ContainerTitle: "", @@ -209,7 +209,7 @@ func TestMatchedRefsExtend(t *testing.T) { ISBN: "", ISSN: "", Issue: "", - Issued: CSLDate{}, + Issued: &CSLDate{}, JournalAbbreviation: "", Language: "", NumberOfPages: "", diff --git a/skate/schema.go b/skate/schema.go index 93c9680..d6b4ded 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -94,7 +94,7 @@ func RefToRelease(ref *Ref) (*Release, error) { release.Ident = ref.ReleaseIdent release.WorkID = ref.WorkIdent release.ExtIDs.Arxiv = b.ArxivId - release.ExtIDs.DOI = b.DOI + release.ExtIDs.DOI = SanitizeDOI(b.DOI) release.ExtIDs.PMID = b.PMID release.ExtIDs.PMCID = b.PMCID release.Title = b.Title @@ -431,7 +431,7 @@ type BiblioRef struct { // https://github.com/citation-style-language/schema, // https://navneethg.github.io/jsonschemaviewer/. This is a subset only. type CSL struct { - Accessed CSLDate `json:"accessed,omitempty"` + Accessed *CSLDate `json:"accessed,omitempty"` Author []CSLAuthor `json:"author,omitempty"` CollectionTitle string `json:"collection-title,omitempty"` ContainerTitle string `json:"container-title,omitempty"` @@ -441,7 +441,7 @@ type CSL struct { ISBN string `json:"ISBN,omitempty"` ISSN string `json:"ISSN,omitempty"` Issue string `json:"issue,omitempty"` - Issued CSLDate `json:"issued,omitempty"` + Issued *CSLDate `json:"issued,omitempty"` JournalAbbreviation string `json:"journalAbbreviation,omitempty"` Language string `json:"language,omitempty"` NumberOfPages string `json:"number-of-pages,omitempty"` @@ -481,12 +481,12 @@ type CSLDate struct { // family element. Institutional names may be delivered in the same way, but it // is preferred to set them instead as a literal element. // -// We include Name, for holding unparsed name, which is not a literal. +// We include RawName, for holding unparsed name, which is not a literal. type CSLAuthor struct { Family string `json:"family,omitempty"` Given string `json:"given,omitempty"` Literal string `json:"literal,omitempty"` - Name string `json:"name,omitempty"` + RawName string `json:"raw_name,omitempty"` } func (b *BiblioRef) Reset() { @@ -616,7 +616,7 @@ func (c *MinimalCitations) ParseIDList() (result IDList) { case "ISBN": result.ISBN = pair[1] case "DOI": - result.DOI = pair[1] + result.DOI = SanitizeDOI(pair[1]) case "PMID": result.PMID = pair[1] case "ISSN": diff --git a/skate/schema_test.go b/skate/schema_test.go index 3267072..59b1f58 100644 --- a/skate/schema_test.go +++ b/skate/schema_test.go @@ -1,6 +1,7 @@ package skate import ( + "bytes" "encoding/json" "fmt" "reflect" @@ -220,11 +221,11 @@ func TestLinkHash(t *testing.T) { }{ { bref: BiblioRef{}, - linkHash: "7cae9fc61f167bc26cc3839f15457fe87b2be4e1", + linkHash: "8b8c3f74dd1472aa8869ee3a58295b70c7064aa8", }, { bref: BiblioRef{SourceReleaseIdent: "123"}, - linkHash: "a0969f96c14cb42d298117e1927bd409873173a2", + linkHash: "23d0f9e279ec533f46a6b220f7a5758ec0c9d9af", }, { bref: BiblioRef{ @@ -256,6 +257,22 @@ func TestLinkHash(t *testing.T) { } } +func TestSchemaMarshal(t *testing.T) { + + // CSL when issued is empty, but accepted is not + var csl = CSL{ + Title: "test-doc", + Issued: &CSLDate{ + Parts: [][]int{{2012}}, + }, + } + var csl_json = []byte(`{"issued":{"date-parts":[[2012]]},"title":"test-doc"}`) + var csl_encoded, _ = json.Marshal(csl) + if bytes.Compare(csl_json, csl_encoded) != 0 { + t.Fatalf("got:\n%v\nwant:\n%v\n", string(csl_json[:]), string(csl_encoded[:])) + } +} + func TestReleaseToUnstructured(t *testing.T) { var cases = []struct { r *Release diff --git a/skate/unstructured.go b/skate/unstructured.go index f2c1d21..a172e8b 100644 --- a/skate/unstructured.go +++ b/skate/unstructured.go @@ -2,19 +2,12 @@ package skate import ( "regexp" - "strings" ) var ( - PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) - PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) - PatArxiv = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) - DOILinkPrefixes = []string{ - "http://doi.org/", - "http://dx.doi.org/", - "https://doi.org/", - "https://dx.doi.org/", - } + PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) + PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) + PatArxiv = regexp.MustCompile(`https?://arxiv.org/(pdf|abs)/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) ) // ParseUnstructured will in-place augment missing DOI, arxiv id and so on. @@ -24,32 +17,10 @@ func ParseUnstructured(ref *Ref) error { v string vs []string ) - // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5, - // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ... - if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" { - parts := strings.Split(strings.ToLower(ref.Key), "-bib") - ref.Biblio.DOI = parts[0] - } // DOI v = PatDOI.FindString(uns) if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // DOI in Key - v = PatDOINoHyphen.FindString(ref.Key) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // DOI in URL - for _, prefix := range DOILinkPrefixes { - if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { - ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) - } - } - // Another DOI pattern. - v = PatDOINoHyphen.FindString(ref.Key) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v + ref.Biblio.DOI = SanitizeDOI(v) } // Arxiv vs = PatArxiv.FindStringSubmatch(uns) diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go index 92f1d80..1727430 100644 --- a/skate/unstructured_test.go +++ b/skate/unstructured_test.go @@ -20,7 +20,7 @@ func TestParseUnstructured(t *testing.T) { }, &Ref{ Biblio: Biblio{ - DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + DOI: "10.1111/j.1550-7408.1968.tb02138.x-bib5", Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", }, }, @@ -35,7 +35,7 @@ func TestParseUnstructured(t *testing.T) { &Ref{ Biblio: Biblio{ ArxivId: "0808.3320", - DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + DOI: "10.1111/j.1550-7408.1968.tb02138.x-bib5", Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", }, }, |