// TODO: The various grouping and verification functions should probably be in // a separate file and it should be obvious how to adjust or write a new one. //go:generate stringer -type=Status,Reason -output verify_string.go verify.go package skate import ( "bytes" "fmt" "io" "regexp" "strconv" "strings" "unicode/utf8" "git.archive.org/martin/cgraph/skate/set" "github.com/segmentio/encoding/json" ) // This file contains a port of fuzzycat.verify to Go. type ( // Status represents match strength. Status int // Reason gives more context to status result. Reason int ) const ( StatusUnknown Status = iota StatusExact StatusStrong StatusWeak StatusDifferent StatusAmbiguous StatusUnmatched ReasonAppendix Reason = iota ReasonArxiv ReasonArxivVersion ReasonBlacklisted ReasonBlacklistedFragment ReasonBookChapter ReasonChemFormula ReasonComponent ReasonContainer ReasonContainerNameBlacklist ReasonContribIntersectionEmpty ReasonCustomBSISubdoc ReasonCustomBSIUndated ReasonCustomIEEEArxiv ReasonCustomIOPMAPattern ReasonCustomPrefix1014288 ReasonCustomPrefix105860ChoiceReview ReasonCustomPrefix107916 ReasonCustomVHS ReasonDOI ReasonDataciteRelatedID ReasonDataciteVersion ReasonDatasetDOI ReasonFigshareVersion ReasonISBN ReasonJaccardAuthors ReasonJstorID ReasonMaxClusterSizeExceeded ReasonNumDiff ReasonPMCID ReasonPMID ReasonPMIDDOIPair ReasonPageCount ReasonPreprintPublished ReasonPublisherBlacklist ReasonReleaseType ReasonSharedDOIPrefix ReasonShortTitle ReasonSingularCluster ReasonSlugTitleAuthorMatch ReasonSubtitle ReasonTitleArtifact ReasonTitleAuthorMatch ReasonTitleFilename ReasonTokenizedAuthors ReasonURLMatch ReasonUnknown ReasonVersionedDOI ReasonWorkID ReasonYear ) // Short name. func (s Status) Short() string { return strings.ToLower(strings.Replace(s.String(), "Status", "", 1)) } // Short name. func (r Reason) Short() string { return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1)) } // MatchResult is the result of a verification. type MatchResult struct { Status Status Reason Reason } // VerificationPair groups two identifiers and their match status and // match reason. type MatchPair struct { A string B string Result MatchResult } // AsLine returns a TSV line of the match pair. func (m *MatchPair) AsLine() string { return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason) } var ( PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`) PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`) PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`) PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`) PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`) PatDigits = regexp.MustCompile(`\d+`) PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) ) // JsonMarshalNewline marshals a value as JSON and adds a newline. func JsonMarshalNewline(v interface{}) ([]byte, error) { b, err := json.Marshal(v) if err != nil { return nil, err } b = append(b, []byte("\n")...) return b, nil } // ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a // port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification. func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { var ( rc *ReleaseCluster buf bytes.Buffer n int ) if err := json.Unmarshal(p, &rc); err != nil { return nil, err } if n = len(rc.Values); n > maxClusterSize { return nil, nil } // O(n^2) ahead, specifically, n * (n-1) / 2. for i := 0; i < n; i++ { for j := i; j < n; j++ { if i == j { continue } a := rc.Values[i] b := rc.Values[j] matchPair := &MatchPair{ A: a.Ident, B: b.Ident, Result: Verify(a, b), } if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil { return nil, err } } } return buf.Bytes(), nil } // ClusterVerify runs verification process across all pairs, but skips clusters // containing more than ten elements. If a cluster has more then 10 elements, // it might also signal a too ambiguous title. Beside, we do not want this to // be too slow. func ClusterVerify(p []byte) ([]byte, error) { return ClusterVerifyMaxClusterSize(p, 10) } // RefClusterVerify deserializes a cluster document containing both converted // references and releases and returns a tabular verification result between // one (any) release and all references found. This depends on refs and releases // being distinguishable, (e.g. via .extra.skate.status == "ref"). func RefClusterVerify(p []byte) ([]byte, error) { var ( rc *ReleaseCluster buf bytes.Buffer pivot, re *Release err error ) if err = json.Unmarshal(p, &rc); err != nil { return nil, err } if pivot, err = rc.OneNonRef(); err != nil { return nil, err } for _, re = range rc.Values { if re.Extra.Skate.Status != "ref" { continue } matchPair := &MatchPair{ A: pivot.Ident, B: re.Ident, Result: Verify(pivot, re), } if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil { return nil, err } } return buf.Bytes(), nil } // RefClusterToBiblioRef runs verification and creates a BiblioRef schema from // exact and strong matches only. func RefClusterToBiblioRef(p []byte) ([]byte, error) { var ( rc *ReleaseCluster br *BiblioRef buf bytes.Buffer pivot, re *Release err error ) if err = json.Unmarshal(p, &rc); err != nil { return nil, err } if pivot, err = rc.OneNonRef(); err != nil { return nil, err } for _, re = range rc.Values { if re.Extra.Skate.Status != "ref" { continue } result := Verify(pivot, re) switch result.Status { case StatusExact, StatusStrong: if result.Reason == ReasonDOI { continue // Assume we already have the DOI matches. } br = generateBiblioRef(re, pivot, result, "fuzzy") return JsonMarshalNewline(br) default: // XXX: may want to include non matches here. continue } } return buf.Bytes(), nil } // generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd. func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef { var bref BiblioRef bref.SourceReleaseIdent = source.Ident bref.SourceWorkIdent = source.WorkID bref.SourceReleaseStage = source.ReleaseStage if source.ReleaseYear() > 1000 { bref.SourceYear = source.ReleaseYearString() } bref.RefIndex = source.Extra.Skate.Ref.Index bref.RefKey = source.Extra.Skate.Ref.Key bref.TargetReleaseIdent = target.Ident bref.TargetWorkIdent = target.WorkID bref.MatchProvenance = provenance bref.MatchStatus = matchResult.Status.Short() bref.MatchReason = matchResult.Reason.Short() return &bref } // Verify verifies two releases and will ignore short titles. func Verify(a, b *Release) MatchResult { return VerifyMinTitleLength(a, b, 5) } // Verify follows the fuzzycat (Python) implementation of this function: it // compares two release entities. The Go version can be used for large batch // processing (where the Python version might take two or more days). func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult { if a.ExtIDs.DOI != "" && a.ExtIDs.DOI == b.ExtIDs.DOI { return MatchResult{StatusExact, ReasonDOI} } if a.WorkID != "" && a.WorkID == b.WorkID { return MatchResult{StatusExact, ReasonWorkID} } aTitleLower := strings.ToLower(a.Title) bTitleLower := strings.ToLower(b.Title) if utf8.RuneCountInString(a.Title) < minTitleLength { return MatchResult{StatusAmbiguous, ReasonShortTitle} } if BlacklistTitle.Contains(aTitleLower) { return MatchResult{StatusAmbiguous, ReasonBlacklisted} } if BlacklistTitle.Contains(bTitleLower) { return MatchResult{StatusAmbiguous, ReasonBlacklisted} } for _, fragment := range BlacklistTitleFragments.Slice() { if strings.Contains(aTitleLower, fragment) { return MatchResult{StatusAmbiguous, ReasonBlacklistedFragment} } } if strings.Contains(aTitleLower, "subject index") && strings.Contains(bTitleLower, "subject index") { if a.ContainerID != "" && a.ContainerID != b.ContainerID { return MatchResult{StatusDifferent, ReasonContainer} } } if a.Title != "" && a.Title == b.Title && a.Extra.DataCite.MetadataVersion > 0 && b.Extra.DataCite.MetadataVersion > 0 && a.Extra.DataCite.MetadataVersion != b.Extra.DataCite.MetadataVersion { return MatchResult{StatusExact, ReasonDataciteVersion} } if strings.HasPrefix(a.ExtIDs.DOI, "10.14288/") && strings.HasPrefix(b.ExtIDs.DOI, "10.14288/") && a.ExtIDs.DOI != b.ExtIDs.DOI { return MatchResult{StatusDifferent, ReasonCustomPrefix1014288} } if strings.HasPrefix(a.ExtIDs.DOI, "10.3403") && strings.HasPrefix(b.ExtIDs.DOI, "10.3403") { if a.ExtIDs.DOI+"u" == b.ExtIDs.DOI || b.ExtIDs.DOI+"u" == a.ExtIDs.DOI { return MatchResult{StatusStrong, ReasonCustomBSIUndated} } aSubtitle := a.Subtitle() bSubtitle := b.Subtitle() if a.Title != "" && a.Title == b.Title && ((len(aSubtitle) > 0 && aSubtitle[0] != "" && len(bSubtitle) == 0) || (len(aSubtitle) == 0 && len(bSubtitle) > 0 && bSubtitle[0] != "")) { return MatchResult{StatusStrong, ReasonCustomBSISubdoc} } } if strings.HasPrefix(a.ExtIDs.DOI, "10.1149") && strings.HasPrefix(b.ExtIDs.DOI, "10.1149") { v := "10.1149/ma" if (strings.HasPrefix(a.ExtIDs.DOI, v) && !strings.HasPrefix(b.ExtIDs.DOI, v)) || (!strings.HasPrefix(a.ExtIDs.DOI, v) && strings.HasPrefix(b.ExtIDs.DOI, v)) { return MatchResult{StatusDifferent, ReasonCustomIOPMAPattern} } } if strings.Contains(a.Title, "Zweckverband Volkshochschule") && a.Title != b.Title { return MatchResult{StatusDifferent, ReasonCustomVHS} } if PatAppendix.MatchString(a.Title) { return MatchResult{StatusAmbiguous, ReasonAppendix} } if strings.HasPrefix(a.ExtIDs.DOI, "10.6084/") && strings.HasPrefix(b.ExtIDs.DOI, "10.6084/") { av := PatFigshareVersion.ReplaceAllString(a.ExtIDs.DOI, "") bv := PatFigshareVersion.ReplaceAllString(b.ExtIDs.DOI, "") if av == bv { return MatchResult{StatusStrong, ReasonFigshareVersion} } } if PatVersionedDOI.MatchString(a.ExtIDs.DOI) && PatVersionedDOI.MatchString(b.ExtIDs.DOI) { return MatchResult{StatusStrong, ReasonVersionedDOI} } if looksLikeComponent(a.ExtIDs.DOI, b.ExtIDs.DOI) { return MatchResult{StatusStrong, ReasonVersionedDOI} } if len(a.Extra.DataCite.Relations) > 0 || len(b.Extra.DataCite.Relations) > 0 { getRelatedDOI := func(rel *Release) set.Set { ss := set.New() for _, rel := range rel.Extra.DataCite.Relations { if strings.ToLower(rel.RelatedIdentifierType) != "doi" { continue } ss.Add(rel.RelatedIdentifier()) } return ss } aRelated := getRelatedDOI(a) bRelated := getRelatedDOI(b) if aRelated.Contains(b.ExtIDs.DOI) || bRelated.Contains(a.ExtIDs.DOI) { return MatchResult{StatusStrong, ReasonDataciteRelatedID} } } if a.ExtIDs.Arxiv != "" && b.ExtIDs.Arxiv != "" { aSub := PatArxivVersion.FindStringSubmatch(a.ExtIDs.Arxiv) bSub := PatArxivVersion.FindStringSubmatch(b.ExtIDs.Arxiv) if len(aSub) == 2 && len(bSub) == 2 && aSub[1] == bSub[1] { return MatchResult{StatusStrong, ReasonArxivVersion} } } if a.ReleaseType != b.ReleaseType { types := set.FromSlice([]string{a.ReleaseType, b.ReleaseType}) ignoreTypes := set.FromSlice([]string{"article", "article-journal", "report", "paper-conference"}) if types.Intersection(ignoreTypes).IsEmpty() { return MatchResult{StatusDifferent, ReasonReleaseType} } if types.Contains("dataset") && (types.Contains("article") || types.Contains("article-journal")) { return MatchResult{StatusDifferent, ReasonReleaseType} } if types.Contains("book") && (types.Contains("article") || types.Contains("article-journal")) { return MatchResult{StatusDifferent, ReasonReleaseType} } } if a.ReleaseType == "dataset" && b.ReleaseType == "dataset" && a.ExtIDs.DOI != b.ExtIDs.DOI { return MatchResult{StatusDifferent, ReasonDatasetDOI} } if a.ReleaseType == "chapter" && b.ReleaseType == "chapter" && a.Extra.ContainerName != "" && a.Extra.ContainerName != b.Extra.ContainerName { return MatchResult{StatusDifferent, ReasonBookChapter} } if a.Extra.Crossref.Type == "component" && a.Title != b.Title { return MatchResult{StatusDifferent, ReasonComponent} } if a.ReleaseType == "component" && b.ReleaseType == "component" { if a.ExtIDs.DOI != "" && a.ExtIDs.DOI != b.ExtIDs.DOI { return MatchResult{StatusDifferent, ReasonComponent} } } aSlugTitle := strings.TrimSpace(strings.Replace(slugifyString(a.Title), "\n", " ", -1)) bSlugTitle := strings.TrimSpace(strings.Replace(slugifyString(b.Title), "\n", " ", -1)) if aSlugTitle == bSlugTitle { if a.ReleaseYear() != 0 && b.ReleaseYear() != 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 40 { return MatchResult{StatusDifferent, ReasonYear} } } if aSlugTitle == bSlugTitle { ieeeArxivCheck := func(a, b *Release) (ok bool) { return doiPrefix(a.ExtIDs.DOI) == "10.1109" && b.ExtIDs.Arxiv != "" } if ieeeArxivCheck(a, b) || ieeeArxivCheck(b, a) { return MatchResult{StatusStrong, ReasonCustomIEEEArxiv} } } if aSlugTitle == bSlugTitle { if strings.HasPrefix(a.ExtIDs.DOI, "10.7916/") && strings.HasPrefix(b.ExtIDs.DOI, "10.7916/") { return MatchResult{StatusAmbiguous, ReasonCustomPrefix107916} } } if aSlugTitle == bSlugTitle { aSubtitle := a.Subtitle() bSubtitle := b.Subtitle() for _, aSub := range aSubtitle { for _, bSub := range bSubtitle { if slugifyString(aSub) != slugifyString(bSub) { return MatchResult{StatusDifferent, ReasonSubtitle} } } } } rawAuthors := func(rel *Release) (names []string) { for _, c := range rel.Contribs { name := strings.TrimSpace(c.RawName) if name == "" { continue } names = append(names, name) } return names } aAuthors := set.FromSlice(rawAuthors(a)) bAuthors := set.FromSlice(rawAuthors(b)) aSlugAuthors := set.FromSlice(mapString(slugifyString, aAuthors.Slice())) bSlugAuthors := set.FromSlice(mapString(slugifyString, bAuthors.Slice())) if aTitleLower == bTitleLower { if aAuthors.Len() > 0 && aAuthors.Equals(bAuthors) { if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 4 { return MatchResult{StatusDifferent, ReasonYear} } return MatchResult{StatusExact, ReasonTitleAuthorMatch} } } if looksLikeFilename(a.Title) || looksLikeFilename(b.Title) { if a.Title != b.Title { return MatchResult{StatusDifferent, ReasonTitleFilename} } } if a.Title != "" && a.Title == b.Title { if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 2 { return MatchResult{StatusDifferent, ReasonYear} } } // XXX: skipping chemical formula detection (to few cases; https://git.io/Jtdax) if len(aSlugTitle) < 10 && aSlugTitle != bSlugTitle { return MatchResult{StatusAmbiguous, ReasonShortTitle} } if PatDigits.MatchString(aSlugTitle) && aSlugTitle != bSlugTitle && unifyDigits(aSlugTitle) == unifyDigits(bSlugTitle) { return MatchResult{StatusDifferent, ReasonNumDiff} } if aSlugTitle != "" && bSlugTitle != "" && strings.ReplaceAll(aSlugTitle, " ", "") == strings.ReplaceAll(bSlugTitle, " ", "") { if aSlugAuthors.Intersection(bSlugAuthors).Len() > 0 { if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 4 { return MatchResult{StatusDifferent, ReasonYear} } return MatchResult{StatusStrong, ReasonSlugTitleAuthorMatch} } } if a.ReleaseYear() > 0 && a.ReleaseYear() == b.ReleaseYear() && aTitleLower == bTitleLower { if (a.ExtIDs.PMID != "" && b.ExtIDs.DOI != "") || (b.ExtIDs.PMID != "" && a.ExtIDs.DOI != "") { return MatchResult{StatusStrong, ReasonPMIDDOIPair} } } if a.ExtIDs.Jstor != "" && b.ExtIDs.Jstor != "" && a.ExtIDs.Jstor != b.ExtIDs.Jstor { return MatchResult{StatusDifferent, ReasonJstorID} } if a.ContainerID != "" && a.ContainerID == b.ContainerID && a.ExtIDs.DOI != b.ExtIDs.DOI && doiPrefix(a.ExtIDs.DOI) != "10.1126" && doiPrefix(a.ExtIDs.DOI) == doiPrefix(b.ExtIDs.DOI) { return MatchResult{StatusDifferent, ReasonSharedDOIPrefix} } if aAuthors.Len() > 0 && aSlugAuthors.Intersection(bSlugAuthors).IsEmpty() { numAuthors := set.Min(aSlugAuthors, bSlugAuthors) score := averageScore(aSlugAuthors, bSlugAuthors) if (numAuthors < 3 && score > 0.9) || (numAuthors >= 3 && score > 0.5) { return MatchResult{StatusStrong, ReasonTokenizedAuthors} } aTok := set.FromSlice(strings.Fields(aSlugAuthors.Join(" "))) bTok := set.FromSlice(strings.Fields(bSlugAuthors.Join(" "))) aTok = set.Filter(aTok, func(s string) bool { return len(s) > 2 }) bTok = set.Filter(bTok, func(s string) bool { return len(s) > 2 }) if aTok.Len() > 0 && bTok.Len() > 0 { if aTok.Jaccard(bTok) > 0.35 { return MatchResult{StatusStrong, ReasonJaccardAuthors} } } return MatchResult{StatusDifferent, ReasonContribIntersectionEmpty} } if doiPrefix(a.ExtIDs.DOI) == "10.5860" || doiPrefix(b.ExtIDs.DOI) == "10.5860" { return MatchResult{StatusAmbiguous, ReasonCustomPrefix105860ChoiceReview} } // XXX: parse pages aParsedPages := parsePageString(a.Pages) bParsedPages := parsePageString(b.Pages) if aParsedPages.Err != nil && bParsedPages.Err != nil { if absInt(aParsedPages.Count()-bParsedPages.Count()) > 5 { return MatchResult{StatusDifferent, ReasonPageCount} } } if aAuthors.Equals(bAuthors) && a.ContainerID == b.ContainerID && a.ReleaseYear() == b.ReleaseYear() && a.Title != b.Title && (strings.Contains(a.Title, b.Title) || strings.Contains(b.Title, a.Title)) { return MatchResult{StatusStrong, ReasonTitleArtifact} } return MatchResult{ StatusAmbiguous, ReasonUnknown, } } type ParsedPages struct { Start int End int Err error } func (pp *ParsedPages) Count() int { return pp.End - pp.Start + 1 } func parsePageString(s string) *ParsedPages { s = strings.TrimSpace(s) var pp = ParsedPages{} if len(s) == 0 { pp.Err = fmt.Errorf("parse pages: empty string") return &pp } matches := PatPages.FindStringSubmatch(s) if len(matches) != 3 { pp.Err = fmt.Errorf("parse pages: no page pattern") return &pp } start, end := matches[1], matches[2] if len(end) == 1 && len(start) > 1 && start[len(start)-1] < end[0] { end = fmt.Sprintf("%s%c", start[:len(start)-1], end[0]) } if pp.Start, pp.Err = strconv.Atoi(start); pp.Err != nil { return &pp } if pp.End, pp.Err = strconv.Atoi(end); pp.Err != nil { return &pp } if pp.Start > pp.End { pp.Err = fmt.Errorf("invalid page count: %s", s) } return &pp } // averageScore take a limited set of authors and calculates pairwise // similarity scores, then returns the average of the best scores; between 0 // and 1. XXX: This should be revisited and factored out; reading: // https://github.com/djudd/human-name. func averageScore(a, b set.Set) float64 { aTrimmed := a.TopK(5) bTrimmed := b.TopK(5) maxScores := make(map[string]float64) // For each a, keep the max. for _, pair := range aTrimmed.Product(bTrimmed) { a, b := pair[0], pair[1] score := authorSimilarityScore(a, b) if v, ok := maxScores[a]; !ok || score > v { maxScores[a] = score } } var sum, avg float64 for _, v := range maxScores { sum += v } avg = sum / float64(len(maxScores)) return avg } // authorSimilarityScore is a hacky similarity score. func authorSimilarityScore(s, t string) float64 { ss := set.FromSlice(tokenNgrams(s, 2)) ts := set.FromSlice(tokenNgrams(t, 2)) return ss.Jaccard(ts) } // tokenNgrams are groups of n char-tokens per word-token in string, e.g. for // n=2 and string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"]. func tokenNgrams(s string, n int) (result []string) { var buf bytes.Buffer for _, token := range tokenizeString(s) { buf.Reset() for i, c := range token { if i > 0 && i%n == 0 { result = append(result, buf.String()) buf.Reset() } buf.WriteRune(c) // XXX: skipping error handling } result = append(result, buf.String()) } return } func tokenizeString(s string) []string { return strings.Fields(strings.ToLower(s)) } func doiPrefix(s string) string { parts := strings.Split(s, "/") return parts[0] } // unifyDigits replaces all digit groups with a hopefully rare placeholder, // e.g. ""; This is for discovering very similar, yet different // publications, where e.g. titles differ only by a single or few chars // representing a year. Examples are yearly publications, e.g. "World Health // Report 2020", "World Health Report 2021", ... where any plain similarity // score would yield a high number, yet publications are obviously not the // same. func unifyDigits(s string) string { return PatDigits.ReplaceAllString(s, "") } // looksLikeFilename returns true, if the given string could be a filename. func looksLikeFilename(s string) bool { if len(strings.Fields(s)) > 1 { return false } return PatFilenameLike.MatchString(s) } // mapString applies a function on each element of a string slice. func mapString(f func(string) string, vs []string) (result []string) { for _, v := range vs { result = append(result, f(v)) } return result } // absInt returns the absolute value of an int. func absInt(v int) int { if v < 0 { return -v } return v } // looksLikeComponent returns true, if either a looks like a component of b, or vice versa. func looksLikeComponent(a, b string) bool { ac := strings.Split(a, ".") bc := strings.Split(b, ".") if len(ac) > 1 { if strings.Join(ac[0:len(ac)-1], ".") == b { return true } } if len(bc) > 1 { if strings.Join(bc[0:len(bc)-1], ".") == a { return true } } return false }