aboutsummaryrefslogtreecommitdiffstats
path: root/skate/verify.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/verify.go')
-rw-r--r--skate/verify.go735
1 files changed, 735 insertions, 0 deletions
diff --git a/skate/verify.go b/skate/verify.go
new file mode 100644
index 0000000..cd40279
--- /dev/null
+++ b/skate/verify.go
@@ -0,0 +1,735 @@
+// TODO: The various grouping and verification functions should probably be in
+// a separate file and it should be obvious how to adjust or write a new one.
+
+//go:generate stringer -type=Status,Reason -output verify_string.go verify.go
+package skate
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "regexp"
+ "strconv"
+ "strings"
+ "unicode/utf8"
+
+ "git.archive.org/martin/cgraph/skate/set"
+ "git.archive.org/martin/cgraph/skate/zipkey"
+)
+
+// This file is a port of fuzzycat.verify to Go.
+
// Status and Reason string representations are generated by stringer (see
// the go:generate directive above); Short() strips the type name prefix.
type (
	// Status represents match strength.
	Status int
	// Reason gives more context to a status result.
	Reason int
)
+
const (
	// Match strength, from unknown to exact; carried in MatchResult.
	StatusUnknown Status = iota
	StatusExact
	StatusStrong
	StatusWeak
	StatusDifferent
	StatusAmbiguous

	// Reasons justifying a status, mostly mirroring fuzzycat (Python).
	//
	// NOTE(review): iota keeps counting within a single const block, so
	// ReasonUnknown is 6, not 0 -- the zero value of Reason is therefore not
	// ReasonUnknown. Harmless as long as Reason values are only ever assigned
	// from these named constants; do not renumber without regenerating the
	// stringer output (verify_string.go depends on the current values).
	ReasonUnknown Reason = iota
	ReasonAppendix
	ReasonArxiv
	ReasonArxivVersion
	ReasonBlacklisted
	ReasonBlacklistedFragment
	ReasonBookChapter
	ReasonChemFormula
	ReasonComponent
	ReasonContainer
	ReasonContainerNameBlacklist
	ReasonContribIntersectionEmpty
	ReasonCustomBSISubdoc
	ReasonCustomBSIUndated
	ReasonCustomIEEEArxiv
	ReasonCustomIOPMAPattern
	ReasonCustomPrefix1014288
	ReasonCustomPrefix105860ChoiceReview
	ReasonCustomPrefix107916
	ReasonCustomVHS
	ReasonDOI
	ReasonDataciteRelatedID
	ReasonDataciteVersion
	ReasonDatasetDOI
	ReasonFigshareVersion
	ReasonJaccardAuthors
	ReasonJstorID
	ReasonMaxClusterSizeExceeded
	ReasonNumDiff
	ReasonPMCID
	ReasonPMID
	ReasonPMIDDOIPair
	ReasonPageCount
	ReasonPreprintPublished
	ReasonPublisherBlacklist
	ReasonReleaseType
	ReasonSharedDOIPrefix
	ReasonShortTitle
	ReasonSingularCluster
	ReasonSlugTitleAuthorMatch
	ReasonSubtitle
	ReasonTitleArtifact
	ReasonTitleAuthorMatch
	ReasonTitleFilename
	ReasonTokenizedAuthors
	ReasonVersionedDOI
	ReasonWorkID
	ReasonYear
)
+
+// Short name.
+func (s Status) Short() string {
+ return strings.ToLower(strings.Replace(s.String(), "Status", "", 1))
+}
+
+// Short name.
+func (r Reason) Short() string {
+ return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1))
+}
+
// MatchResult is the result of a verification of two releases.
type MatchResult struct {
	Status Status // match strength, e.g. StatusExact
	Reason Reason // justification for the status, e.g. ReasonDOI
}
+
var (
	// PatAppendix matches appendix-style title endings.
	PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`)
	// PatFigshareVersion matches a figshare version suffix, e.g. ".v2".
	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
	// PatVersionedDOI matches a DOI ending in a "/vN" version component.
	PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
	// PatArxivVersion captures an arxiv id without its trailing "vN" suffix.
	PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
	// PatFilenameLike matches strings ending in a short, extension-like suffix.
	PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
	// PatDigits matches any run of digits.
	PatDigits = regexp.MustCompile(`\d+`)
	// PatPages captures start and end of a page range, e.g. "123-130".
	PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
)
+
+// XXX: add all pairs verification (e.g. self-match).
+
+// RefCluster deserialized a single cluster document and returns a tabular file
+// with identifiers, match status and reason.
+func RefCluster(p []byte) ([]byte, error) {
+ var (
+ cr *ClusterResult
+ buf bytes.Buffer
+ )
+ if err := json.Unmarshal(p, &cr); err != nil {
+ return nil, err
+ }
+ pivot, err := cr.OneNonRef()
+ if err != nil {
+ return nil, err
+ }
+ for _, re := range cr.Values {
+ if re.Extra.Skate.Status != "ref" {
+ continue
+ }
+ result := Verify(pivot, re, 5)
+ if _, err := fmt.Fprintf(&buf, "%s %s %s %s\n",
+ pivot.Ident, re.Ident, result.Status, result.Reason); err != nil {
+ return nil, err
+ }
+ // XXX: We can generate a biblioref here, too.
+ }
+ return buf.Bytes(), nil
+}
+
+// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches.
+func RefClusterToBiblioRef(p []byte) ([]byte, error) {
+ var (
+ cr *ClusterResult
+ br *BiblioRef
+ buf bytes.Buffer
+ )
+ if err := json.Unmarshal(p, &cr); err != nil {
+ return nil, err
+ }
+ pivot, err := cr.OneNonRef()
+ if err != nil {
+ return nil, err
+ }
+ for _, re := range cr.Values {
+ if re.Extra.Skate.Status != "ref" {
+ continue
+ }
+ result := Verify(pivot, re, 5)
+ switch result.Status {
+ case StatusExact, StatusStrong:
+ if result.Reason == ReasonDOI {
+ // Assume we already have the DOI matches.
+ continue
+ }
+ br = generateBiblioRef(re, pivot, result.Status, result.Reason, "fuzzy")
+ b, err := json.Marshal(br)
+ if err != nil {
+ return nil, err
+ }
+ b = append(b, []byte("\n")...)
+ return b, nil
+ default:
+ continue
+ }
+ }
+ return buf.Bytes(), nil
+}
+
+// generateBiblioRef generates a bibliographic schema document.
+func generateBiblioRef(source, target *Release, matchStatus Status, matchReason Reason, provenance string) *BiblioRef {
+ var bref BiblioRef
+ bref.SourceReleaseIdent = source.Ident
+ bref.SourceWorkIdent = source.WorkID
+ bref.SourceReleaseStage = source.ReleaseStage
+ if source.ReleaseYear() > 1000 {
+ bref.SourceYear = source.ReleaseYearString()
+ }
+ bref.RefIndex = source.Extra.Skate.Ref.Index
+ bref.RefKey = source.Extra.Skate.Ref.Key
+ bref.TargetReleaseIdent = target.Ident
+ bref.TargetWorkIdent = target.WorkID
+ bref.MatchProvenance = provenance
+ bref.MatchStatus = matchStatus.Short()
+ bref.MatchReason = matchReason.Short()
+ return &bref
+}
+
// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc)
// and assigns a fixed match result to every (ref, release) pair sharing a
// key; no verification is performed. One BiblioRef JSON document per pair is
// written to w.
func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error {
	// Define a grouper, working on one set of refs and releases with the same
	// key at a time. Here, we just emit a biblioref per pair, carrying the
	// given, fixed match result.
	enc := json.NewEncoder(w)
	// keyer extracts the join key from column 2 of a tsv line; both input
	// streams are expected to be sorted by that key.
	keyer := func(s string) (string, error) {
		if k := lineColumn(s, "\t", 2); k == "" {
			return k, fmt.Errorf("cannot get key: %s", s)
		} else {
			return k, nil
		}
	}
	grouper := func(g *zipkey.Group) error {
		// Need at least one release (G0) and one ref (G1) in the group.
		if len(g.G0) == 0 || len(g.G1) == 0 {
			return nil
		}
		// The target (cited) release is parsed from the first release line;
		// the JSON document lives in column 3.
		target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
		if err != nil {
			return err
		}
		for _, line := range g.G1 {
			ref, err := stringToRef(lineColumn(line, "\t", 3))
			if err != nil {
				return err
			}
			var bref BiblioRef
			bref.SourceReleaseIdent = ref.ReleaseIdent
			bref.SourceWorkIdent = ref.WorkIdent
			bref.SourceReleaseStage = ref.ReleaseStage
			// NOTE(review): the year is written unconditionally here, while
			// generateBiblioRef guards with > 1000 -- confirm this asymmetry
			// is intended.
			bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
			bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
			bref.RefKey = ref.Key
			bref.TargetReleaseIdent = target.Ident
			bref.TargetWorkIdent = target.WorkID
			bref.MatchProvenance = provenance
			bref.MatchStatus = mr.Status.Short()
			bref.MatchReason = mr.Reason.Short()
			if err := enc.Encode(bref); err != nil {
				return err
			}
		}
		return nil
	}
	zipper := zipkey.New(releases, refs, keyer, grouper)
	return zipper.Run()
}
+
// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc),
// fuzzy-verifies each ref candidate in a group against the group's pivot
// release, and writes a BiblioRef JSON document to w for each exact or
// strong, non-DOI match.
func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
	// Define a grouper, working on one set of refs and releases with the same
	// key at a time. Here, we do verification and write out the generated
	// biblioref.
	enc := json.NewEncoder(w)
	// keyer extracts the join key from column 2 of a tsv line; both input
	// streams are expected to be sorted by that key.
	keyer := func(s string) (string, error) {
		if k := lineColumn(s, "\t", 2); k == "" {
			return k, fmt.Errorf("cannot get key: %s", s)
		} else {
			return k, nil
		}
	}
	grouper := func(g *zipkey.Group) error {
		// Need at least one release (G0) and one ref (G1) in the group.
		if len(g.G0) == 0 || len(g.G1) == 0 {
			return nil
		}
		// The pivot release (column 3 holds the JSON document).
		pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
		if err != nil {
			return err
		}
		for _, line := range g.G1 {
			re, err := stringToRelease(lineColumn(line, "\t", 3))
			if err != nil {
				return err
			}
			result := Verify(pivot, re, 5)
			switch result.Status {
			case StatusExact, StatusStrong:
				if result.Reason == ReasonDOI {
					// Assume DOI matches are handled elsewhere.
					continue
				}
				br := generateBiblioRef(re, pivot, result.Status, result.Reason, "fuzzy")
				if err := enc.Encode(br); err != nil {
					return err
				}
			}
		}
		return nil
	}
	zipper := zipkey.New(releases, refs, keyer, grouper)
	return zipper.Run()
}
+
// lineColumn returns a specific column (1-indexed, like cut) from a tabular
// line, or the empty string if the column does not exist. The line is
// trimmed of surrounding whitespace before splitting.
func lineColumn(line, sep string, column int) string {
	fields := strings.Split(strings.TrimSpace(line), sep)
	if column > len(fields) {
		return ""
	}
	return fields[column-1]
}
+
+func stringToRelease(s string) (r *Release, err error) {
+ err = json.Unmarshal([]byte(s), &r)
+ return
+}
+
+func stringToRef(s string) (r *Ref, err error) {
+ err = json.Unmarshal([]byte(s), &r)
+ return
+}
+
+// Verify follows the fuzzycat (Python) implementation of this function: it
+// compares two release entities. The Go version can be used for large batch
+// processing (where the Python version might take two or more days).
+func Verify(a, b *Release, minTitleLength int) MatchResult {
+ if a.ExtIDs.DOI != "" && a.ExtIDs.DOI == b.ExtIDs.DOI {
+ return MatchResult{StatusExact, ReasonDOI}
+ }
+ if a.WorkID != "" && a.WorkID == b.WorkID {
+ return MatchResult{StatusExact, ReasonWorkID}
+ }
+ aTitleLower := strings.ToLower(a.Title)
+ bTitleLower := strings.ToLower(b.Title)
+ if utf8.RuneCountInString(a.Title) < minTitleLength {
+ return MatchResult{StatusAmbiguous, ReasonShortTitle}
+ }
+ if BlacklistTitle.Contains(aTitleLower) {
+ return MatchResult{StatusAmbiguous, ReasonBlacklisted}
+ }
+ if BlacklistTitle.Contains(bTitleLower) {
+ return MatchResult{StatusAmbiguous, ReasonBlacklisted}
+ }
+ for _, fragment := range BlacklistTitleFragments.Slice() {
+ if strings.Contains(aTitleLower, fragment) {
+ return MatchResult{StatusAmbiguous, ReasonBlacklistedFragment}
+ }
+ }
+ if strings.Contains(aTitleLower, "subject index") && strings.Contains(bTitleLower, "subject index") {
+ if a.ContainerID != "" && a.ContainerID != b.ContainerID {
+ return MatchResult{StatusDifferent, ReasonContainer}
+ }
+ }
+ if a.Title != "" && a.Title == b.Title &&
+ a.Extra.DataCite.MetadataVersion > 0 && b.Extra.DataCite.MetadataVersion > 0 &&
+ a.Extra.DataCite.MetadataVersion != b.Extra.DataCite.MetadataVersion {
+ return MatchResult{StatusExact, ReasonDataciteVersion}
+ }
+ if strings.HasPrefix(a.ExtIDs.DOI, "10.14288/") && strings.HasPrefix(b.ExtIDs.DOI, "10.14288/") &&
+ a.ExtIDs.DOI != b.ExtIDs.DOI {
+ return MatchResult{StatusDifferent, ReasonCustomPrefix1014288}
+ }
+ if strings.HasPrefix(a.ExtIDs.DOI, "10.3403") && strings.HasPrefix(b.ExtIDs.DOI, "10.3403") {
+ if a.ExtIDs.DOI+"u" == b.ExtIDs.DOI || b.ExtIDs.DOI+"u" == a.ExtIDs.DOI {
+ return MatchResult{StatusStrong, ReasonCustomBSIUndated}
+ }
+ aSubtitle := a.Subtitle()
+ bSubtitle := b.Subtitle()
+ if a.Title != "" && a.Title == b.Title &&
+ ((len(aSubtitle) > 0 && aSubtitle[0] != "" && len(bSubtitle) == 0) ||
+ (len(aSubtitle) == 0 && len(bSubtitle) > 0 && bSubtitle[0] != "")) {
+ return MatchResult{StatusStrong, ReasonCustomBSISubdoc}
+ }
+ }
+ if strings.HasPrefix(a.ExtIDs.DOI, "10.1149") && strings.HasPrefix(b.ExtIDs.DOI, "10.1149") {
+ v := "10.1149/ma"
+ if (strings.HasPrefix(a.ExtIDs.DOI, v) && !strings.HasPrefix(b.ExtIDs.DOI, v)) ||
+ (!strings.HasPrefix(a.ExtIDs.DOI, v) && strings.HasPrefix(b.ExtIDs.DOI, v)) {
+ return MatchResult{StatusDifferent, ReasonCustomIOPMAPattern}
+ }
+ }
+ if strings.Contains(a.Title, "Zweckverband Volkshochschule") && a.Title != b.Title {
+ return MatchResult{StatusDifferent, ReasonCustomVHS}
+ }
+ if PatAppendix.MatchString(a.Title) {
+ return MatchResult{StatusAmbiguous, ReasonAppendix}
+ }
+ if strings.HasPrefix(a.ExtIDs.DOI, "10.6084/") && strings.HasPrefix(b.ExtIDs.DOI, "10.6084/") {
+ av := PatFigshareVersion.ReplaceAllString(a.ExtIDs.DOI, "")
+ bv := PatFigshareVersion.ReplaceAllString(b.ExtIDs.DOI, "")
+ if av == bv {
+ return MatchResult{StatusStrong, ReasonFigshareVersion}
+ }
+ }
+ if PatVersionedDOI.MatchString(a.ExtIDs.DOI) && PatVersionedDOI.MatchString(b.ExtIDs.DOI) {
+ return MatchResult{StatusStrong, ReasonVersionedDOI}
+ }
+ if looksLikeComponent(a.ExtIDs.DOI, b.ExtIDs.DOI) {
+ return MatchResult{StatusStrong, ReasonVersionedDOI}
+ }
+ if len(a.Extra.DataCite.Relations) > 0 || len(b.Extra.DataCite.Relations) > 0 {
+ getRelatedDOI := func(rel *Release) *set.Set {
+ ss := set.New()
+ for _, rel := range rel.Extra.DataCite.Relations {
+ if strings.ToLower(rel.RelatedIdentifierType) != "doi" {
+ continue
+ }
+ ss.Add(rel.RelatedIdentifier())
+ }
+ return ss
+ }
+ aRelated := getRelatedDOI(a)
+ bRelated := getRelatedDOI(b)
+ if aRelated.Contains(b.ExtIDs.DOI) || bRelated.Contains(a.ExtIDs.DOI) {
+ return MatchResult{StatusStrong, ReasonDataciteRelatedID}
+ }
+ }
+ if a.ExtIDs.Arxiv != "" && b.ExtIDs.Arxiv != "" {
+ aSub := PatArxivVersion.FindStringSubmatch(a.ExtIDs.Arxiv)
+ bSub := PatArxivVersion.FindStringSubmatch(b.ExtIDs.Arxiv)
+ if len(aSub) == 2 && len(bSub) == 2 && aSub[1] == bSub[1] {
+ return MatchResult{StatusStrong, ReasonArxivVersion}
+ }
+ }
+ if a.ReleaseType != b.ReleaseType {
+ types := set.FromSlice([]string{a.ReleaseType, b.ReleaseType})
+ ignoreTypes := set.FromSlice([]string{"article", "article-journal", "report", "paper-conference"})
+ if types.Intersection(ignoreTypes).IsEmpty() {
+ return MatchResult{StatusDifferent, ReasonReleaseType}
+ }
+ if types.Contains("dataset") && (types.Contains("article") || types.Contains("article-journal")) {
+ return MatchResult{StatusDifferent, ReasonReleaseType}
+ }
+ if types.Contains("book") && (types.Contains("article") || types.Contains("article-journal")) {
+ return MatchResult{StatusDifferent, ReasonReleaseType}
+ }
+ }
+ if a.ReleaseType == "dataset" && b.ReleaseType == "dataset" && a.ExtIDs.DOI != b.ExtIDs.DOI {
+ return MatchResult{StatusDifferent, ReasonDatasetDOI}
+ }
+ if a.ReleaseType == "chapter" && b.ReleaseType == "chapter" &&
+ a.Extra.ContainerName != "" && a.Extra.ContainerName != b.Extra.ContainerName {
+ return MatchResult{StatusDifferent, ReasonBookChapter}
+ }
+ if a.Extra.Crossref.Type == "component" && a.Title != b.Title {
+ return MatchResult{StatusDifferent, ReasonComponent}
+ }
+ if a.ReleaseType == "component" && b.ReleaseType == "component" {
+ if a.ExtIDs.DOI != "" && a.ExtIDs.DOI != b.ExtIDs.DOI {
+ return MatchResult{StatusDifferent, ReasonComponent}
+ }
+ }
+ aSlugTitle := strings.TrimSpace(strings.Replace(slugifyString(a.Title), "\n", " ", -1))
+ bSlugTitle := strings.TrimSpace(strings.Replace(slugifyString(b.Title), "\n", " ", -1))
+
+ if aSlugTitle == bSlugTitle {
+ if a.ReleaseYear() != 0 && b.ReleaseYear() != 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 40 {
+ return MatchResult{StatusDifferent, ReasonYear}
+ }
+ }
+ if aSlugTitle == bSlugTitle {
+ ieeeArxivCheck := func(a, b *Release) (ok bool) {
+ return doiPrefix(a.ExtIDs.DOI) == "10.1109" && b.ExtIDs.Arxiv != ""
+ }
+ if ieeeArxivCheck(a, b) || ieeeArxivCheck(b, a) {
+ return MatchResult{StatusStrong, ReasonCustomIEEEArxiv}
+ }
+ }
+ if aSlugTitle == bSlugTitle {
+ if strings.HasPrefix(a.ExtIDs.DOI, "10.7916/") && strings.HasPrefix(b.ExtIDs.DOI, "10.7916/") {
+ return MatchResult{StatusAmbiguous, ReasonCustomPrefix107916}
+ }
+ }
+ if aSlugTitle == bSlugTitle {
+ aSubtitle := a.Subtitle()
+ bSubtitle := b.Subtitle()
+ for _, aSub := range aSubtitle {
+ for _, bSub := range bSubtitle {
+ if slugifyString(aSub) != slugifyString(bSub) {
+ return MatchResult{StatusDifferent, ReasonSubtitle}
+ }
+ }
+ }
+ }
+ rawAuthors := func(rel *Release) (names []string) {
+ for _, c := range rel.Contribs {
+ name := strings.TrimSpace(c.RawName)
+ if name == "" {
+ continue
+ }
+ names = append(names, name)
+ }
+ return names
+ }
+ aAuthors := set.FromSlice(rawAuthors(a))
+ bAuthors := set.FromSlice(rawAuthors(b))
+ aSlugAuthors := set.FromSlice(mapString(slugifyString, aAuthors.Slice()))
+ bSlugAuthors := set.FromSlice(mapString(slugifyString, bAuthors.Slice()))
+ if aTitleLower == bTitleLower {
+ if aAuthors.Len() > 0 && aAuthors.Equals(bAuthors) {
+ if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 4 {
+ return MatchResult{StatusDifferent, ReasonYear}
+ }
+ return MatchResult{StatusExact, ReasonTitleAuthorMatch}
+ }
+ }
+ if looksLikeFilename(a.Title) || looksLikeFilename(b.Title) {
+ if a.Title != b.Title {
+ return MatchResult{StatusDifferent, ReasonTitleFilename}
+ }
+ }
+ if a.Title != "" && a.Title == b.Title {
+ if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 2 {
+ return MatchResult{StatusDifferent, ReasonYear}
+ }
+ }
+ // XXX: skipping chemical formula detection (to few cases; https://git.io/Jtdax)
+ if len(aSlugTitle) < 10 && aSlugTitle != bSlugTitle {
+ return MatchResult{StatusAmbiguous, ReasonShortTitle}
+ }
+ if PatDigits.MatchString(aSlugTitle) &&
+ aSlugTitle != bSlugTitle &&
+ unifyDigits(aSlugTitle) == unifyDigits(bSlugTitle) {
+ return MatchResult{StatusDifferent, ReasonNumDiff}
+ }
+ if aSlugTitle != "" && bSlugTitle != "" &&
+ strings.ReplaceAll(aSlugTitle, " ", "") == strings.ReplaceAll(bSlugTitle, " ", "") {
+ if aSlugAuthors.Intersection(bSlugAuthors).Len() > 0 {
+ if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 4 {
+ return MatchResult{StatusDifferent, ReasonYear}
+ }
+ return MatchResult{StatusStrong, ReasonSlugTitleAuthorMatch}
+ }
+ }
+ if a.ReleaseYear() > 0 && a.ReleaseYear() == b.ReleaseYear() && aTitleLower == bTitleLower {
+ if (a.ExtIDs.PMID != "" && b.ExtIDs.DOI != "") || (b.ExtIDs.PMID != "" && a.ExtIDs.DOI != "") {
+ return MatchResult{StatusStrong, ReasonPMIDDOIPair}
+ }
+ }
+ if a.ExtIDs.Jstor != "" && b.ExtIDs.Jstor != "" && a.ExtIDs.Jstor != b.ExtIDs.Jstor {
+ return MatchResult{StatusDifferent, ReasonJstorID}
+ }
+ if a.ContainerID != "" && a.ContainerID == b.ContainerID && a.ExtIDs.DOI != b.ExtIDs.DOI &&
+ doiPrefix(a.ExtIDs.DOI) != "10.1126" &&
+ doiPrefix(a.ExtIDs.DOI) == doiPrefix(b.ExtIDs.DOI) {
+ return MatchResult{StatusDifferent, ReasonSharedDOIPrefix}
+ }
+ if aAuthors.Len() > 0 && aSlugAuthors.Intersection(bSlugAuthors).IsEmpty() {
+ numAuthors := set.Min(aSlugAuthors, bSlugAuthors)
+ score := averageScore(aSlugAuthors, bSlugAuthors)
+ if (numAuthors < 3 && score > 0.9) || (numAuthors >= 3 && score > 0.5) {
+ return MatchResult{StatusStrong, ReasonTokenizedAuthors}
+ }
+ aTok := set.FromSlice(strings.Fields(aSlugAuthors.Join(" ")))
+ bTok := set.FromSlice(strings.Fields(bSlugAuthors.Join(" ")))
+ aTok = set.Filter(aTok, func(s string) bool {
+ return len(s) > 2
+ })
+ bTok = set.Filter(bTok, func(s string) bool {
+ return len(s) > 2
+ })
+ if aTok.Len() > 0 && bTok.Len() > 0 {
+ if aTok.Jaccard(bTok) > 0.35 {
+ return MatchResult{StatusStrong, ReasonJaccardAuthors}
+ }
+ }
+ return MatchResult{StatusDifferent, ReasonContribIntersectionEmpty}
+ }
+ if doiPrefix(a.ExtIDs.DOI) == "10.5860" || doiPrefix(b.ExtIDs.DOI) == "10.5860" {
+ return MatchResult{StatusAmbiguous, ReasonCustomPrefix105860ChoiceReview}
+ }
+ // XXX: parse pages
+ aParsedPages := parsePageString(a.Pages)
+ bParsedPages := parsePageString(b.Pages)
+ if aParsedPages.Err != nil && bParsedPages.Err != nil {
+ if absInt(aParsedPages.Count()-bParsedPages.Count()) > 5 {
+ return MatchResult{StatusDifferent, ReasonPageCount}
+ }
+ }
+ if aAuthors.Equals(bAuthors) &&
+ a.ContainerID == b.ContainerID &&
+ a.ReleaseYear() == b.ReleaseYear() &&
+ a.Title != b.Title &&
+ (strings.Contains(a.Title, b.Title) || strings.Contains(b.Title, a.Title)) {
+ return MatchResult{StatusStrong, ReasonTitleArtifact}
+ }
+ return MatchResult{
+ StatusAmbiguous,
+ ReasonUnknown,
+ }
+}
+
// ParsedPages is the result of parsing a page range string, e.g. "123-130".
type ParsedPages struct {
	Start int   // first page
	End   int   // last page
	Err   error // non-nil, if the page string could not be parsed
}

// Count returns the number of pages in the range, both endpoints included.
func (pp *ParsedPages) Count() int {
	return 1 + pp.End - pp.Start
}
+
// parsePageString parses a page range, e.g. "123-130", from a string. Any
// failure is reported via the Err field of the returned value (never via a
// second return value), so callers must check Err before using Start or End.
func parsePageString(s string) *ParsedPages {
	s = strings.TrimSpace(s)
	var pp = ParsedPages{}
	if len(s) == 0 {
		pp.Err = fmt.Errorf("parse pages: empty string")
		return &pp
	}
	// PatPages captures start and end page groups.
	matches := PatPages.FindStringSubmatch(s)
	if len(matches) != 3 {
		pp.Err = fmt.Errorf("parse pages: no page pattern")
		return &pp
	}
	start, end := matches[1], matches[2]
	// Expand a shortened end page, e.g. "123-7" means "123-127"; the byte
	// comparison is safe here, since both sides are ASCII digits.
	if len(end) == 1 && len(start) > 1 && start[len(start)-1] < end[0] {
		end = fmt.Sprintf("%s%c", start[:len(start)-1], end[0])
	}
	if pp.Start, pp.Err = strconv.Atoi(start); pp.Err != nil {
		return &pp
	}
	if pp.End, pp.Err = strconv.Atoi(end); pp.Err != nil {
		return &pp
	}
	// Descending ranges are treated as parse failures.
	if pp.Start > pp.End {
		pp.Err = fmt.Errorf("invalid page count: %s", s)
	}
	return &pp
}
+
+// averageScore take a limited set of authors and calculates pairwise
+// similarity scores, then returns the average of the best scores; between 0
+// and 1.
+func averageScore(a, b *set.Set) float64 {
+ aTrimmed := a.TopK(5)
+ bTrimmed := b.TopK(5)
+ maxScores := make(map[string]float64) // For each a, keep the max.
+ for _, pair := range aTrimmed.Product(bTrimmed) {
+ a, b := pair[0], pair[1]
+ score := authorSimilarityScore(a, b)
+ if v, ok := maxScores[a]; !ok || score > v {
+ maxScores[a] = score
+ }
+ }
+ var sum, avg float64
+ for _, v := range maxScores {
+ sum += v
+ }
+ avg = sum / float64(len(maxScores))
+ return avg
+}
+
+// authorSimilarityScore is a hacky similarity score.
+func authorSimilarityScore(s, t string) float64 {
+ ss := set.FromSlice(tokenNgrams(s, 2))
+ ts := set.FromSlice(tokenNgrams(t, 2))
+ return ss.Jaccard(ts)
+}
+
// tokenNgrams returns groups of up to n characters per token in s, e.g. for
// n=2 and string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"].
//
// NOTE(review): the chunk boundary uses the byte offset of the range loop
// (i%n), so a multi-byte (non-ASCII) rune advances the counter by more than
// one; grouping for non-ASCII names therefore differs from rune-wise ngrams
// -- confirm this is acceptable for similarity scoring.
func tokenNgrams(s string, n int) (result []string) {
	var buf bytes.Buffer
	for _, token := range tokenizeString(s) {
		buf.Reset()
		for i, c := range token {
			if i > 0 && i%n == 0 {
				// Flush the current group and start a new one.
				result = append(result, buf.String())
				buf.Reset()
			}
			buf.WriteRune(c) // XXX: skipping error handling
		}
		// The (never empty) remainder becomes the final group of the token.
		result = append(result, buf.String())
	}
	return
}
+
// tokenizeString splits a string into lowercase tokens on whitespace.
func tokenizeString(s string) []string {
	tokens := strings.Fields(s)
	for i, tok := range tokens {
		tokens[i] = strings.ToLower(tok)
	}
	return tokens
}
+
// doiPrefix returns the DOI prefix, i.e. everything before the first slash;
// the whole string, if it contains no slash.
func doiPrefix(s string) string {
	if i := strings.Index(s, "/"); i != -1 {
		return s[:i]
	}
	return s
}
+
+// unifyDigits replaces all digit groups with a placeholder, e.g. "<NUM>".
+func unifyDigits(s string) string {
+ return PatDigits.ReplaceAllString(s, "<NUM>")
+}
+
+// looksLikeFilename returns true, if the given string could be a filename.
+func looksLikeFilename(s string) bool {
+ if len(strings.Fields(s)) > 1 {
+ return false
+ }
+ return PatFilenameLike.MatchString(s)
+}
+
// mapString applies a function to each element of a string slice and returns
// a new slice of the results; a nil or empty input yields nil.
func mapString(f func(string) string, vs []string) []string {
	if len(vs) == 0 {
		return nil
	}
	mapped := make([]string, len(vs))
	for i, v := range vs {
		mapped[i] = f(v)
	}
	return mapped
}
+
// absInt returns the absolute value of an int.
func absInt(v int) int {
	if v >= 0 {
		return v
	}
	return -v
}
+
// slugifyString is a basic string slugifier: it lowercases the input, keeps
// only ASCII lowercase letters, digits and whitespace, and collapses all
// whitespace runs into single spaces.
func slugifyString(s string) string {
	var b strings.Builder
	for _, r := range strings.TrimSpace(strings.ToLower(s)) {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == ' ', r == '\t', r == '\n':
			b.WriteRune(r)
		}
	}
	return strings.Join(strings.Fields(b.String()), " ")
}
+
// looksLikeComponent returns true, if either DOI looks like a component of
// the other, i.e. one string equals the other with its last dot separated
// segment removed (e.g. "10.123/x.s001" vs "10.123/x").
func looksLikeComponent(a, b string) bool {
	// parent returns the string up to the last dot; ok is false, if there is
	// no dot at all.
	parent := func(s string) (head string, ok bool) {
		i := strings.LastIndex(s, ".")
		if i == -1 {
			return "", false
		}
		return s[:i], true
	}
	if head, ok := parent(a); ok && head == b {
		return true
	}
	if head, ok := parent(b); ok && head == a {
		return true
	}
	return false
}