1 files changed, 735 insertions, 0 deletions
diff --git a/skate/verify.go b/skate/verify.go
new file mode 100644
index 0000000..cd40279
--- /dev/null
+++ b/skate/verify.go
@@ -0,0 +1,735 @@
+// TODO: The various grouping and verification functions should probably be in
+// a separate file and it should be obvious how to adjust or write a new one.
+
+//go:generate stringer -type=Status,Reason -output verify_string.go verify.go
+package skate
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"regexp"
+	"strconv"
+	"strings"
+	"unicode/utf8"
+
+	"git.archive.org/martin/cgraph/skate/set"
+	"git.archive.org/martin/cgraph/skate/zipkey"
+)
+
+// This file is a port of fuzzycat.verify to Go.
+
+type (
+	// Status represents match strength.
+	Status int
+	// Reason gives more context to status result.
+	Reason int
+)
+
+const (
+	StatusUnknown Status = iota
+	StatusExact
+	StatusStrong
+	StatusWeak
+	StatusDifferent
+	StatusAmbiguous
+
+	ReasonUnknown Reason = iota
+	ReasonAppendix
+	ReasonArxiv
+	ReasonArxivVersion
+	ReasonBlacklisted
+	ReasonBlacklistedFragment
+	ReasonBookChapter
+	ReasonChemFormula
+	ReasonComponent
+	ReasonContainer
+	ReasonContainerNameBlacklist
+	ReasonContribIntersectionEmpty
+	ReasonCustomBSISubdoc
+	ReasonCustomBSIUndated
+	ReasonCustomIEEEArxiv
+	ReasonCustomIOPMAPattern
+	ReasonCustomPrefix1014288
+	ReasonCustomPrefix105860ChoiceReview
+	ReasonCustomPrefix107916
+	ReasonCustomVHS
+	ReasonDOI
+	ReasonDataciteRelatedID
+	ReasonDataciteVersion
+	ReasonDatasetDOI
+	ReasonFigshareVersion
+	ReasonJaccardAuthors
+	ReasonJstorID
+	ReasonMaxClusterSizeExceeded
+	ReasonNumDiff
+	ReasonPMCID
+	ReasonPMID
+	ReasonPMIDDOIPair
+	ReasonPageCount
+	ReasonPreprintPublished
+	ReasonPublisherBlacklist
+	ReasonReleaseType
+	ReasonSharedDOIPrefix
+	ReasonShortTitle
+	ReasonSingularCluster
+	ReasonSlugTitleAuthorMatch
+	ReasonSubtitle
+	ReasonTitleArtifact
+	ReasonTitleAuthorMatch
+	ReasonTitleFilename
+	ReasonTokenizedAuthors
+	ReasonVersionedDOI
+	ReasonWorkID
+	ReasonYear
+)
+
+// Short name.
+func (s Status) Short() string {
+	return strings.ToLower(strings.Replace(s.String(), "Status", "", 1))
+}
+
+// Short name.
+func (r Reason) Short() string {
+	return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1))
+}
+
+// MatchResult is the result of a verification.
+type MatchResult struct {
+	Status Status
+	Reason Reason
+}
+
+var (
+	PatAppendix        = regexp.MustCompile(`appendix ?[^ ]*$`)
+	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
+	PatVersionedDOI    = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
+	PatArxivVersion    = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
+	PatFilenameLike    = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
+	PatDigits          = regexp.MustCompile(`\d+`)
+	PatPages           = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
+)
+
+// XXX: add all pairs verification (e.g. self-match).
+
+// RefCluster deserialized a single cluster document and returns a tabular file
+// with identifiers, match status and reason.
+func RefCluster(p []byte) ([]byte, error) {
+	var (
+		cr  *ClusterResult
+		buf bytes.Buffer
+	)
+	if err := json.Unmarshal(p, &cr); err != nil {
+		return nil, err
+	}
+	pivot, err := cr.OneNonRef()
+	if err != nil {
+		return nil, err
+	}
+	for _, re := range cr.Values {
+		if re.Extra.Skate.Status != "ref" {
+			continue
+		}
+		result := Verify(pivot, re, 5)
+		if _, err := fmt.Fprintf(&buf, "%s %s %s %s\n",
+			pivot.Ident, re.Ident, result.Status, result.Reason); err != nil {
+			return nil, err
+		}
+		// XXX: We can generate a biblioref here, too.
+	}
+	return buf.Bytes(), nil
+}
+
+// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches.
+func RefClusterToBiblioRef(p []byte) ([]byte, error) {
+	var (
+		cr  *ClusterResult
+		br  *BiblioRef
+		buf bytes.Buffer
+	)
+	if err := json.Unmarshal(p, &cr); err != nil {
+		return nil, err
+	}
+	pivot, err := cr.OneNonRef()
+	if err != nil {
+		return nil, err
+	}
+	for _, re := range cr.Values {
+		if re.Extra.Skate.Status != "ref" {
+			continue
+		}
+		result := Verify(pivot, re, 5)
+		switch result.Status {
+		case StatusExact, StatusStrong:
+			if result.Reason == ReasonDOI {
+				// Assume we already have the DOI matches.
+				continue
+			}
+			br = generateBiblioRef(re, pivot, result.Status, result.Reason, "fuzzy")
+			b, err := json.Marshal(br)
+			if err != nil {
+				return nil, err
+			}
+			b = append(b, []byte("\n")...)
+			return b, nil
+		default:
+			continue
+		}
+	}
+	return buf.Bytes(), nil
+}
+
+// generateBiblioRef generates a bibliographic schema document.
+func generateBiblioRef(source, target *Release, matchStatus Status, matchReason Reason, provenance string) *BiblioRef {
+	var bref BiblioRef
+	bref.SourceReleaseIdent = source.Ident
+	bref.SourceWorkIdent = source.WorkID
+	bref.SourceReleaseStage = source.ReleaseStage
+	if source.ReleaseYear() > 1000 {
+		bref.SourceYear = source.ReleaseYearString()
+	}
+	bref.RefIndex = source.Extra.Skate.Ref.Index
+	bref.RefKey = source.Extra.Skate.Ref.Key
+	bref.TargetReleaseIdent = target.Ident
+	bref.TargetWorkIdent = target.WorkID
+	bref.MatchProvenance = provenance
+	bref.MatchStatus = matchStatus.Short()
+	bref.MatchReason = matchReason.Short()
+	return &bref
+}
+
+// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc)
+// and assigns a fixed match result.
+func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+	// Define a grouper, working on one set of refs and releases with the same
+	// key at a time. Here, we do verification and write out the generated
+	// biblioref.
+	enc := json.NewEncoder(w)
+	keyer := func(s string) (string, error) {
+		if k := lineColumn(s, "\t", 2); k == "" {
+			return k, fmt.Errorf("cannot get key: %s", s)
+		} else {
+			return k, nil
+		}
+	}
+	grouper := func(g *zipkey.Group) error {
+		if len(g.G0) == 0 || len(g.G1) == 0 {
+			return nil
+		}
+		target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+		if err != nil {
+			return err
+		}
+		for _, line := range g.G1 {
+			ref, err := stringToRef(lineColumn(line, "\t", 3))
+			if err != nil {
+				return err
+			}
+			var bref BiblioRef
+			bref.SourceReleaseIdent = ref.ReleaseIdent
+			bref.SourceWorkIdent = ref.WorkIdent
+			bref.SourceReleaseStage = ref.ReleaseStage
+			bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
+			bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
+			bref.RefKey = ref.Key
+			bref.TargetReleaseIdent = target.Ident
+			bref.TargetWorkIdent = target.WorkID
+			bref.MatchProvenance = provenance
+			bref.MatchStatus = mr.Status.Short()
+			bref.MatchReason = mr.Reason.Short()
+			if err := enc.Encode(bref); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+	zipper := zipkey.New(releases, refs, keyer, grouper)
+	return zipper.Run()
+}
+
+// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc)
+// and will execute gf for each group found.
+func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
+	// Define a grouper, working on one set of refs and releases with the same
+	// key at a time. Here, we do verification and write out the generated
+	// biblioref.
+	enc := json.NewEncoder(w)
+	keyer := func(s string) (string, error) {
+		if k := lineColumn(s, "\t", 2); k == "" {
+			return k, fmt.Errorf("cannot get key: %s", s)
+		} else {
+			return k, nil
+		}
+	}
+	grouper := func(g *zipkey.Group) error {
+		if len(g.G0) == 0 || len(g.G1) == 0 {
+			return nil
+		}
+		pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+		if err != nil {
+			return err
+		}
+		for _, line := range g.G1 {
+			re, err := stringToRelease(lineColumn(line, "\t", 3))
+			if err != nil {
+				return err
+			}
+			result := Verify(pivot, re, 5)
+			switch result.Status {
+			case StatusExact, StatusStrong:
+				if result.Reason == ReasonDOI {
+					continue
+				}
+				br := generateBiblioRef(re, pivot, result.Status, result.Reason, "fuzzy")
+				if err := enc.Encode(br); err != nil {
+					return err
+				}
+			}
+		}
+		return nil
+	}
+	zipper := zipkey.New(releases, refs, keyer, grouper)
+	return zipper.Run()
+}
+
+// lineColumn returns a specific column (1-indexed, like cut) from a tabular
+// file, returns empty string if column is invalid.
+func lineColumn(line, sep string, column int) string {
+	var parts = strings.Split(strings.TrimSpace(line), sep)
+	if len(parts) < column {
+		return ""
+	} else {
+		return parts[column-1]
+	}
+}
+
+func stringToRelease(s string) (r *Release, err error) {
+	err = json.Unmarshal([]byte(s), &r)
+	return
+}
+
+func stringToRef(s string) (r *Ref, err error) {
+	err = json.Unmarshal([]byte(s), &r)
+	return
+}
+
+// Verify follows the fuzzycat (Python) implementation of this function: it
+// compares two release entities. The Go version can be used for large batch
+// processing (where the Python version might take two or more days).
+func Verify(a, b *Release, minTitleLength int) MatchResult {
+	if a.ExtIDs.DOI != "" && a.ExtIDs.DOI == b.ExtIDs.DOI {
+		return MatchResult{StatusExact, ReasonDOI}
+	}
+	if a.WorkID != "" && a.WorkID == b.WorkID {
+		return MatchResult{StatusExact, ReasonWorkID}
+	}
+	aTitleLower := strings.ToLower(a.Title)
+	bTitleLower := strings.ToLower(b.Title)
+	if utf8.RuneCountInString(a.Title) < minTitleLength {
+		return MatchResult{StatusAmbiguous, ReasonShortTitle}
+	}
+	if BlacklistTitle.Contains(aTitleLower) {
+		return MatchResult{StatusAmbiguous, ReasonBlacklisted}
+	}
+	if BlacklistTitle.Contains(bTitleLower) {
+		return MatchResult{StatusAmbiguous, ReasonBlacklisted}
+	}
+	for _, fragment := range BlacklistTitleFragments.Slice() {
+		if strings.Contains(aTitleLower, fragment) {
+			return MatchResult{StatusAmbiguous, ReasonBlacklistedFragment}
+		}
+	}
+	if strings.Contains(aTitleLower, "subject index") && strings.Contains(bTitleLower, "subject index") {
+		if a.ContainerID != "" && a.ContainerID != b.ContainerID {
+			return MatchResult{StatusDifferent, ReasonContainer}
+		}
+	}
+	if a.Title != "" && a.Title == b.Title &&
+		a.Extra.DataCite.MetadataVersion > 0 && b.Extra.DataCite.MetadataVersion > 0 &&
+		a.Extra.DataCite.MetadataVersion != b.Extra.DataCite.MetadataVersion {
+		return MatchResult{StatusExact, ReasonDataciteVersion}
+	}
+	if strings.HasPrefix(a.ExtIDs.DOI, "10.14288/") && strings.HasPrefix(b.ExtIDs.DOI, "10.14288/") &&
+		a.ExtIDs.DOI != b.ExtIDs.DOI {
+		return MatchResult{StatusDifferent, ReasonCustomPrefix1014288}
+	}
+	if strings.HasPrefix(a.ExtIDs.DOI, "10.3403") && strings.HasPrefix(b.ExtIDs.DOI, "10.3403") {
+		if a.ExtIDs.DOI+"u" == b.ExtIDs.DOI || b.ExtIDs.DOI+"u" == a.ExtIDs.DOI {
+			return MatchResult{StatusStrong, ReasonCustomBSIUndated}
+		}
+		aSubtitle := a.Subtitle()
+		bSubtitle := b.Subtitle()
+		if a.Title != "" && a.Title == b.Title &&
+			((len(aSubtitle) > 0 && aSubtitle[0] != "" && len(bSubtitle) == 0) ||
+				(len(aSubtitle) == 0 && len(bSubtitle) > 0 && bSubtitle[0] != "")) {
+			return MatchResult{StatusStrong, ReasonCustomBSISubdoc}
+		}
+	}
+	if strings.HasPrefix(a.ExtIDs.DOI, "10.1149") && strings.HasPrefix(b.ExtIDs.DOI, "10.1149") {
+		v := "10.1149/ma"
+		if (strings.HasPrefix(a.ExtIDs.DOI, v) && !strings.HasPrefix(b.ExtIDs.DOI, v)) ||
+			(!strings.HasPrefix(a.ExtIDs.DOI, v) && strings.HasPrefix(b.ExtIDs.DOI, v)) {
+			return MatchResult{StatusDifferent, ReasonCustomIOPMAPattern}
+		}
+	}
+	if strings.Contains(a.Title, "Zweckverband Volkshochschule") && a.Title != b.Title {
+		return MatchResult{StatusDifferent, ReasonCustomVHS}
+	}
+	if PatAppendix.MatchString(a.Title) {
+		return MatchResult{StatusAmbiguous, ReasonAppendix}
+	}
+	if strings.HasPrefix(a.ExtIDs.DOI, "10.6084/") && strings.HasPrefix(b.ExtIDs.DOI, "10.6084/") {
+		av := PatFigshareVersion.ReplaceAllString(a.ExtIDs.DOI, "")
+		bv := PatFigshareVersion.ReplaceAllString(b.ExtIDs.DOI, "")
+		if av == bv {
+			return MatchResult{StatusStrong, ReasonFigshareVersion}
+		}
+	}
+	if PatVersionedDOI.MatchString(a.ExtIDs.DOI) && PatVersionedDOI.MatchString(b.ExtIDs.DOI) {
+		return MatchResult{StatusStrong, ReasonVersionedDOI}
+	}
+	if looksLikeComponent(a.ExtIDs.DOI, b.ExtIDs.DOI) {
+		return MatchResult{StatusStrong, ReasonVersionedDOI}
+	}
+	if len(a.Extra.DataCite.Relations) > 0 || len(b.Extra.DataCite.Relations) > 0 {
+		getRelatedDOI := func(rel *Release) *set.Set {
+			ss := set.New()
+			for _, rel := range rel.Extra.DataCite.Relations {
+				if strings.ToLower(rel.RelatedIdentifierType) != "doi" {
+					continue
+				}
+				ss.Add(rel.RelatedIdentifier())
+			}
+			return ss
+		}
+		aRelated := getRelatedDOI(a)
+		bRelated := getRelatedDOI(b)
+		if aRelated.Contains(b.ExtIDs.DOI) || bRelated.Contains(a.ExtIDs.DOI) {
+			return MatchResult{StatusStrong, ReasonDataciteRelatedID}
+		}
+	}
+	if a.ExtIDs.Arxiv != "" && b.ExtIDs.Arxiv != "" {
+		aSub := PatArxivVersion.FindStringSubmatch(a.ExtIDs.Arxiv)
+		bSub := PatArxivVersion.FindStringSubmatch(b.ExtIDs.Arxiv)
+		if len(aSub) == 2 && len(bSub) == 2 && aSub[1] == bSub[1] {
+			return MatchResult{StatusStrong, ReasonArxivVersion}
+		}
+	}
+	if a.ReleaseType != b.ReleaseType {
+		types := set.FromSlice([]string{a.ReleaseType, b.ReleaseType})
+		ignoreTypes := set.FromSlice([]string{"article", "article-journal", "report", "paper-conference"})
+		if types.Intersection(ignoreTypes).IsEmpty() {
+			return MatchResult{StatusDifferent, ReasonReleaseType}
+		}
+		if types.Contains("dataset") && (types.Contains("article") || types.Contains("article-journal")) {
+			return MatchResult{StatusDifferent, ReasonReleaseType}
+		}
+		if types.Contains("book") && (types.Contains("article") || types.Contains("article-journal")) {
+			return MatchResult{StatusDifferent, ReasonReleaseType}
+		}
+	}
+	if a.ReleaseType == "dataset" && b.ReleaseType == "dataset" && a.ExtIDs.DOI != b.ExtIDs.DOI {
+		return MatchResult{StatusDifferent, ReasonDatasetDOI}
+	}
+	if a.ReleaseType == "chapter" && b.ReleaseType == "chapter" &&
+		a.Extra.ContainerName != "" && a.Extra.ContainerName != b.Extra.ContainerName {
+		return MatchResult{StatusDifferent, ReasonBookChapter}
+	}
+	if a.Extra.Crossref.Type == "component" && a.Title != b.Title {
+		return MatchResult{StatusDifferent, ReasonComponent}
+	}
+	if a.ReleaseType == "component" && b.ReleaseType == "component" {
+		if a.ExtIDs.DOI != "" && a.ExtIDs.DOI != b.ExtIDs.DOI {
+			return MatchResult{StatusDifferent, ReasonComponent}
+		}
+	}
+	aSlugTitle := strings.TrimSpace(strings.Replace(slugifyString(a.Title), "\n", " ", -1))
+	bSlugTitle := strings.TrimSpace(strings.Replace(slugifyString(b.Title), "\n", " ", -1))
+
+	if aSlugTitle == bSlugTitle {
+		if a.ReleaseYear() != 0 && b.ReleaseYear() != 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 40 {
+			return MatchResult{StatusDifferent, ReasonYear}
+		}
+	}
+	if aSlugTitle == bSlugTitle {
+		ieeeArxivCheck := func(a, b *Release) (ok bool) {
+			return doiPrefix(a.ExtIDs.DOI) == "10.1109" && b.ExtIDs.Arxiv != ""
+		}
+		if ieeeArxivCheck(a, b) || ieeeArxivCheck(b, a) {
+			return MatchResult{StatusStrong, ReasonCustomIEEEArxiv}
+		}
+	}
+	if aSlugTitle == bSlugTitle {
+		if strings.HasPrefix(a.ExtIDs.DOI, "10.7916/") && strings.HasPrefix(b.ExtIDs.DOI, "10.7916/") {
+			return MatchResult{StatusAmbiguous, ReasonCustomPrefix107916}
+		}
+	}
+	if aSlugTitle == bSlugTitle {
+		aSubtitle := a.Subtitle()
+		bSubtitle := b.Subtitle()
+		for _, aSub := range aSubtitle {
+			for _, bSub := range bSubtitle {
+				if slugifyString(aSub) != slugifyString(bSub) {
+					return MatchResult{StatusDifferent, ReasonSubtitle}
+				}
+			}
+		}
+	}
+	rawAuthors := func(rel *Release) (names []string) {
+		for _, c := range rel.Contribs {
+			name := strings.TrimSpace(c.RawName)
+			if name == "" {
+				continue
+			}
+			names = append(names, name)
+		}
+		return names
+	}
+	aAuthors := set.FromSlice(rawAuthors(a))
+	bAuthors := set.FromSlice(rawAuthors(b))
+	aSlugAuthors := set.FromSlice(mapString(slugifyString, aAuthors.Slice()))
+	bSlugAuthors := set.FromSlice(mapString(slugifyString, bAuthors.Slice()))
+	if aTitleLower == bTitleLower {
+		if aAuthors.Len() > 0 && aAuthors.Equals(bAuthors) {
+			if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 4 {
+				return MatchResult{StatusDifferent, ReasonYear}
+			}
+			return MatchResult{StatusExact, ReasonTitleAuthorMatch}
+		}
+	}
+	if looksLikeFilename(a.Title) || looksLikeFilename(b.Title) {
+		if a.Title != b.Title {
+			return MatchResult{StatusDifferent, ReasonTitleFilename}
+		}
+	}
+	if a.Title != "" && a.Title == b.Title {
+		if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 2 {
+			return MatchResult{StatusDifferent, ReasonYear}
+		}
+	}
+	// XXX: skipping chemical formula detection (to few cases; https://git.io/Jtdax)
+	if len(aSlugTitle) < 10 && aSlugTitle != bSlugTitle {
+		return MatchResult{StatusAmbiguous, ReasonShortTitle}
+	}
+	if PatDigits.MatchString(aSlugTitle) &&
+		aSlugTitle != bSlugTitle &&
+		unifyDigits(aSlugTitle) == unifyDigits(bSlugTitle) {
+		return MatchResult{StatusDifferent, ReasonNumDiff}
+	}
+	if aSlugTitle != "" && bSlugTitle != "" &&
+		strings.ReplaceAll(aSlugTitle, " ", "") == strings.ReplaceAll(bSlugTitle, " ", "") {
+		if aSlugAuthors.Intersection(bSlugAuthors).Len() > 0 {
+			if a.ReleaseYear() > 0 && b.ReleaseYear() > 0 && absInt(a.ReleaseYear()-b.ReleaseYear()) > 4 {
+				return MatchResult{StatusDifferent, ReasonYear}
+			}
+			return MatchResult{StatusStrong, ReasonSlugTitleAuthorMatch}
+		}
+	}
+	if a.ReleaseYear() > 0 && a.ReleaseYear() == b.ReleaseYear() && aTitleLower == bTitleLower {
+		if (a.ExtIDs.PMID != "" && b.ExtIDs.DOI != "") || (b.ExtIDs.PMID != "" && a.ExtIDs.DOI != "") {
+			return MatchResult{StatusStrong, ReasonPMIDDOIPair}
+		}
+	}
+	if a.ExtIDs.Jstor != "" && b.ExtIDs.Jstor != "" && a.ExtIDs.Jstor != b.ExtIDs.Jstor {
+		return MatchResult{StatusDifferent, ReasonJstorID}
+	}
+	if a.ContainerID != "" && a.ContainerID == b.ContainerID && a.ExtIDs.DOI != b.ExtIDs.DOI &&
+		doiPrefix(a.ExtIDs.DOI) != "10.1126" &&
+		doiPrefix(a.ExtIDs.DOI) == doiPrefix(b.ExtIDs.DOI) {
+		return MatchResult{StatusDifferent, ReasonSharedDOIPrefix}
+	}
+	if aAuthors.Len() > 0 && aSlugAuthors.Intersection(bSlugAuthors).IsEmpty() {
+		numAuthors := set.Min(aSlugAuthors, bSlugAuthors)
+		score := averageScore(aSlugAuthors, bSlugAuthors)
+		if (numAuthors < 3 && score > 0.9) || (numAuthors >= 3 && score > 0.5) {
+			return MatchResult{StatusStrong, ReasonTokenizedAuthors}
+		}
+		aTok := set.FromSlice(strings.Fields(aSlugAuthors.Join(" ")))
+		bTok := set.FromSlice(strings.Fields(bSlugAuthors.Join(" ")))
+		aTok = set.Filter(aTok, func(s string) bool {
+			return len(s) > 2
+		})
+		bTok = set.Filter(bTok, func(s string) bool {
+			return len(s) > 2
+		})
+		if aTok.Len() > 0 && bTok.Len() > 0 {
+			if aTok.Jaccard(bTok) > 0.35 {
+				return MatchResult{StatusStrong, ReasonJaccardAuthors}
+			}
+		}
+		return MatchResult{StatusDifferent, ReasonContribIntersectionEmpty}
+	}
+	if doiPrefix(a.ExtIDs.DOI) == "10.5860" || doiPrefix(b.ExtIDs.DOI) == "10.5860" {
+		return MatchResult{StatusAmbiguous, ReasonCustomPrefix105860ChoiceReview}
+	}
+	// XXX: parse pages
+	aParsedPages := parsePageString(a.Pages)
+	bParsedPages := parsePageString(b.Pages)
+	if aParsedPages.Err != nil && bParsedPages.Err != nil {
+		if absInt(aParsedPages.Count()-bParsedPages.Count()) > 5 {
+			return MatchResult{StatusDifferent, ReasonPageCount}
+		}
+	}
+	if aAuthors.Equals(bAuthors) &&
+		a.ContainerID == b.ContainerID &&
+		a.ReleaseYear() == b.ReleaseYear() &&
+		a.Title != b.Title &&
+		(strings.Contains(a.Title, b.Title) || strings.Contains(b.Title, a.Title)) {
+		return MatchResult{StatusStrong, ReasonTitleArtifact}
+	}
+	return MatchResult{
+		StatusAmbiguous,
+		ReasonUnknown,
+	}
+}
+
+type ParsedPages struct {
+	Start int
+	End   int
+	Err   error
+}
+
+func (pp *ParsedPages) Count() int {
+	return pp.End - pp.Start + 1
+}
+
+func parsePageString(s string) *ParsedPages {
+	s = strings.TrimSpace(s)
+	var pp = ParsedPages{}
+	if len(s) == 0 {
+		pp.Err = fmt.Errorf("parse pages: empty string")
+		return &pp
+	}
+	matches := PatPages.FindStringSubmatch(s)
+	if len(matches) != 3 {
+		pp.Err = fmt.Errorf("parse pages: no page pattern")
+		return &pp
+	}
+	start, end := matches[1], matches[2]
+	if len(end) == 1 && len(start) > 1 && start[len(start)-1] < end[0] {
+		end = fmt.Sprintf("%s%c", start[:len(start)-1], end[0])
+	}
+	if pp.Start, pp.Err = strconv.Atoi(start); pp.Err != nil {
+		return &pp
+	}
+	if pp.End, pp.Err = strconv.Atoi(end); pp.Err != nil {
+		return &pp
+	}
+	if pp.Start > pp.End {
+		pp.Err = fmt.Errorf("invalid page count: %s", s)
+	}
+	return &pp
+}
+
+// averageScore take a limited set of authors and calculates pairwise
+// similarity scores, then returns the average of the best scores; between 0
+// and 1.
+func averageScore(a, b *set.Set) float64 {
+	aTrimmed := a.TopK(5)
+	bTrimmed := b.TopK(5)
+	maxScores := make(map[string]float64) // For each a, keep the max.
+	for _, pair := range aTrimmed.Product(bTrimmed) {
+		a, b := pair[0], pair[1]
+		score := authorSimilarityScore(a, b)
+		if v, ok := maxScores[a]; !ok || score > v {
+			maxScores[a] = score
+		}
+	}
+	var sum, avg float64
+	for _, v := range maxScores {
+		sum += v
+	}
+	avg = sum / float64(len(maxScores))
+	return avg
+}
+
+// authorSimilarityScore is a hacky similarity score.
+func authorSimilarityScore(s, t string) float64 {
+	ss := set.FromSlice(tokenNgrams(s, 2))
+	ts := set.FromSlice(tokenNgrams(t, 2))
+	return ss.Jaccard(ts)
+}
+
+// tokenNgrams are groups of n tokens per token in string, e.g. for n=2 and
+// string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"].
+func tokenNgrams(s string, n int) (result []string) {
+	var buf bytes.Buffer
+	for _, token := range tokenizeString(s) {
+		buf.Reset()
+		for i, c := range token {
+			if i > 0 && i%n == 0 {
+				result = append(result, buf.String())
+				buf.Reset()
+			}
+			buf.WriteRune(c) // XXX: skipping error handling
+		}
+		result = append(result, buf.String())
+	}
+	return
+}
+
+func tokenizeString(s string) []string {
+	return strings.Fields(strings.ToLower(s))
+}
+
+func doiPrefix(s string) string {
+	parts := strings.Split(s, "/")
+	return parts[0]
+}
+
+// unifyDigits replaces all digit groups with a placeholder, e.g. "<NUM>".
+func unifyDigits(s string) string {
+	return PatDigits.ReplaceAllString(s, "<NUM>")
+}
+
+// looksLikeFilename returns true, if the given string could be a filename.
+func looksLikeFilename(s string) bool {
+	if len(strings.Fields(s)) > 1 {
+		return false
+	}
+	return PatFilenameLike.MatchString(s)
+}
+
+// mapString applies a function on each element of a string slice.
+func mapString(f func(string) string, vs []string) (result []string) {
+	for _, v := range vs {
+		result = append(result, f(v))
+	}
+	return result
+}
+
+// absInt returns the absolute value of an int.
+func absInt(v int) int {
+	if v < 0 {
+		return -v
+	}
+	return v
+}
+
+// slugifyString is a basic string slugifier.
+func slugifyString(s string) string {
+	var buf bytes.Buffer
+	for _, c := range strings.TrimSpace(strings.ToLower(s)) {
+		if (c > 96 && c < 123) || (c > 47 && c < 58) || (c == 32) || (c == 9) || (c == 10) {
+			fmt.Fprintf(&buf, "%c", c)
+		}
+	}
+	return strings.Join(strings.Fields(buf.String()), " ")
+}
+
+// looksLikeComponent returns true, if either a looks like a component of b, or vice versa.
+func looksLikeComponent(a, b string) bool {
+	ac := strings.Split(a, ".")
+	bc := strings.Split(b, ".")
+	if len(ac) > 1 {
+		if strings.Join(ac[0:len(ac)-1], ".") == b {
+			return true
+		}
+	}
+	if len(bc) > 1 {
+		if strings.Join(bc[0:len(bc)-1], ".") == a {
+			return true
+		}
+	}
+	return false
+}