about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- skate/verify.go 1
-rw-r--r-- skate/verify_string.go 107
-rw-r--r-- skate/zippy.go 48
3 files changed, 100 insertions, 56 deletions
diff --git a/skate/verify.go b/skate/verify.go
index a656b1a..ee82fe5 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -33,6 +33,7 @@ const (
StatusWeak
StatusDifferent
StatusAmbiguous
+ StatusUnmatched
ReasonUnknown Reason = iota
ReasonAppendix
diff --git a/skate/verify_string.go b/skate/verify_string.go
index 53bbd30..13b1f82 100644
--- a/skate/verify_string.go
+++ b/skate/verify_string.go
@@ -14,11 +14,12 @@ func _() {
_ = x[StatusWeak-3]
_ = x[StatusDifferent-4]
_ = x[StatusAmbiguous-5]
+ _ = x[StatusUnmatched-6]
}
-const _Status_name = "StatusUnknownStatusExactStatusStrongStatusWeakStatusDifferentStatusAmbiguous"
+const _Status_name = "StatusUnknownStatusExactStatusStrongStatusWeakStatusDifferentStatusAmbiguousStatusUnmatched"
-var _Status_index = [...]uint8{0, 13, 24, 36, 46, 61, 76}
+var _Status_index = [...]uint8{0, 13, 24, 36, 46, 61, 76, 91}
func (i Status) String() string {
if i < 0 || i >= Status(len(_Status_index)-1) {
@@ -30,55 +31,55 @@ func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
- _ = x[ReasonUnknown-6]
- _ = x[ReasonAppendix-7]
- _ = x[ReasonArxiv-8]
- _ = x[ReasonArxivVersion-9]
- _ = x[ReasonBlacklisted-10]
- _ = x[ReasonBlacklistedFragment-11]
- _ = x[ReasonBookChapter-12]
- _ = x[ReasonChemFormula-13]
- _ = x[ReasonComponent-14]
- _ = x[ReasonContainer-15]
- _ = x[ReasonContainerNameBlacklist-16]
- _ = x[ReasonContribIntersectionEmpty-17]
- _ = x[ReasonCustomBSISubdoc-18]
- _ = x[ReasonCustomBSIUndated-19]
- _ = x[ReasonCustomIEEEArxiv-20]
- _ = x[ReasonCustomIOPMAPattern-21]
- _ = x[ReasonCustomPrefix1014288-22]
- _ = x[ReasonCustomPrefix105860ChoiceReview-23]
- _ = x[ReasonCustomPrefix107916-24]
- _ = x[ReasonCustomVHS-25]
- _ = x[ReasonDOI-26]
- _ = x[ReasonDataciteRelatedID-27]
- _ = x[ReasonDataciteVersion-28]
- _ = x[ReasonDatasetDOI-29]
- _ = x[ReasonFigshareVersion-30]
- _ = x[ReasonISBN-31]
- _ = x[ReasonJaccardAuthors-32]
- _ = x[ReasonJstorID-33]
- _ = x[ReasonMaxClusterSizeExceeded-34]
- _ = x[ReasonNumDiff-35]
- _ = x[ReasonPMCID-36]
- _ = x[ReasonPMID-37]
- _ = x[ReasonPMIDDOIPair-38]
- _ = x[ReasonPageCount-39]
- _ = x[ReasonPreprintPublished-40]
- _ = x[ReasonPublisherBlacklist-41]
- _ = x[ReasonReleaseType-42]
- _ = x[ReasonSharedDOIPrefix-43]
- _ = x[ReasonShortTitle-44]
- _ = x[ReasonSingularCluster-45]
- _ = x[ReasonSlugTitleAuthorMatch-46]
- _ = x[ReasonSubtitle-47]
- _ = x[ReasonTitleArtifact-48]
- _ = x[ReasonTitleAuthorMatch-49]
- _ = x[ReasonTitleFilename-50]
- _ = x[ReasonTokenizedAuthors-51]
- _ = x[ReasonVersionedDOI-52]
- _ = x[ReasonWorkID-53]
- _ = x[ReasonYear-54]
+ _ = x[ReasonUnknown-7]
+ _ = x[ReasonAppendix-8]
+ _ = x[ReasonArxiv-9]
+ _ = x[ReasonArxivVersion-10]
+ _ = x[ReasonBlacklisted-11]
+ _ = x[ReasonBlacklistedFragment-12]
+ _ = x[ReasonBookChapter-13]
+ _ = x[ReasonChemFormula-14]
+ _ = x[ReasonComponent-15]
+ _ = x[ReasonContainer-16]
+ _ = x[ReasonContainerNameBlacklist-17]
+ _ = x[ReasonContribIntersectionEmpty-18]
+ _ = x[ReasonCustomBSISubdoc-19]
+ _ = x[ReasonCustomBSIUndated-20]
+ _ = x[ReasonCustomIEEEArxiv-21]
+ _ = x[ReasonCustomIOPMAPattern-22]
+ _ = x[ReasonCustomPrefix1014288-23]
+ _ = x[ReasonCustomPrefix105860ChoiceReview-24]
+ _ = x[ReasonCustomPrefix107916-25]
+ _ = x[ReasonCustomVHS-26]
+ _ = x[ReasonDOI-27]
+ _ = x[ReasonDataciteRelatedID-28]
+ _ = x[ReasonDataciteVersion-29]
+ _ = x[ReasonDatasetDOI-30]
+ _ = x[ReasonFigshareVersion-31]
+ _ = x[ReasonISBN-32]
+ _ = x[ReasonJaccardAuthors-33]
+ _ = x[ReasonJstorID-34]
+ _ = x[ReasonMaxClusterSizeExceeded-35]
+ _ = x[ReasonNumDiff-36]
+ _ = x[ReasonPMCID-37]
+ _ = x[ReasonPMID-38]
+ _ = x[ReasonPMIDDOIPair-39]
+ _ = x[ReasonPageCount-40]
+ _ = x[ReasonPreprintPublished-41]
+ _ = x[ReasonPublisherBlacklist-42]
+ _ = x[ReasonReleaseType-43]
+ _ = x[ReasonSharedDOIPrefix-44]
+ _ = x[ReasonShortTitle-45]
+ _ = x[ReasonSingularCluster-46]
+ _ = x[ReasonSlugTitleAuthorMatch-47]
+ _ = x[ReasonSubtitle-48]
+ _ = x[ReasonTitleArtifact-49]
+ _ = x[ReasonTitleAuthorMatch-50]
+ _ = x[ReasonTitleFilename-51]
+ _ = x[ReasonTokenizedAuthors-52]
+ _ = x[ReasonVersionedDOI-53]
+ _ = x[ReasonWorkID-54]
+ _ = x[ReasonYear-55]
}
const _Reason_name = "ReasonUnknownReasonAppendixReasonArxivReasonArxivVersionReasonBlacklistedReasonBlacklistedFragmentReasonBookChapterReasonChemFormulaReasonComponentReasonContainerReasonContainerNameBlacklistReasonContribIntersectionEmptyReasonCustomBSISubdocReasonCustomBSIUndatedReasonCustomIEEEArxivReasonCustomIOPMAPatternReasonCustomPrefix1014288ReasonCustomPrefix105860ChoiceReviewReasonCustomPrefix107916ReasonCustomVHSReasonDOIReasonDataciteRelatedIDReasonDataciteVersionReasonDatasetDOIReasonFigshareVersionReasonISBNReasonJaccardAuthorsReasonJstorIDReasonMaxClusterSizeExceededReasonNumDiffReasonPMCIDReasonPMIDReasonPMIDDOIPairReasonPageCountReasonPreprintPublishedReasonPublisherBlacklistReasonReleaseTypeReasonSharedDOIPrefixReasonShortTitleReasonSingularClusterReasonSlugTitleAuthorMatchReasonSubtitleReasonTitleArtifactReasonTitleAuthorMatchReasonTitleFilenameReasonTokenizedAuthorsReasonVersionedDOIReasonWorkIDReasonYear"
@@ -86,9 +87,9 @@ const _Reason_name = "ReasonUnknownReasonAppendixReasonArxivReasonArxivVersionRe
var _Reason_index = [...]uint16{0, 13, 27, 38, 56, 73, 98, 115, 132, 147, 162, 190, 220, 241, 263, 284, 308, 333, 369, 393, 408, 417, 440, 461, 477, 498, 508, 528, 541, 569, 582, 593, 603, 620, 635, 658, 682, 699, 720, 736, 757, 783, 797, 816, 838, 857, 879, 897, 909, 919}
func (i Reason) String() string {
- i -= 6
+ i -= 7
if i < 0 || i >= Reason(len(_Reason_index)-1) {
- return "Reason(" + strconv.FormatInt(int64(i+6), 10) + ")"
+ return "Reason(" + strconv.FormatInt(int64(i+7), 10) + ")"
}
return _Reason_name[_Reason_index[i]:_Reason_index[i+1]]
}
diff --git a/skate/zippy.go b/skate/zippy.go
index c949069..7aff7a6 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -319,7 +319,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
// We can identify, which docs have been matched by checking the ref key and index.
func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
var (
- _ = json.NewEncoder(w)
+ enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
grouper = func(g *zipkey.Group) error {
// g.G0 contains a matched docs for a given work id, g.G1 all raw
@@ -327,13 +327,26 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
// First, iterate over all matches and sort out duplicates, e.g.
// docs that have the same source and target id.
- unique, err := uniqueMatches(g.G0)
+ matched, err := uniqueMatches(g.G0)
if err != nil {
return err
}
+ var refs = make([]*Ref, len(g.G1))
+ for i := 0; i < len(refs); i++ {
+ var ref Ref
+ if err := json.Unmarshal([]byte(g.G1[i]), &ref); err != nil {
+ return err
+ }
+ refs[i] = &ref
+ }
+ matchedRefsExtend(matched, refs)
+ for _, bref := range matched {
+ if err := enc.Encode(bref); err != nil {
+ return err
+ }
+ }
// We want to find all items in g.G1, which are not in unique. This
// is a set like operation, but we want a custom comparator.
- log.Println(unique)
return nil
}
@@ -342,6 +355,35 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
return zipper.Run()
}
+// matchedRefsExtend returns the given (unique) biblioref docs, unchanged and
+// in their original order, followed by one biblioref doc for every raw
+// reference that is not found among the matched docs.
+//
+// The extended slice is returned because append may move the data to a new
+// backing array and never updates the caller's slice header; additions made
+// only to the local parameter would be lost. Callers must use the result: a
+// call written as a bare statement still compiles, but it will not see the
+// docs created here.
+//
+// NOTE(review): membership is decided by comparing Key + RefIndex of a
+// matched doc with Key + Index of a raw ref. The docs built below keep the raw
+// ref key in RefKey (not Key), so if matched docs follow the same layout this
+// filter may never hit; confirm which fields the matched docs carry. The two
+// parts are also joined without a separator, which can be ambiguous.
+func matchedRefsExtend(matched []*BiblioRef, refs []*Ref) []*BiblioRef {
+	s := set.New() // store key + index of matched items
+	for _, m := range matched {
+		s.Add(m.Key + fmt.Sprintf("%d", m.RefIndex))
+	}
+	for _, r := range refs {
+		if s.Contains(r.Key + fmt.Sprintf("%d", r.Index)) {
+			// Already represented by a matched doc.
+			continue
+		}
+		// No match recorded for this raw reference; carry it over as a
+		// biblioref doc of its own, filled from the source side only.
+		var bref BiblioRef
+		bref.Key = fmt.Sprintf("%s_%d", r.ReleaseIdent, r.Index)
+		bref.RefIndex = r.Index
+		bref.RefKey = r.Key
+		bref.SourceReleaseIdent = r.ReleaseIdent
+		bref.SourceReleaseStage = r.ReleaseStage
+		bref.SourceWorkIdent = r.WorkIdent
+		bref.SourceYear = fmt.Sprintf("%d", r.ReleaseYear)
+		bref.TargetUnstructured = r.Biblio.Unstructured
+		// Reuse fields for debugging, for now.
+		bref.MatchStatus = StatusUnmatched.Short()
+		bref.MatchReason = ReasonUnknown.Short()
+		matched = append(matched, &bref)
+	}
+	return matched
+}
+
// uniqueMatches takes a list of bref docs (unserialized) and will return a
// list of serialized bref docs, containing unique matches.
func uniqueMatches(docs []string) (result []*BiblioRef, err error) {