diff options
-rw-r--r-- | skate/verify.go | 1 | ||||
-rw-r--r-- | skate/verify_string.go | 107 | ||||
-rw-r--r-- | skate/zippy.go | 48 |
3 files changed, 100 insertions, 56 deletions
diff --git a/skate/verify.go b/skate/verify.go index a656b1a..ee82fe5 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -33,6 +33,7 @@ const ( StatusWeak StatusDifferent StatusAmbiguous + StatusUnmatched ReasonUnknown Reason = iota ReasonAppendix diff --git a/skate/verify_string.go b/skate/verify_string.go index 53bbd30..13b1f82 100644 --- a/skate/verify_string.go +++ b/skate/verify_string.go @@ -14,11 +14,12 @@ func _() { _ = x[StatusWeak-3] _ = x[StatusDifferent-4] _ = x[StatusAmbiguous-5] + _ = x[StatusUnmatched-6] } -const _Status_name = "StatusUnknownStatusExactStatusStrongStatusWeakStatusDifferentStatusAmbiguous" +const _Status_name = "StatusUnknownStatusExactStatusStrongStatusWeakStatusDifferentStatusAmbiguousStatusUnmatched" -var _Status_index = [...]uint8{0, 13, 24, 36, 46, 61, 76} +var _Status_index = [...]uint8{0, 13, 24, 36, 46, 61, 76, 91} func (i Status) String() string { if i < 0 || i >= Status(len(_Status_index)-1) { @@ -30,55 +31,55 @@ func _() { // An "invalid array index" compiler error signifies that the constant values have changed. // Re-run the stringer command to generate them again. var x [1]struct{} - _ = x[ReasonUnknown-6] - _ = x[ReasonAppendix-7] - _ = x[ReasonArxiv-8] - _ = x[ReasonArxivVersion-9] - _ = x[ReasonBlacklisted-10] - _ = x[ReasonBlacklistedFragment-11] - _ = x[ReasonBookChapter-12] - _ = x[ReasonChemFormula-13] - _ = x[ReasonComponent-14] - _ = x[ReasonContainer-15] - _ = x[ReasonContainerNameBlacklist-16] - _ = x[ReasonContribIntersectionEmpty-17] - _ = x[ReasonCustomBSISubdoc-18] - _ = x[ReasonCustomBSIUndated-19] - _ = x[ReasonCustomIEEEArxiv-20] - _ = x[ReasonCustomIOPMAPattern-21] - _ = x[ReasonCustomPrefix1014288-22] - _ = x[ReasonCustomPrefix105860ChoiceReview-23] - _ = x[ReasonCustomPrefix107916-24] - _ = x[ReasonCustomVHS-25] - _ = x[ReasonDOI-26] - _ = x[ReasonDataciteRelatedID-27] - _ = x[ReasonDataciteVersion-28] - _ = x[ReasonDatasetDOI-29] - _ = x[ReasonFigshareVersion-30] - _ = x[ReasonISBN-31] - _ = x[ReasonJaccardAuthors-32] - _ = x[ReasonJstorID-33] - _ = x[ReasonMaxClusterSizeExceeded-34] - _ = x[ReasonNumDiff-35] - _ = x[ReasonPMCID-36] - _ = x[ReasonPMID-37] - _ = x[ReasonPMIDDOIPair-38] - _ = x[ReasonPageCount-39] - _ = x[ReasonPreprintPublished-40] - _ = x[ReasonPublisherBlacklist-41] - _ = x[ReasonReleaseType-42] - _ = x[ReasonSharedDOIPrefix-43] - _ = x[ReasonShortTitle-44] - _ = x[ReasonSingularCluster-45] - _ = x[ReasonSlugTitleAuthorMatch-46] - _ = x[ReasonSubtitle-47] - _ = x[ReasonTitleArtifact-48] - _ = x[ReasonTitleAuthorMatch-49] - _ = x[ReasonTitleFilename-50] - _ = x[ReasonTokenizedAuthors-51] - _ = x[ReasonVersionedDOI-52] - _ = x[ReasonWorkID-53] - _ = x[ReasonYear-54] + _ = x[ReasonUnknown-7] + _ = x[ReasonAppendix-8] + _ = x[ReasonArxiv-9] + _ = x[ReasonArxivVersion-10] + _ = x[ReasonBlacklisted-11] + _ = x[ReasonBlacklistedFragment-12] + _ = x[ReasonBookChapter-13] + _ = x[ReasonChemFormula-14] + _ = x[ReasonComponent-15] + _ = x[ReasonContainer-16] + _ = x[ReasonContainerNameBlacklist-17] + _ = x[ReasonContribIntersectionEmpty-18] + _ = x[ReasonCustomBSISubdoc-19] + _ = x[ReasonCustomBSIUndated-20] + _ = x[ReasonCustomIEEEArxiv-21] + _ = x[ReasonCustomIOPMAPattern-22] + _ = x[ReasonCustomPrefix1014288-23] + _ = x[ReasonCustomPrefix105860ChoiceReview-24] + _ = x[ReasonCustomPrefix107916-25] + _ = x[ReasonCustomVHS-26] + _ = x[ReasonDOI-27] + _ = x[ReasonDataciteRelatedID-28] + _ = x[ReasonDataciteVersion-29] + _ = x[ReasonDatasetDOI-30] + _ = x[ReasonFigshareVersion-31] + _ = x[ReasonISBN-32] + _ = x[ReasonJaccardAuthors-33] + _ = x[ReasonJstorID-34] + _ = x[ReasonMaxClusterSizeExceeded-35] + _ = x[ReasonNumDiff-36] + _ = x[ReasonPMCID-37] + _ = x[ReasonPMID-38] + _ = x[ReasonPMIDDOIPair-39] + _ = x[ReasonPageCount-40] + _ = x[ReasonPreprintPublished-41] + _ = x[ReasonPublisherBlacklist-42] + _ = x[ReasonReleaseType-43] + _ = x[ReasonSharedDOIPrefix-44] + _ = x[ReasonShortTitle-45] + _ = x[ReasonSingularCluster-46] + _ = x[ReasonSlugTitleAuthorMatch-47] + _ = x[ReasonSubtitle-48] + _ = x[ReasonTitleArtifact-49] + _ = x[ReasonTitleAuthorMatch-50] + _ = x[ReasonTitleFilename-51] + _ = x[ReasonTokenizedAuthors-52] + _ = x[ReasonVersionedDOI-53] + _ = x[ReasonWorkID-54] + _ = x[ReasonYear-55] } const _Reason_name = "ReasonUnknownReasonAppendixReasonArxivReasonArxivVersionReasonBlacklistedReasonBlacklistedFragmentReasonBookChapterReasonChemFormulaReasonComponentReasonContainerReasonContainerNameBlacklistReasonContribIntersectionEmptyReasonCustomBSISubdocReasonCustomBSIUndatedReasonCustomIEEEArxivReasonCustomIOPMAPatternReasonCustomPrefix1014288ReasonCustomPrefix105860ChoiceReviewReasonCustomPrefix107916ReasonCustomVHSReasonDOIReasonDataciteRelatedIDReasonDataciteVersionReasonDatasetDOIReasonFigshareVersionReasonISBNReasonJaccardAuthorsReasonJstorIDReasonMaxClusterSizeExceededReasonNumDiffReasonPMCIDReasonPMIDReasonPMIDDOIPairReasonPageCountReasonPreprintPublishedReasonPublisherBlacklistReasonReleaseTypeReasonSharedDOIPrefixReasonShortTitleReasonSingularClusterReasonSlugTitleAuthorMatchReasonSubtitleReasonTitleArtifactReasonTitleAuthorMatchReasonTitleFilenameReasonTokenizedAuthorsReasonVersionedDOIReasonWorkIDReasonYear" @@ -86,9 +87,9 @@ const _Reason_name = "ReasonUnknownReasonAppendixReasonArxivReasonArxivVersionRe var _Reason_index = [...]uint16{0, 13, 27, 38, 56, 73, 98, 115, 132, 147, 162, 190, 220, 241, 263, 284, 308, 333, 369, 393, 408, 417, 440, 461, 477, 498, 508, 528, 541, 569, 582, 593, 603, 620, 635, 658, 682, 699, 720, 736, 757, 783, 797, 816, 838, 857, 879, 897, 909, 919} func (i Reason) String() string { - i -= 6 + i -= 7 if i < 0 || i >= Reason(len(_Reason_index)-1) { - return "Reason(" + strconv.FormatInt(int64(i+6), 10) + ")" + return "Reason(" + strconv.FormatInt(int64(i+7), 10) + ")" } return _Reason_name[_Reason_index[i]:_Reason_index[i+1]] } diff --git a/skate/zippy.go b/skate/zippy.go index c949069..7aff7a6 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -319,7 +319,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { // We can identify, which docs have been matched by checking the ref key and index. func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { var ( - _ = json.NewEncoder(w) + enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { // g.G0 contains a matched docs for a given work id, g.G1 all raw @@ -327,13 +327,26 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { // First, iterate over all matches and sort out duplicates, e.g. // docs that have the same source and target id. - unique, err := uniqueMatches(g.G0) + matched, err := uniqueMatches(g.G0) if err != nil { return err } + var refs = make([]*Ref, len(g.G1)) + for i := 0; i < len(refs); i++ { + var ref Ref + if err := json.Unmarshal([]byte(g.G1[i]), &ref); err != nil { + return err + } + refs[i] = &ref + } + matchedRefsExtend(matched, refs) + for _, bref := range matched { + if err := enc.Encode(bref); err != nil { + return err + } + } // We want to find all items in g.G1, which are not in unique. This // is a set like operation, but we want a custom comparator. - log.Println(unique) return nil } @@ -342,6 +355,35 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { return zipper.Run() } +// matchedRefsExtend takes a set of (unique) biblioref docs and will emit that +// set of biblioref docs (unchanged) plus raw references as biblioref, which +// did not result in a match (determined by ref key and index). +func matchedRefsExtend(matched []*BiblioRef, refs []*Ref) { + s := set.New() // store key + index of matched items + for _, m := range matched { + s.Add(m.Key + fmt.Sprintf("%d", m.RefIndex)) + } + for _, r := range refs { + if s.Contains(r.Key + fmt.Sprintf("%d", r.Index)) { + continue + } + var bref BiblioRef + bref.Key = fmt.Sprintf("%s_%d", r.ReleaseIdent, r.Index) + bref.RefIndex = r.Index + bref.RefKey = r.Key + bref.SourceReleaseIdent = r.ReleaseIdent + bref.SourceReleaseStage = r.ReleaseStage + bref.SourceWorkIdent = r.WorkIdent + bref.SourceYear = fmt.Sprintf("%d", r.ReleaseYear) + bref.TargetUnstructured = r.Biblio.Unstructured + // Reuse fields for debugging, for now. + bref.MatchStatus = StatusUnmatched.Short() + bref.MatchReason = ReasonUnknown.Short() + matched = append(matched, &bref) + } + return +} + // uniqueMatches takes a list of bref docs (unserialized) and will return a // list of serialized bref docs, containing unique matches. func uniqueMatches(docs []string) (result []*BiblioRef, err error) { |