From d55dbd5b9ef8a1c4963d73e33bf76313175fcc30 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 15 Jul 2021 18:41:59 +0200
Subject: add ZippyWayback reducer

---
 skate/reduce.go        |  60 +++++++++++++++++++++++++++-
 skate/verify.go        |   5 ++-
 skate/verify_string.go | 103 +++++++++++++++++++++++++------------------------
 3 files changed, 114 insertions(+), 54 deletions(-)

diff --git a/skate/reduce.go b/skate/reduce.go
index df96076..8658ffe 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -359,6 +359,59 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 	return zipper.Run()
 }
 
+// ZippyWayback takes a (url, ref) reader and a (url, cdx) reader and writes
+// one bref document per group of lines sharing a key. Keys are extracted with
+// makeKeyFunc("\t", 1) (presumably the URL in the first tab-separated column;
+// inputs probably need to be sorted by it - TODO confirm). Only the first line
+// of each side of a group is used; its payload is taken with Cut(line, 2).
+func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
+	var (
+		enc     = json.NewEncoder(xio.NewSingleWriter(w))
+		keyer   = makeKeyFunc("\t", 1)
+		grouper = func(g *zipkey.Group) error {
+			if len(g.G0) == 0 || len(g.G1) == 0 {
+				return nil // Nothing to join; do not index an empty side.
+			}
+			// We take a single item from refs and a single item from cdx.
+			ref, err := parseRef(Cut(g.G0[0], 2))
+			if err != nil {
+				return err
+			}
+			// Not named cdx, so the cdx reader parameter is not shadowed.
+			summary, err := parseCdxSummary(Cut(g.G1[0], 2))
+			if err != nil {
+				return err
+			}
+			bref := BiblioRef{
+				SourceReleaseIdent: ref.ReleaseIdent,
+				SourceWorkIdent:    ref.WorkIdent,
+				SourceReleaseStage: ref.ReleaseStage,
+				SourceYear:         fmt.Sprintf("%d", ref.ReleaseYear),
+				RefIndex:           ref.Index + 1, // we want 1-index (also helps with omitempty)
+				RefKey:             ref.Key,
+				TargetURL:          summary.Line,
+				MatchStatus:        StatusExact.Short(),
+				MatchReason:        ReasonURLMatch.Short(),
+			}
+			// The plain URL is the default; a wayback link is used only if
+			// the summary has rows and a non-empty Summary.Ok value.
+			//
+			// TODO: It would be better to add a wayback link only if the
+			// live web fails. For that we would need a full check of the
+			// URLs on the live web.
+			if summary.NumRows != 0 && summary.Summary.Ok != "" {
+				bref.TargetURL = fmt.Sprintf("https://web.archive.org/web/%s/%s",
+					summary.Summary.Ok, summary.Line)
+			}
+			return enc.Encode(bref)
+		}
+		batcher = zipkey.NewBatcher(grouper)
+	)
+	defer batcher.Close()
+	zipper := zipkey.New(refs, cdx, keyer, batcher.GroupFunc)
+	return zipper.Run()
+}
+
 // ZippyBrefAugment takes all matched docs from bref and adds docs from raw
 // refs, which have not been matched. It also gets rid of duplicate matches.
 // Note: This operates on two streams: raw refs with about 2.5B (07/2021) and
@@ -661,7 +714,12 @@ func parseWiki(s string) (r *MinimalCitations, err error) {
 	return
 }
 
-func parseBiblioref(s string) (r *BiblioRef, err error) {
+func parseBiblioRef(s string) (r *BiblioRef, err error) {
+	err = json.Unmarshal([]byte(s), &r)
+	return
+}
+
+func parseCdxSummary(s string) (r *cdxSummary, err error) {
 	err = json.Unmarshal([]byte(s), &r)
 	return
 }
diff --git a/skate/verify.go b/skate/verify.go
index db3a925..5cb56bb 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -35,8 +35,7 @@ const (
 	StatusAmbiguous
 	StatusUnmatched
 
-	ReasonUnknown Reason = iota
-	ReasonAppendix
+	ReasonAppendix Reason = iota
 	ReasonArxiv
 	ReasonArxivVersion
 	ReasonBlacklisted
@@ -81,6 +80,8 @@ const (
 	ReasonTitleAuthorMatch
 	ReasonTitleFilename
 	ReasonTokenizedAuthors
+	ReasonURLMatch
+	ReasonUnknown
 	ReasonVersionedDOI
 	ReasonWorkID
 	ReasonYear
diff --git a/skate/verify_string.go b/skate/verify_string.go
index 13b1f82..4531a39 100644
--- a/skate/verify_string.go
+++ b/skate/verify_string.go
@@ -31,60 +31,61 @@ func _() {
 	// An "invalid array index" compiler error signifies that the constant values have changed.
 	// Re-run the stringer command to generate them again.
var x [1]struct{} - _ = x[ReasonUnknown-7] - _ = x[ReasonAppendix-8] - _ = x[ReasonArxiv-9] - _ = x[ReasonArxivVersion-10] - _ = x[ReasonBlacklisted-11] - _ = x[ReasonBlacklistedFragment-12] - _ = x[ReasonBookChapter-13] - _ = x[ReasonChemFormula-14] - _ = x[ReasonComponent-15] - _ = x[ReasonContainer-16] - _ = x[ReasonContainerNameBlacklist-17] - _ = x[ReasonContribIntersectionEmpty-18] - _ = x[ReasonCustomBSISubdoc-19] - _ = x[ReasonCustomBSIUndated-20] - _ = x[ReasonCustomIEEEArxiv-21] - _ = x[ReasonCustomIOPMAPattern-22] - _ = x[ReasonCustomPrefix1014288-23] - _ = x[ReasonCustomPrefix105860ChoiceReview-24] - _ = x[ReasonCustomPrefix107916-25] - _ = x[ReasonCustomVHS-26] - _ = x[ReasonDOI-27] - _ = x[ReasonDataciteRelatedID-28] - _ = x[ReasonDataciteVersion-29] - _ = x[ReasonDatasetDOI-30] - _ = x[ReasonFigshareVersion-31] - _ = x[ReasonISBN-32] - _ = x[ReasonJaccardAuthors-33] - _ = x[ReasonJstorID-34] - _ = x[ReasonMaxClusterSizeExceeded-35] - _ = x[ReasonNumDiff-36] - _ = x[ReasonPMCID-37] - _ = x[ReasonPMID-38] - _ = x[ReasonPMIDDOIPair-39] - _ = x[ReasonPageCount-40] - _ = x[ReasonPreprintPublished-41] - _ = x[ReasonPublisherBlacklist-42] - _ = x[ReasonReleaseType-43] - _ = x[ReasonSharedDOIPrefix-44] - _ = x[ReasonShortTitle-45] - _ = x[ReasonSingularCluster-46] - _ = x[ReasonSlugTitleAuthorMatch-47] - _ = x[ReasonSubtitle-48] - _ = x[ReasonTitleArtifact-49] - _ = x[ReasonTitleAuthorMatch-50] - _ = x[ReasonTitleFilename-51] - _ = x[ReasonTokenizedAuthors-52] - _ = x[ReasonVersionedDOI-53] - _ = x[ReasonWorkID-54] - _ = x[ReasonYear-55] + _ = x[ReasonAppendix-7] + _ = x[ReasonArxiv-8] + _ = x[ReasonArxivVersion-9] + _ = x[ReasonBlacklisted-10] + _ = x[ReasonBlacklistedFragment-11] + _ = x[ReasonBookChapter-12] + _ = x[ReasonChemFormula-13] + _ = x[ReasonComponent-14] + _ = x[ReasonContainer-15] + _ = x[ReasonContainerNameBlacklist-16] + _ = x[ReasonContribIntersectionEmpty-17] + _ = x[ReasonCustomBSISubdoc-18] + _ = x[ReasonCustomBSIUndated-19] + _ = 
x[ReasonCustomIEEEArxiv-20] + _ = x[ReasonCustomIOPMAPattern-21] + _ = x[ReasonCustomPrefix1014288-22] + _ = x[ReasonCustomPrefix105860ChoiceReview-23] + _ = x[ReasonCustomPrefix107916-24] + _ = x[ReasonCustomVHS-25] + _ = x[ReasonDOI-26] + _ = x[ReasonDataciteRelatedID-27] + _ = x[ReasonDataciteVersion-28] + _ = x[ReasonDatasetDOI-29] + _ = x[ReasonFigshareVersion-30] + _ = x[ReasonISBN-31] + _ = x[ReasonJaccardAuthors-32] + _ = x[ReasonJstorID-33] + _ = x[ReasonMaxClusterSizeExceeded-34] + _ = x[ReasonNumDiff-35] + _ = x[ReasonPMCID-36] + _ = x[ReasonPMID-37] + _ = x[ReasonPMIDDOIPair-38] + _ = x[ReasonPageCount-39] + _ = x[ReasonPreprintPublished-40] + _ = x[ReasonPublisherBlacklist-41] + _ = x[ReasonReleaseType-42] + _ = x[ReasonSharedDOIPrefix-43] + _ = x[ReasonShortTitle-44] + _ = x[ReasonSingularCluster-45] + _ = x[ReasonSlugTitleAuthorMatch-46] + _ = x[ReasonSubtitle-47] + _ = x[ReasonTitleArtifact-48] + _ = x[ReasonTitleAuthorMatch-49] + _ = x[ReasonTitleFilename-50] + _ = x[ReasonTokenizedAuthors-51] + _ = x[ReasonURLMatch-52] + _ = x[ReasonUnknown-53] + _ = x[ReasonVersionedDOI-54] + _ = x[ReasonWorkID-55] + _ = x[ReasonYear-56] } -const _Reason_name = 
"ReasonUnknownReasonAppendixReasonArxivReasonArxivVersionReasonBlacklistedReasonBlacklistedFragmentReasonBookChapterReasonChemFormulaReasonComponentReasonContainerReasonContainerNameBlacklistReasonContribIntersectionEmptyReasonCustomBSISubdocReasonCustomBSIUndatedReasonCustomIEEEArxivReasonCustomIOPMAPatternReasonCustomPrefix1014288ReasonCustomPrefix105860ChoiceReviewReasonCustomPrefix107916ReasonCustomVHSReasonDOIReasonDataciteRelatedIDReasonDataciteVersionReasonDatasetDOIReasonFigshareVersionReasonISBNReasonJaccardAuthorsReasonJstorIDReasonMaxClusterSizeExceededReasonNumDiffReasonPMCIDReasonPMIDReasonPMIDDOIPairReasonPageCountReasonPreprintPublishedReasonPublisherBlacklistReasonReleaseTypeReasonSharedDOIPrefixReasonShortTitleReasonSingularClusterReasonSlugTitleAuthorMatchReasonSubtitleReasonTitleArtifactReasonTitleAuthorMatchReasonTitleFilenameReasonTokenizedAuthorsReasonVersionedDOIReasonWorkIDReasonYear" +const _Reason_name = "ReasonAppendixReasonArxivReasonArxivVersionReasonBlacklistedReasonBlacklistedFragmentReasonBookChapterReasonChemFormulaReasonComponentReasonContainerReasonContainerNameBlacklistReasonContribIntersectionEmptyReasonCustomBSISubdocReasonCustomBSIUndatedReasonCustomIEEEArxivReasonCustomIOPMAPatternReasonCustomPrefix1014288ReasonCustomPrefix105860ChoiceReviewReasonCustomPrefix107916ReasonCustomVHSReasonDOIReasonDataciteRelatedIDReasonDataciteVersionReasonDatasetDOIReasonFigshareVersionReasonISBNReasonJaccardAuthorsReasonJstorIDReasonMaxClusterSizeExceededReasonNumDiffReasonPMCIDReasonPMIDReasonPMIDDOIPairReasonPageCountReasonPreprintPublishedReasonPublisherBlacklistReasonReleaseTypeReasonSharedDOIPrefixReasonShortTitleReasonSingularClusterReasonSlugTitleAuthorMatchReasonSubtitleReasonTitleArtifactReasonTitleAuthorMatchReasonTitleFilenameReasonTokenizedAuthorsReasonURLMatchReasonUnknownReasonVersionedDOIReasonWorkIDReasonYear" -var _Reason_index = [...]uint16{0, 13, 27, 38, 56, 73, 98, 115, 132, 147, 162, 190, 220, 241, 263, 284, 308, 333, 369, 
393, 408, 417, 440, 461, 477, 498, 508, 528, 541, 569, 582, 593, 603, 620, 635, 658, 682, 699, 720, 736, 757, 783, 797, 816, 838, 857, 879, 897, 909, 919} +var _Reason_index = [...]uint16{0, 14, 25, 43, 60, 85, 102, 119, 134, 149, 177, 207, 228, 250, 271, 295, 320, 356, 380, 395, 404, 427, 448, 464, 485, 495, 515, 528, 556, 569, 580, 590, 607, 622, 645, 669, 686, 707, 723, 744, 770, 784, 803, 825, 844, 866, 880, 893, 911, 923, 933} func (i Reason) String() string { i -= 7 -- cgit v1.2.3