about summary refs log tree commit diff stats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r-- skate/reduce.go 60
-rw-r--r-- skate/verify.go 5
-rw-r--r-- skate/verify_string.go 103
3 files changed, 114 insertions, 54 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index df96076..8658ffe 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -359,6 +359,59 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
return zipper.Run()
}
+// ZippyWayback takes a (url, ref) reader and a (url, cdx) reader and
+// writes one bref document to w for each match. Only the first ref line
+// and the first cdx line of a group are used. The target URL becomes a
+// web.archive.org link if the cdx summary has rows and a non-empty Ok
+// value; otherwise it is the plain value of the summary's Line field.
+// Parse and encode errors are returned from the group function unchanged.
+func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
+	enc := json.NewEncoder(xio.NewSingleWriter(w))
+	// Lines from both readers are keyed via makeKeyFunc("\t", 1).
+	keyer := makeKeyFunc("\t", 1)
+	// emit turns the first ref and first cdx line of a group into a bref.
+	emit := func(g *zipkey.Group) error {
+		// We take a single item from each side; Cut(..., 2) presumably
+		// selects the JSON payload in the second column (verify in Cut).
+		ref, err := parseRef(Cut(g.G0[0], 2))
+		if err != nil {
+			return err
+		}
+		summary, err := parseCdxSummary(Cut(g.G1[0], 2))
+		if err != nil {
+			return err
+		}
+		// Source side of the bref comes from the ref document.
+		var doc BiblioRef
+		doc.SourceReleaseIdent = ref.ReleaseIdent
+		doc.SourceWorkIdent = ref.WorkIdent
+		doc.SourceReleaseStage = ref.ReleaseStage
+		doc.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
+		doc.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
+		doc.RefKey = ref.Key
+		// Default to the bare URL; it is only replaced by a wayback link
+		// if the summary has rows and a non-empty Ok value.
+		doc.TargetURL = summary.Line
+		if summary.NumRows != 0 && summary.Summary.Ok != "" {
+			// TODO: This would be better, if we only add a wayback
+			// link, if live web fails. For that we would need a full
+			// check of the URLs on the live web.
+			doc.TargetURL = fmt.Sprintf("https://web.archive.org/web/%s/%s",
+				summary.Summary.Ok, summary.Line)
+		}
+		// Every emitted document is marked as an exact URL match.
+		doc.MatchStatus = StatusExact.Short()
+		doc.MatchReason = ReasonURLMatch.Short()
+		return enc.Encode(doc)
+	}
+	// The batcher wraps emit; its GroupFunc is handed to the zipper and the
+	// batcher is closed when this function returns.
+	batcher := zipkey.NewBatcher(emit)
+	defer batcher.Close()
+	zipper := zipkey.New(refs, cdx, keyer, batcher.GroupFunc)
+	return zipper.Run()
+}
+
// ZippyBrefAugment takes all matched docs from bref and adds docs from raw
// refs, which have not been matched. It also gets rid of duplicate matches.
// Note: This operates on two streams: raw refs with about 2.5B (07/2021) and
@@ -661,7 +714,12 @@ func parseWiki(s string) (r *MinimalCitations, err error) {
return
}
-func parseBiblioref(s string) (r *BiblioRef, err error) {
+// parseBiblioRef unmarshals the JSON document in s into a BiblioRef. The
+// error from json.Unmarshal is returned as is. Note that a JSON null leaves
+// r nil without an error, so callers that dereference r rely on s being an
+// object.
+func parseBiblioRef(s string) (r *BiblioRef, err error) {
+ err = json.Unmarshal([]byte(s), &r)
+ return
+}
+
+// parseCdxSummary unmarshals the JSON document in s into a cdxSummary. The
+// error from json.Unmarshal is returned as is. Note that a JSON null leaves
+// r nil without an error, so callers that dereference r rely on s being an
+// object.
+func parseCdxSummary(s string) (r *cdxSummary, err error) {
err = json.Unmarshal([]byte(s), &r)
return
}
diff --git a/skate/verify.go b/skate/verify.go
index db3a925..5cb56bb 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -35,8 +35,7 @@ const (
StatusAmbiguous
StatusUnmatched
- ReasonUnknown Reason = iota
- ReasonAppendix
+ ReasonAppendix Reason = iota
ReasonArxiv
ReasonArxivVersion
ReasonBlacklisted
@@ -81,6 +80,8 @@ const (
ReasonTitleAuthorMatch
ReasonTitleFilename
ReasonTokenizedAuthors
+ ReasonURLMatch
+ ReasonUnknown
ReasonVersionedDOI
ReasonWorkID
ReasonYear
diff --git a/skate/verify_string.go b/skate/verify_string.go
index 13b1f82..4531a39 100644
--- a/skate/verify_string.go
+++ b/skate/verify_string.go
@@ -31,60 +31,61 @@ func _() {
// An "invalid array index" compiler error signifies that the constant values have changed.
// Re-run the stringer command to generate them again.
var x [1]struct{}
- _ = x[ReasonUnknown-7]
- _ = x[ReasonAppendix-8]
- _ = x[ReasonArxiv-9]
- _ = x[ReasonArxivVersion-10]
- _ = x[ReasonBlacklisted-11]
- _ = x[ReasonBlacklistedFragment-12]
- _ = x[ReasonBookChapter-13]
- _ = x[ReasonChemFormula-14]
- _ = x[ReasonComponent-15]
- _ = x[ReasonContainer-16]
- _ = x[ReasonContainerNameBlacklist-17]
- _ = x[ReasonContribIntersectionEmpty-18]
- _ = x[ReasonCustomBSISubdoc-19]
- _ = x[ReasonCustomBSIUndated-20]
- _ = x[ReasonCustomIEEEArxiv-21]
- _ = x[ReasonCustomIOPMAPattern-22]
- _ = x[ReasonCustomPrefix1014288-23]
- _ = x[ReasonCustomPrefix105860ChoiceReview-24]
- _ = x[ReasonCustomPrefix107916-25]
- _ = x[ReasonCustomVHS-26]
- _ = x[ReasonDOI-27]
- _ = x[ReasonDataciteRelatedID-28]
- _ = x[ReasonDataciteVersion-29]
- _ = x[ReasonDatasetDOI-30]
- _ = x[ReasonFigshareVersion-31]
- _ = x[ReasonISBN-32]
- _ = x[ReasonJaccardAuthors-33]
- _ = x[ReasonJstorID-34]
- _ = x[ReasonMaxClusterSizeExceeded-35]
- _ = x[ReasonNumDiff-36]
- _ = x[ReasonPMCID-37]
- _ = x[ReasonPMID-38]
- _ = x[ReasonPMIDDOIPair-39]
- _ = x[ReasonPageCount-40]
- _ = x[ReasonPreprintPublished-41]
- _ = x[ReasonPublisherBlacklist-42]
- _ = x[ReasonReleaseType-43]
- _ = x[ReasonSharedDOIPrefix-44]
- _ = x[ReasonShortTitle-45]
- _ = x[ReasonSingularCluster-46]
- _ = x[ReasonSlugTitleAuthorMatch-47]
- _ = x[ReasonSubtitle-48]
- _ = x[ReasonTitleArtifact-49]
- _ = x[ReasonTitleAuthorMatch-50]
- _ = x[ReasonTitleFilename-51]
- _ = x[ReasonTokenizedAuthors-52]
- _ = x[ReasonVersionedDOI-53]
- _ = x[ReasonWorkID-54]
- _ = x[ReasonYear-55]
+ _ = x[ReasonAppendix-7]
+ _ = x[ReasonArxiv-8]
+ _ = x[ReasonArxivVersion-9]
+ _ = x[ReasonBlacklisted-10]
+ _ = x[ReasonBlacklistedFragment-11]
+ _ = x[ReasonBookChapter-12]
+ _ = x[ReasonChemFormula-13]
+ _ = x[ReasonComponent-14]
+ _ = x[ReasonContainer-15]
+ _ = x[ReasonContainerNameBlacklist-16]
+ _ = x[ReasonContribIntersectionEmpty-17]
+ _ = x[ReasonCustomBSISubdoc-18]
+ _ = x[ReasonCustomBSIUndated-19]
+ _ = x[ReasonCustomIEEEArxiv-20]
+ _ = x[ReasonCustomIOPMAPattern-21]
+ _ = x[ReasonCustomPrefix1014288-22]
+ _ = x[ReasonCustomPrefix105860ChoiceReview-23]
+ _ = x[ReasonCustomPrefix107916-24]
+ _ = x[ReasonCustomVHS-25]
+ _ = x[ReasonDOI-26]
+ _ = x[ReasonDataciteRelatedID-27]
+ _ = x[ReasonDataciteVersion-28]
+ _ = x[ReasonDatasetDOI-29]
+ _ = x[ReasonFigshareVersion-30]
+ _ = x[ReasonISBN-31]
+ _ = x[ReasonJaccardAuthors-32]
+ _ = x[ReasonJstorID-33]
+ _ = x[ReasonMaxClusterSizeExceeded-34]
+ _ = x[ReasonNumDiff-35]
+ _ = x[ReasonPMCID-36]
+ _ = x[ReasonPMID-37]
+ _ = x[ReasonPMIDDOIPair-38]
+ _ = x[ReasonPageCount-39]
+ _ = x[ReasonPreprintPublished-40]
+ _ = x[ReasonPublisherBlacklist-41]
+ _ = x[ReasonReleaseType-42]
+ _ = x[ReasonSharedDOIPrefix-43]
+ _ = x[ReasonShortTitle-44]
+ _ = x[ReasonSingularCluster-45]
+ _ = x[ReasonSlugTitleAuthorMatch-46]
+ _ = x[ReasonSubtitle-47]
+ _ = x[ReasonTitleArtifact-48]
+ _ = x[ReasonTitleAuthorMatch-49]
+ _ = x[ReasonTitleFilename-50]
+ _ = x[ReasonTokenizedAuthors-51]
+ _ = x[ReasonURLMatch-52]
+ _ = x[ReasonUnknown-53]
+ _ = x[ReasonVersionedDOI-54]
+ _ = x[ReasonWorkID-55]
+ _ = x[ReasonYear-56]
}
-const _Reason_name = "ReasonUnknownReasonAppendixReasonArxivReasonArxivVersionReasonBlacklistedReasonBlacklistedFragmentReasonBookChapterReasonChemFormulaReasonComponentReasonContainerReasonContainerNameBlacklistReasonContribIntersectionEmptyReasonCustomBSISubdocReasonCustomBSIUndatedReasonCustomIEEEArxivReasonCustomIOPMAPatternReasonCustomPrefix1014288ReasonCustomPrefix105860ChoiceReviewReasonCustomPrefix107916ReasonCustomVHSReasonDOIReasonDataciteRelatedIDReasonDataciteVersionReasonDatasetDOIReasonFigshareVersionReasonISBNReasonJaccardAuthorsReasonJstorIDReasonMaxClusterSizeExceededReasonNumDiffReasonPMCIDReasonPMIDReasonPMIDDOIPairReasonPageCountReasonPreprintPublishedReasonPublisherBlacklistReasonReleaseTypeReasonSharedDOIPrefixReasonShortTitleReasonSingularClusterReasonSlugTitleAuthorMatchReasonSubtitleReasonTitleArtifactReasonTitleAuthorMatchReasonTitleFilenameReasonTokenizedAuthorsReasonVersionedDOIReasonWorkIDReasonYear"
+const _Reason_name = "ReasonAppendixReasonArxivReasonArxivVersionReasonBlacklistedReasonBlacklistedFragmentReasonBookChapterReasonChemFormulaReasonComponentReasonContainerReasonContainerNameBlacklistReasonContribIntersectionEmptyReasonCustomBSISubdocReasonCustomBSIUndatedReasonCustomIEEEArxivReasonCustomIOPMAPatternReasonCustomPrefix1014288ReasonCustomPrefix105860ChoiceReviewReasonCustomPrefix107916ReasonCustomVHSReasonDOIReasonDataciteRelatedIDReasonDataciteVersionReasonDatasetDOIReasonFigshareVersionReasonISBNReasonJaccardAuthorsReasonJstorIDReasonMaxClusterSizeExceededReasonNumDiffReasonPMCIDReasonPMIDReasonPMIDDOIPairReasonPageCountReasonPreprintPublishedReasonPublisherBlacklistReasonReleaseTypeReasonSharedDOIPrefixReasonShortTitleReasonSingularClusterReasonSlugTitleAuthorMatchReasonSubtitleReasonTitleArtifactReasonTitleAuthorMatchReasonTitleFilenameReasonTokenizedAuthorsReasonURLMatchReasonUnknownReasonVersionedDOIReasonWorkIDReasonYear"
-var _Reason_index = [...]uint16{0, 13, 27, 38, 56, 73, 98, 115, 132, 147, 162, 190, 220, 241, 263, 284, 308, 333, 369, 393, 408, 417, 440, 461, 477, 498, 508, 528, 541, 569, 582, 593, 603, 620, 635, 658, 682, 699, 720, 736, 757, 783, 797, 816, 838, 857, 879, 897, 909, 919}
+var _Reason_index = [...]uint16{0, 14, 25, 43, 60, 85, 102, 119, 134, 149, 177, 207, 228, 250, 271, 295, 320, 356, 380, 395, 404, 427, 448, 464, 485, 495, 515, 528, 556, 569, 580, 590, 607, 622, 645, 669, 686, 707, 723, 744, 770, 784, 803, 825, 844, 866, 880, 893, 911, 923, 933}
func (i Reason) String() string {
i -= 7