aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/reduce.go19
-rw-r--r--skate/schema.go7
2 files changed, 18 insertions, 8 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index 7d789f5..b28d976 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -155,9 +155,10 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
keyer = makeKeyFunc("\t", 1)
grouper = func(g *zipkey.Group) error {
var (
- target *Release
- wiki *MinimalCitations
- err error
+ target *Release
+ wiki *MinimalCitations
+ key, lang, encodedPage string
+ err error
)
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
@@ -176,16 +177,20 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
var bref BiblioRef
// We use lowercase base32 w/o padding of the original
// PageTitle as component for the id. XXX: ok for now?
- key := fmt.Sprintf("wikipedia_%s_%s",
- strings.ToLower(b32enc.EncodeToString([]byte(wiki.PageTitle))),
- target.Ident)
+ if wiki.Language == "" {
+ lang = "en"
+ } else {
+ lang = wiki.Language
+ }
+ encodedPage = strings.ToLower(b32enc.EncodeToString([]byte(lang + ":" + wiki.PageTitle)))
+ key = fmt.Sprintf("wikipedia_%s_%s", encodedPage, target.Ident)
if seen.Contains(key) {
continue
}
seen.Add(key)
bref.Key = key
// XXX: We currently only use "en" subset.
- bref.SourceWikipediaArticle = fmt.Sprintf("en:%s", wiki.PageTitle)
+ bref.SourceWikipediaArticle = fmt.Sprintf("%s:%s", lang, wiki.PageTitle)
bref.TargetReleaseIdent = target.Ident
bref.TargetWorkIdent = target.WorkID
bref.MatchProvenance = "wikipedia"
diff --git a/skate/schema.go b/skate/schema.go
index 0fd429f..dd088c2 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -501,12 +501,17 @@ func (rc *ReleaseCluster) OneNonRef() (*Release, error) {
return nil, fmt.Errorf("no reference/release found for cluster key: %v", rc.Key)
}
-// MinimalCitations variant from archive.org/details/wikipedia_citations_2020-07-14.
+// MinimalCitations variant from
+// archive.org/details/wikipedia_citations_2020-07-14. The field naming in
+// that dataset is inconsistent (e.g. "ID_list", "Title", "page_title").
type MinimalCitations struct {
IDList string `json:"ID_list"`
PageTitle string `json:"page_title"`
Title string `json:"Title"`
TypeOfCitation string `json:"type_of_citation"`
+ // We may have other languages in the future. If this is empty, "en" is
+ // used as the default.
+ Language string `json:"lang"`
}
// IDList with commonly used identifier from wikipedia citations.