diff options
-rw-r--r-- | skate/reduce.go | 19 | ||||
-rw-r--r-- | skate/schema.go | 7 |
2 files changed, 18 insertions, 8 deletions
diff --git a/skate/reduce.go b/skate/reduce.go index 7d789f5..b28d976 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -155,9 +155,10 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { var ( - target *Release - wiki *MinimalCitations - err error + target *Release + wiki *MinimalCitations + key, lang, encodedPage string + err error ) if len(g.G0) == 0 || len(g.G1) == 0 { return nil @@ -176,16 +177,20 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error var bref BiblioRef // We use lowercase base32 w/o padding of the original // PageTitle as component for the id. XXX: ok for now? - key := fmt.Sprintf("wikipedia_%s_%s", - strings.ToLower(b32enc.EncodeToString([]byte(wiki.PageTitle))), - target.Ident) + if wiki.Language == "" { + lang = "en" + } else { + lang = wiki.Language + } + encodedPage = strings.ToLower(b32enc.EncodeToString([]byte(lang + ":" + wiki.PageTitle))) + key = fmt.Sprintf("wikipedia_%s_%s", encodedPage, target.Ident) if seen.Contains(key) { continue } seen.Add(key) bref.Key = key // XXX: We currently only use "en" subset. - bref.SourceWikipediaArticle = fmt.Sprintf("en:%s", wiki.PageTitle) + bref.SourceWikipediaArticle = fmt.Sprintf("%s:%s", lang, wiki.PageTitle) bref.TargetReleaseIdent = target.Ident bref.TargetWorkIdent = target.WorkID bref.MatchProvenance = "wikipedia" diff --git a/skate/schema.go b/skate/schema.go index 0fd429f..dd088c2 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -501,12 +501,17 @@ func (rc *ReleaseCluster) OneNonRef() (*Release, error) { return nil, fmt.Errorf("no reference/release found for cluster key: %v", rc.Key) } -// MinimalCitations variant from archive.org/details/wikipedia_citations_2020-07-14. +// MinimalCitations variant from +// archive.org/details/wikipedia_citations_2020-07-14. Part of the naming was +// already inconsistent. 
type MinimalCitations struct { IDList string `json:"ID_list"` PageTitle string `json:"page_title"` Title string `json:"Title"` TypeOfCitation string `json:"type_of_citation"` + // We may have other languages in the future. If this is empty, "en" is + // used as the default. + Language string `json:"lang"` } // IDList with commonly used identifier from wikipedia citations. |