diff options
-rw-r--r-- | skate/cmd/skate-resolve-journal-name/main.go | 34 |
-rw-r--r-- | skate/url_test.go | 8 |
2 files changed, 25 insertions, 17 deletions
diff --git a/skate/cmd/skate-resolve-journal-name/main.go b/skate/cmd/skate-resolve-journal-name/main.go index 0bd4866..8929d6c 100644 --- a/skate/cmd/skate-resolve-journal-name/main.go +++ b/skate/cmd/skate-resolve-journal-name/main.go @@ -32,7 +32,7 @@ var ( column = flag.Int("f", 2, "column to find the release schema document in (1-indexed)") sep = flag.String("d", "\t", "delimiter to use") bestEffort = flag.Bool("B", false, "only log errors, do not halt") - abbrevFile = flag.String("A", "", "path to abbreviate file") + abbrevFile = flag.String("A", "", "path to abbreviations file") keepResolvedOnly = flag.Bool("R", false, "keep only lines, where we could resolve an abbreviation") ) @@ -102,13 +102,9 @@ func main() { } } -type NameAbbrev struct { - Name string `json:"name"` - Abbrev string `json:"abbrev"` -} - // createAbbreviationMatchSet creates a match set from JSONL abbreviations -// file. It also returns a mapping from abbreviations to full journal names for that abbreviation. +// file. It also returns a mapping from abbreviations to full journal names for +// that abbreviation. 
func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) { f, err := os.Open(filename) if err != nil { @@ -116,11 +112,15 @@ func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][] } defer f.Close() var ( - sm = make(map[string]set.Set) - m = make(map[string][]string) - br = bufio.NewReader(f) - na NameAbbrev - abbreviations []string + sm = make(map[string]set.Set) + m = make(map[string][]string) + br = bufio.NewReader(f) + nameAbbrev struct { + Name string `json:"name"` + Abbrev string `json:"abbrev"` + } + abbrevs []string + key string ) for { b, err := br.ReadBytes('\n') @@ -130,19 +130,19 @@ func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][] if err != nil { return nil, nil, err } - if err := json.Unmarshal(b, &na); err != nil { + if err := json.Unmarshal(b, &nameAbbrev); err != nil { return nil, nil, err } - key := strings.ToLower(na.Abbrev) + key = strings.ToLower(nameAbbrev.Abbrev) if _, ok := sm[key]; !ok { sm[key] = set.New() } - sm[key].Add(strings.ToLower(na.Name)) + sm[key].Add(strings.ToLower(nameAbbrev.Name)) } for k, v := range sm { - abbreviations = append(abbreviations, k) + abbrevs = append(abbrevs, k) m[k] = v.Slice() } - matchSet := skate.NewMatchSet(abbreviations) + matchSet := skate.NewMatchSet(abbrevs) return matchSet, m, nil } diff --git a/skate/url_test.go b/skate/url_test.go index 5e36cae..8103246 100644 --- a/skate/url_test.go +++ b/skate/url_test.go @@ -131,6 +131,14 @@ func TestSanitizeURL(t *testing.T) { `http://hdl.handle.net/1765/1163`}, {`cdec.water.ca.gov/misc/DailyPrecip.html`, `http://cdec.water.ca.gov/misc/DailyPrecip.html`}, + {`https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=resultados.Accessed22`, + `https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=resultados`}, + {`https://doi.org/10.1101/2020.06.23.167395doi:bioRxivpreprint`, // TODO: e.g. 
remove "doi:" or the like + `https://doi.org/10.1101/2020.06.23.167395doi:bioRxivpreprint`}, + {`mail:claire.wyart@icm-institute.org,claire.wyart@inserm.frhttp://dx.doi.org/10.1016/j.cub.2015.01.006`, + `http://dx.doi.org/10.1016/j.cub.2015.01.006`}, + {`http://www.nbcnews.com/technology/virtual-cockpit-what-it-takes-fly-drone-1C9319684.Acessoem:15/07/2013`, + `http://www.nbcnews.com/technology/virtual-cockpit-what-it-takes-fly-drone-1C9319684`}, } for _, c := range cases { out := SanitizeURL(c.in) |