about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- skate/cmd/skate-resolve-journal-name/main.go 34
-rw-r--r-- skate/url_test.go 8
2 files changed, 25 insertions, 17 deletions
diff --git a/skate/cmd/skate-resolve-journal-name/main.go b/skate/cmd/skate-resolve-journal-name/main.go
index 0bd4866..8929d6c 100644
--- a/skate/cmd/skate-resolve-journal-name/main.go
+++ b/skate/cmd/skate-resolve-journal-name/main.go
@@ -32,7 +32,7 @@ var (
column = flag.Int("f", 2, "column to find the release schema document in (1-indexed)")
sep = flag.String("d", "\t", "delimiter to use")
bestEffort = flag.Bool("B", false, "only log errors, do not halt")
- abbrevFile = flag.String("A", "", "path to abbreviate file")
+ abbrevFile = flag.String("A", "", "path to abbreviations file")
keepResolvedOnly = flag.Bool("R", false, "keep only lines, where we could resolve an abbreviation")
)
@@ -102,13 +102,9 @@ func main() {
}
}
-type NameAbbrev struct {
- Name string `json:"name"`
- Abbrev string `json:"abbrev"`
-}
-
// createAbbreviationMatchSet creates a match set from JSONL abbreviations
-// file. It also returns a mapping from abbreviations to full journal names for that abbreviation.
+// file. It also returns a mapping from abbreviations to full journal names for
+// that abbreviation.
func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) {
f, err := os.Open(filename)
if err != nil {
@@ -116,11 +112,15 @@ func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]
}
defer f.Close()
var (
- sm = make(map[string]set.Set)
- m = make(map[string][]string)
- br = bufio.NewReader(f)
- na NameAbbrev
- abbreviations []string
+ sm = make(map[string]set.Set)
+ m = make(map[string][]string)
+ br = bufio.NewReader(f)
+ nameAbbrev struct {
+ Name string `json:"name"`
+ Abbrev string `json:"abbrev"`
+ }
+ abbrevs []string
+ key string
)
for {
b, err := br.ReadBytes('\n')
@@ -130,19 +130,19 @@ func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]
if err != nil {
return nil, nil, err
}
- if err := json.Unmarshal(b, &na); err != nil {
+ if err := json.Unmarshal(b, &nameAbbrev); err != nil {
return nil, nil, err
}
- key := strings.ToLower(na.Abbrev)
+ key = strings.ToLower(nameAbbrev.Abbrev)
if _, ok := sm[key]; !ok {
sm[key] = set.New()
}
- sm[key].Add(strings.ToLower(na.Name))
+ sm[key].Add(strings.ToLower(nameAbbrev.Name))
}
for k, v := range sm {
- abbreviations = append(abbreviations, k)
+ abbrevs = append(abbrevs, k)
m[k] = v.Slice()
}
- matchSet := skate.NewMatchSet(abbreviations)
+ matchSet := skate.NewMatchSet(abbrevs)
return matchSet, m, nil
}
diff --git a/skate/url_test.go b/skate/url_test.go
index 5e36cae..8103246 100644
--- a/skate/url_test.go
+++ b/skate/url_test.go
@@ -131,6 +131,14 @@ func TestSanitizeURL(t *testing.T) {
`http://hdl.handle.net/1765/1163`},
{`cdec.water.ca.gov/misc/DailyPrecip.html`,
`http://cdec.water.ca.gov/misc/DailyPrecip.html`},
+ {`https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=resultados.Accessed22`,
+ `https://www.ibge.gov.br/estatisticas/sociais/populacao/9103-estimativas-de-populacao.html?=&t=resultados`},
+ {`https://doi.org/10.1101/2020.06.23.167395doi:bioRxivpreprint`, // TODO: e.g. remove "doi:" or the like
+ `https://doi.org/10.1101/2020.06.23.167395doi:bioRxivpreprint`},
+ {`mail:claire.wyart@icm-institute.org,claire.wyart@inserm.frhttp://dx.doi.org/10.1016/j.cub.2015.01.006`,
+ `http://dx.doi.org/10.1016/j.cub.2015.01.006`},
+ {`http://www.nbcnews.com/technology/virtual-cockpit-what-it-takes-fly-drone-1C9319684.Acessoem:15/07/2013`,
+ `http://www.nbcnews.com/technology/virtual-cockpit-what-it-takes-fly-drone-1C9319684`},
}
for _, c := range cases {
out := SanitizeURL(c.in)