From 13f89091ed93c5166e0fd969665e3e9f2c909ca9 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 5 May 2021 00:20:03 +0200 Subject: add test for ParseUnstructured --- skate/cmd/skate-wikipedia-doi/main.go | 1 + skate/schema.go | 34 +++++++++++----------- skate/unstructured.go | 4 +-- skate/unstructured_test.go | 53 +++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 18 deletions(-) create mode 100644 skate/unstructured_test.go diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go index d1a21e9..c4fdb1e 100644 --- a/skate/cmd/skate-wikipedia-doi/main.go +++ b/skate/cmd/skate-wikipedia-doi/main.go @@ -1,3 +1,4 @@ +// skate-wikipedia-doi extracts DOI from wikipedia reference dataset. package main import ( diff --git a/skate/schema.go b/skate/schema.go index a9570b7..9f3af45 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -112,24 +112,26 @@ func parseIsbn(s string) []string { return valid.Slice() } +type Biblio struct { + ArxivId string `json:"arxiv_id,omitempty"` + ContainerName string `json:"container_name,omitempty"` + ContribRawNames []string `json:"contrib_raw_names,omitempty"` + DOI string `json:"doi,omitempty"` + Issue string `json:"issue,omitempty"` + PMCID string `json:"pmcid,omitempty"` + PMID string `json:"pmid,omitempty"` + Pages string `json:"pages,omitempty"` + Publisher string `json:"publisher,omitempty"` + Title string `json:"title,omitempty"` + Unstructured string `json:"unstructured,omitempty"` + Url string `json:"url,omitempty"` + Volume string `json:"volume,omitempty"` + Year int64 `json:"year,omitempty"` +} + // Ref is a reference document, can be very partial. type Ref struct { - Biblio struct { - ArxivId string `json:"arxiv_id,omitempty"` - ContainerName string `json:"container_name,omitempty"` - ContribRawNames []string `json:"contrib_raw_names,omitempty"` - DOI string `json:"doi,omitempty"` - Issue string `json:"issue,omitempty"` - PMCID string `json:"pmcid,omitempty"` - PMID string `json:"pmid,omitempty"` - Pages string `json:"pages,omitempty"` - Publisher string `json:"publisher,omitempty"` - Title string `json:"title,omitempty"` - Unstructured string `json:"unstructured,omitempty"` - Url string `json:"url,omitempty"` - Volume string `json:"volume,omitempty"` - Year int64 `json:"year,omitempty"` - } `json:"biblio"` + Biblio Biblio `json:"biblio"` Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` RefSource string `json:"ref_source,omitempty"` diff --git a/skate/unstructured.go b/skate/unstructured.go index 6a96bb0..082c685 100644 --- a/skate/unstructured.go +++ b/skate/unstructured.go @@ -8,8 +8,8 @@ import ( var ( PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) - PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) - PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivPDF = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivAbs = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) urlPrefixes = []string{ "http://doi.org/", diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go new file mode 100644 index 0000000..e6e9fbd --- /dev/null +++ b/skate/unstructured_test.go @@ -0,0 +1,53 @@ +package skate + +import ( + "reflect" + "testing" +) + +func TestParseUnstructured(t *testing.T) { + var cases = []struct { + ref *Ref + result *Ref + err error + }{ + { + &Ref{ + Biblio: Biblio{ + Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + &Ref{ + Biblio: Biblio{ + DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + nil, + }, + { + &Ref{ + Biblio: Biblio{ + Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + &Ref{ + Biblio: Biblio{ + ArxivId: "0808.3320", + DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + nil, + }, + } + for _, c := range cases { + err := ParseUnstructured(c.ref) + if err != c.err { + t.Fatalf("got %v, want %v", err, c.err) + } + if !reflect.DeepEqual(c.ref, c.result) { + t.Fatalf("got %#v, want %#v", c.ref, c.result) + } + } +} -- cgit v1.2.3