about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- skate/cmd/skate-wikipedia-doi/main.go 1
-rw-r--r-- skate/schema.go 34
-rw-r--r-- skate/unstructured.go 4
-rw-r--r-- skate/unstructured_test.go 53
4 files changed, 74 insertions, 18 deletions
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index d1a21e9..c4fdb1e 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -1,3 +1,4 @@
+// skate-wikipedia-doi extracts DOIs from a Wikipedia reference dataset.
package main
import (
diff --git a/skate/schema.go b/skate/schema.go
index a9570b7..9f3af45 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -112,24 +112,26 @@ func parseIsbn(s string) []string {
return valid.Slice()
}
+// Biblio holds the bibliographic fields of a reference. It is the type of
+// the Biblio field of Ref. Every field is tagged `omitempty`, so fields left
+// at their zero value are omitted from the JSON encoding.
+type Biblio struct {
+ ArxivId string `json:"arxiv_id,omitempty"`
+ ContainerName string `json:"container_name,omitempty"`
+ ContribRawNames []string `json:"contrib_raw_names,omitempty"`
+ DOI string `json:"doi,omitempty"`
+ Issue string `json:"issue,omitempty"`
+ PMCID string `json:"pmcid,omitempty"`
+ PMID string `json:"pmid,omitempty"`
+ Pages string `json:"pages,omitempty"`
+ Publisher string `json:"publisher,omitempty"`
+ Title string `json:"title,omitempty"`
+ Unstructured string `json:"unstructured,omitempty"`
+ Url string `json:"url,omitempty"`
+ Volume string `json:"volume,omitempty"`
+ Year int64 `json:"year,omitempty"`
+}
+
// Ref is a reference document, can be very partial.
type Ref struct {
- Biblio struct {
- ArxivId string `json:"arxiv_id,omitempty"`
- ContainerName string `json:"container_name,omitempty"`
- ContribRawNames []string `json:"contrib_raw_names,omitempty"`
- DOI string `json:"doi,omitempty"`
- Issue string `json:"issue,omitempty"`
- PMCID string `json:"pmcid,omitempty"`
- PMID string `json:"pmid,omitempty"`
- Pages string `json:"pages,omitempty"`
- Publisher string `json:"publisher,omitempty"`
- Title string `json:"title,omitempty"`
- Unstructured string `json:"unstructured,omitempty"`
- Url string `json:"url,omitempty"`
- Volume string `json:"volume,omitempty"`
- Year int64 `json:"year,omitempty"`
- } `json:"biblio"`
+ Biblio Biblio `json:"biblio"`
Index int64 `json:"index,omitempty"`
Key string `json:"key,omitempty"`
RefSource string `json:"ref_source,omitempty"`
diff --git a/skate/unstructured.go b/skate/unstructured.go
index 6a96bb0..082c685 100644
--- a/skate/unstructured.go
+++ b/skate/unstructured.go
@@ -8,8 +8,8 @@ import (
var (
PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
- PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
- PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+ PatArxivPDF = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+ PatArxivAbs = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
urlPrefixes = []string{
"http://doi.org/",
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
new file mode 100644
index 0000000..e6e9fbd
--- /dev/null
+++ b/skate/unstructured_test.go
@@ -0,0 +1,53 @@
+package skate
+
+import (
+ "reflect"
+ "testing"
+)
+
+// TestParseUnstructured is a table-driven test for ParseUnstructured. Each
+// case passes ref to ParseUnstructured, then compares the returned error with
+// err and the ref itself (the test expects it to be updated in place) with
+// result, using reflect.DeepEqual.
+func TestParseUnstructured(t *testing.T) {
+ var cases = []struct {
+ ref *Ref
+ result *Ref
+ err error
+ }{
+ // Only a DOI in the unstructured string; the expected DOI keeps the
+ // hyphenated "-BIB5" suffix.
+ {
+ &Ref{
+ Biblio: Biblio{
+ Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ &Ref{
+ Biblio: Biblio{
+ DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ nil,
+ },
+ // An https arXiv PDF link followed by a DOI; the expected ArxivId is
+ // the bare identifier, without the "v3" version or ".pdf" extension.
+ {
+ &Ref{
+ Biblio: Biblio{
+ Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ &Ref{
+ Biblio: Biblio{
+ ArxivId: "0808.3320",
+ DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ nil,
+ },
+ }
+ // t.Fatalf stops the test at the first failing case, so later cases are
+ // not run once one of them fails.
+ for _, c := range cases {
+ err := ParseUnstructured(c.ref)
+ if err != c.err {
+ t.Fatalf("got %v, want %v", err, c.err)
+ }
+ if !reflect.DeepEqual(c.ref, c.result) {
+ t.Fatalf("got %#v, want %#v", c.ref, c.result)
+ }
+ }
+}