From 3d61eac8c023a7f9509e0371baef40c00b0132f2 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 30 Apr 2021 03:38:16 +0200 Subject: update docs --- skate/cmd/skate-map/main.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'skate/cmd') diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index ee02875..2517878 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -21,8 +21,8 @@ // be skipped, if we limit number of splits) // (3) we pass the data to jq, with a bit larger buffer (default is 1MB) // (4) we want no "null" output -// (5) tostring prints input as string, because we need to carry the document forward -// (6) but we need some cleanup, too +// (5) tostring prints the input as string, because we need to carry the document forward ... +// (6) ... but we'll need some cleanup, too // (7) we normalize the DOI to lowercase // (8) a custom filter to normalize a DOI in a specific column // (9) sorting by DOI -- cgit v1.2.3 From 77ca4cd924993188e0e9f8dd072af9f173eaad91 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 30 Apr 2021 22:43:07 +0200 Subject: rename skate-ref-to-release to skate-conv --- python/refcat/tasks.py | 2 +- skate/Makefile | 2 +- skate/README.md | 5 +- skate/cmd/skate-conv/main.go | 99 ++++++++++++++++++++++++++++++++++ skate/cmd/skate-ref-to-release/main.go | 81 ---------------------------- 5 files changed, 104 insertions(+), 85 deletions(-) create mode 100644 skate/cmd/skate-conv/main.go delete mode 100644 skate/cmd/skate-ref-to-release/main.go (limited to 'skate/cmd') diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index df2245f..bb2685d 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -965,7 +965,7 @@ class RefsToRelease(Refcat): def run(self): output = shellout(""" zstdcat -T0 {input} | - skate-ref-to-release -w 24 -b 100000 | + skate-conv -f ref -w 24 -b 100000 | zstd -T0 -c > {output} """, input=self.input().path) diff --git a/skate/Makefile b/skate/Makefile index 9bc70c2..255bc28 100644 --- a/skate/Makefile +++ b/skate/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map +TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map PKGNAME := skate .PHONY: test diff --git a/skate/README.md b/skate/README.md index 7effb89..d3a361c 100644 --- a/skate/README.md +++ b/skate/README.md @@ -78,9 +78,10 @@ Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ... > Takes a refs file and plucks out identifiers from unstructured field. -* skate-ref-to-release +* skate-conv -> Converts a ref document to a release. Part of first run, merging refs and releases. +> Converts a ref (or open library) document to a release. Part of first step, +> merging refs and releases. * skate-to-doi diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go new file mode 100644 index 0000000..647472e --- /dev/null +++ b/skate/cmd/skate-conv/main.go @@ -0,0 +1,99 @@ +// skate-conv converts various schemas into releases. This should replace the +// very specific skate-ref-to-release and the like. +// +// $ skate-conv -f ref < FILE > FILE +// +// Currently source schemas: "ref", "ol", "rg" +package main + +import ( + "flag" + "log" + "os" + "runtime" + "strings" + + "git.archive.org/martin/cgraph/skate" + "git.archive.org/martin/cgraph/skate/parallel" + json "github.com/segmentio/encoding/json" +) + +var ( + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + fromFormat = flag.String("f", "ref", "import schema") + + bytesNewline = []byte("\n") + f func([]byte) ([]byte, error) +) + +func main() { + flag.Parse() + switch *fromFormat { + case "ref": + f = refToRelease + case "rg": + f = rgSitemapToRelease + case "ol": + f = openLibraryToRelease + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} + +// refToRelease converts a ref document to a release. +func refToRelease(p []byte) ([]byte, error) { + var ref skate.Ref + if err := json.Unmarshal(p, &ref); err != nil { + return nil, err + } + release, err := skate.RefToRelease(&ref) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ref" // means: converted from ref + release.Extra.Skate.Ref.Index = ref.Index + release.Extra.Skate.Ref.Key = ref.Key + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} + +func rgSitemapToRelease(p []byte) ([]byte, error) { + var ( + s skate.Sitemap + release skate.Release + ) + if err := json.Unmarshal(p, &s); err != nil { + return nil, err + } + release.Title = s.Title + if len(s.URL) > 41 { + // XXX: A pseudo ident, maybe irritating. + release.Ident = strings.Split(s.URL[41:], "_")[0] + } + release.Extra.Skate.Status = "rg" + release.Extra.Skate.ResearchGate.URL = s.URL + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} + +func openLibraryToRelease(p []byte) ([]byte, error) { + var w skate.OpenLibraryWork + if err := json.Unmarshal(p, &w); err != nil { + return nil, err + } + release, err := skate.OpenLibraryToRelease(&w) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ol" + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-ref-to-release/main.go deleted file mode 100644 index d547e62..0000000 --- a/skate/cmd/skate-ref-to-release/main.go +++ /dev/null @@ -1,81 +0,0 @@ -// skate-ref-to-release converts a "ref" document to a "release" document. -// -package main - -import ( - "flag" - "log" - "os" - "runtime" - "strings" - - "git.archive.org/martin/cgraph/skate" - "github.com/miku/parallel" - - json "github.com/segmentio/encoding/json" -) - -var ( - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 100000, "batch size") - fromFormat = flag.String("f", "ref", "import data shape") - - bytesNewline = []byte("\n") -) - -func refToRelease(p []byte) ([]byte, error) { - var ref skate.Ref - if err := json.Unmarshal(p, &ref); err != nil { - return nil, err - } - release, err := skate.RefToRelease(&ref) - if err != nil { - return nil, err - } - release.Extra.Skate.Status = "ref" // means: converted from ref - release.Extra.Skate.Ref.Index = ref.Index - release.Extra.Skate.Ref.Key = ref.Key - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err -} - -func rgSitemapToRelease(p []byte) ([]byte, error) { - var ( - s skate.Sitemap - release skate.Release - ) - if err := json.Unmarshal(p, &s); err != nil { - return nil, err - } - release.Title = s.Title - if len(s.URL) > 41 { - // XXX: A pseudo ident, maybe irritating. - release.Ident = strings.Split(s.URL[41:], "_")[0] - } - release.Extra.Skate.Status = "rg" - release.Extra.Skate.ResearchGate.URL = s.URL - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err -} - -func main() { - flag.Parse() - switch *fromFormat { - case "ref": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - case "rg": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - } -} -- cgit v1.2.3 From 45eed4462d234f8502e38b0e98e205e341188072 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 30 Apr 2021 23:23:32 +0200 Subject: implement a few flags as mapper middleware --- skate/cmd/skate-map/main.go | 40 ++++++++++++++++++++++++++++++---------- skate/map.go | 25 +++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 10 deletions(-) (limited to 'skate/cmd') diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index 2517878..67fc62b 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -2,6 +2,10 @@ // extract a key from a json document. For simple cases, you can use `jq` and // other tools. Some key derivations require a bit more. // +// This tool helps us to find similar things in billions of items by mapping +// docs to key. All docs that share a key are considered match candidates and can be +// post-processed, e.g. to verify matches or to generate output schemas. +// // An example with mostly unix tools. We want to extract the DOI and sort by // it; we also want to do this fast, hence parallel, LC_ALL, etc. // @@ -29,7 +33,6 @@ // // This is reasonably fast, but some cleanup is ugly. We also want more complex // keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8). - package main import ( @@ -45,12 +48,15 @@ import ( ) var ( - mapperName = flag.String("m", "", "mapper to run") - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 50000, "batch size") - verbose = flag.Bool("verbose", false, "show progress") - keyPrefix = flag.String("p", "", "a key prefix to use") - extraValue = flag.String("x", "", "extra value to pass to configurable mappers") + mapperName = flag.String("m", "", "mapper to run") + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 50000, "batch size") + verbose = flag.Bool("verbose", false, "show progress") + keyPrefix = flag.String("p", "", "a key prefix to use") + extraValue = flag.String("x", "", "extra value to pass to configurable mappers") + bestEffort = flag.Bool("B", false, "best effort") + logFile = flag.String("log", "", "log filename") + skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given field, zero indexed") ) func main() { @@ -67,15 +73,29 @@ func main() { "ty": skate.MapperTitleNysiis, "ts": skate.MapperTitleSandcrawler, } + if *logFile != "" { + f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + log.Fatal(err) + } + defer f.Close() + log.SetOutput(f) + } switch { case *mapperName != "": - if f, ok := availableMappers[*mapperName]; !ok { + if mapf, ok := availableMappers[*mapperName]; !ok { log.Fatalf("unknown mapper name: %v", *mapperName) } else { + if *skipOnEmpty >= 0 { + mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty) + } if *keyPrefix != "" { - f = skate.WithPrefix(f, *keyPrefix) + mapf = skate.WithPrefix(mapf, *keyPrefix) + } + if *bestEffort { + mapf = skate.WithBestEffort(mapf) } - pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV) + pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize pp.Verbose = *verbose diff --git a/skate/map.go b/skate/map.go index 90d8c05..d6e37be 100644 --- a/skate/map.go +++ b/skate/map.go @@ -78,6 +78,31 @@ func WithPrefix(f Mapper, prefix string) Mapper { } } +// WithBestEffort will not fail on an error. +func WithBestEffort(f Mapper) Mapper { + return func(p []byte) ([][]byte, error) { + if fields, err := f(p); err != nil { + return nil, nil + } else { + return fields, err + } + } +} + +// WithSkipOnEmpty ignores results where the value at a given field is empty. +func WithSkipOnEmpty(f Mapper, index int) Mapper { + return func(p []byte) ([][]byte, error) { + fields, err := f(p) + if err != nil { + return nil, err + } + if index < len(fields) && len(fields[index]) == 0 { + return nil, nil + } + return fields, err + } +} + // NameOf returns name of value, e.g. the name of a function. func NameOf(f interface{}) string { v := reflect.ValueOf(f) -- cgit v1.2.3 From 55647ea29aff9a942816e7d858c37d7e37e598da Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sat, 1 May 2021 01:19:49 +0200 Subject: update docs --- skate/cmd/skate-dot/main.go | 4 ++-- skate/zipkey/zipkey.go | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'skate/cmd') diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go index 3ef99d5..5c11975 100644 --- a/skate/cmd/skate-dot/main.go +++ b/skate/cmd/skate-dot/main.go @@ -1,5 +1,5 @@ -// skate-dot generates dot files from inbound and outbound citation links. Just -// a demo, replacement for a couple python scripts. +// [wip] skate-dot generates dot files from inbound and outbound citation +// links. Just a demo, replacement for a couple python scripts. package main import ( diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go index 9394734..eb3dc55 100644 --- a/skate/zipkey/zipkey.go +++ b/skate/zipkey/zipkey.go @@ -1,3 +1,5 @@ +// Package zipkey implements ZipRun, a type that allows to attach a callback to +// a group of elements taken from two streams. package zipkey import ( -- cgit v1.2.3 From 3a43e67238f5acc96a36265f78b70425d078d579 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 4 May 2021 22:48:47 +0200 Subject: update docs --- skate/README.md | 2 +- skate/cmd/skate-map/main.go | 16 +++++++--------- 2 files changed, 8 insertions(+), 10 deletions(-) (limited to 'skate/cmd') diff --git a/skate/README.md b/skate/README.md index 8e2d7d1..68a3f64 100644 --- a/skate/README.md +++ b/skate/README.md @@ -18,7 +18,7 @@ project for performance (and we saw a 25x speedup for certain tasks). ## Core Utils -* `skate-derive-key`, `skate-map` +* `skate-derive-key`, will be: `skate-map` * `skate-cluster` * `skate-verify-*` diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index 67fc62b..d5f22fd 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -1,13 +1,10 @@ -// skate-map runs a given map function over input data. We mostly want to +// skate-map runs a given "map" function over input data. Here, we mostly want to // extract a key from a json document. For simple cases, you can use `jq` and -// other tools. Some key derivations require a bit more. +// other tools. Some key derivations require a bit more, hence a dedicated program. // -// This tool helps us to find similar things in billions of items by mapping -// docs to key. All docs that share a key are considered match candidates and can be -// post-processed, e.g. to verify matches or to generate output schemas. -// -// An example with mostly unix tools. We want to extract the DOI and sort by -// it; we also want to do this fast, hence parallel, LC_ALL, etc. +// An example with mostly unix tools. We want to extract the DOI from newline +// delimited JSON and sort by it; we also want to do this fast, hence parallel, +// LC_ALL, etc. // // $ zstdcat -T0 file.zst | (1) // LC_ALL=C tr -d '\t' | (2) * @@ -32,7 +29,8 @@ // (9) sorting by DOI // // This is reasonably fast, but some cleanup is ugly. We also want more complex -// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8). +// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2) +// to (8) with `skate-map`. package main import ( -- cgit v1.2.3 From 6462e64ce8e61f54e1c3b1247c2039a2eddd5875 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 4 May 2021 23:18:28 +0200 Subject: skate-map: a bit more help output --- skate/cmd/skate-map/main.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'skate/cmd') diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index d5f22fd..227acf2 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -54,16 +54,18 @@ var ( extraValue = flag.String("x", "", "extra value to pass to configurable mappers") bestEffort = flag.Bool("B", false, "best effort") logFile = flag.String("log", "", "log filename") - skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given field, zero indexed") + skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given column (zero indexed)") + + help = `skate-map available mappers + + $ skate-map -m ts < file.ndj > file.tsv + ` ) func main() { flag.Parse() - // TODO - // [ ] add prefixes and a way to derive multiple keys in one go - // [ ] how to store multiple keys, sorted? - // [ ] maybe wrap jq and parallel for arbitrary nested keys availableMappers := map[string]skate.Mapper{ + // Add new mapper functions here. "id": skate.Identity, "ff": skate.CreateFixedMapper(*extraValue), "ti": skate.MapperTitle, @@ -102,8 +104,7 @@ func main() { } } default: - fmt.Println("skate-map available mappers") - fmt.Println() + fmt.Println(help) w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0) for k, v := range availableMappers { fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v)) -- cgit v1.2.3 From a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 4 May 2021 23:59:53 +0200 Subject: tweaks; move parsing out of command --- skate/cmd/skate-cluster/main.go | 26 ++++++------ skate/cmd/skate-from-unstructured/main.go | 61 +--------------------------- skate/unstructured.go | 66 +++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 72 deletions(-) create mode 100644 skate/unstructured.go (limited to 'skate/cmd') diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go index 754eab8..de11de1 100644 --- a/skate/cmd/skate-cluster/main.go +++ b/skate/cmd/skate-cluster/main.go @@ -1,5 +1,5 @@ -// skate-cluster takes the (tab) output of skate-sorted-keys and generates a -// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. +// skate-cluster takes the (tab) output of skate-map (plus sort) and generates +// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. // require refs and release docs in a single cluster). // // For example, this: @@ -44,10 +44,12 @@ func main() { batch, fields []string keyIndex = *keyField - 1 docIndex = *docField - 1 + line string + err error ) defer bw.Flush() for { - line, err := br.ReadString('\n') + line, err = br.ReadString('\n') if err == io.EOF { break } @@ -79,16 +81,16 @@ func main() { // containsBoth return true, if we have a ref and a non-ref item in the batch. func containsBoth(batch []string) bool { - var isRef int + var numRef int for _, doc := range batch { - // This is brittle. Most JSON should be in compact form, and there the - // following chars are by convention added to distinguish a release - // coming from a reference doc from other releases. + // This is brittle (but faster). Most JSON should be in compact form, + // and there the following chars are by convention added to distinguish + // a release coming from a reference doc from other releases. if strings.Contains(doc, `"status":"ref"`) { - isRef++ + numRef++ } } - return isRef > 0 && isRef < len(batch) + return numRef > 0 && numRef < len(batch) } // writeBatch writes out a single line containing the key and the cluster values. @@ -102,9 +104,9 @@ func writeBatch(w io.Writer, key string, batch []string) (err error) { if *requireBoth && !containsBoth(batch) { return nil } - // This is brittle, but all items in a batch are valid JSON objects, hence, - // the following will be valid JSON as well, or will it? The key should not - // contain a quote. + // This is brittle (and fast), but all items in a batch are valid JSON + // objects, hence, the following will be valid JSON as well, or will it? + // The key should not contain a quote. _, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ",")) return } diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index c2015e2..179057d 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -6,9 +6,7 @@ import ( "flag" "log" "os" - "regexp" "runtime" - "strings" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" @@ -19,11 +17,6 @@ var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") bytesNewline = []byte("\n") - - PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) - PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) - PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) - PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) ) func main() { @@ -32,7 +25,7 @@ func main() { if err := json.Unmarshal(p, &ref); err != nil { return nil, err } - if err := parseUnstructured(&ref); err != nil { + if err := skate.ParseUnstructured(&ref); err != nil { return nil, err } return skate.JsonMarshalLine(&ref) @@ -43,55 +36,3 @@ func main() { log.Fatal(err) } } - -// parseUnstructured will in-place augment missing DOI, arxiv id and so on. -func parseUnstructured(ref *skate.Ref) error { - uns := ref.Biblio.Unstructured - var ( - v string - vs []string - ) - // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5, - // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ... - if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" { - parts := strings.Split(strings.ToLower(ref.Key), "-bib") - ref.Biblio.DOI = parts[0] - } - // DOI - v = PatDOI.FindString(uns) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // DOI in Key - v = PatDOINoHyphen.FindString(ref.Key) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // DOI in URL - prefixes := []string{ - "http://doi.org/", - "https://doi.org/", - "http://dx.doi.org/", - "https://dx.doi.org/", - } - for _, prefix := range prefixes { - if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { - ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) - } - } - v = PatDOINoHyphen.FindString(ref.Key) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // Arxiv - vs = PatArxivPDF.FindStringSubmatch(uns) - if len(vs) != 0 && ref.Biblio.ArxivId == "" { - ref.Biblio.ArxivId = vs[1] - } else { - vs = PatArxivAbs.FindStringSubmatch(uns) - if len(vs) != 0 && ref.Biblio.ArxivId == "" { - ref.Biblio.ArxivId = vs[1] - } - } - return nil -} diff --git a/skate/unstructured.go b/skate/unstructured.go new file mode 100644 index 0000000..6a96bb0 --- /dev/null +++ b/skate/unstructured.go @@ -0,0 +1,66 @@ +package skate + +import ( + "regexp" + "strings" +) + +var ( + PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) + PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) + PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + + urlPrefixes = []string{ + "http://doi.org/", + "https://doi.org/", + "http://dx.doi.org/", + "https://dx.doi.org/", + } +) + +// ParseUnstructured will in-place augment missing DOI, arxiv id and so on. +func ParseUnstructured(ref *Ref) error { + var ( + uns = ref.Biblio.Unstructured + v string + vs []string + ) + // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5, + // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ... + if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" { + parts := strings.Split(strings.ToLower(ref.Key), "-bib") + ref.Biblio.DOI = parts[0] + } + // DOI + v = PatDOI.FindString(uns) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // DOI in Key + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // DOI in URL + for _, prefix := range urlPrefixes { + if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { + ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) + } + } + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // Arxiv + vs = PatArxivPDF.FindStringSubmatch(uns) + if len(vs) != 0 && ref.Biblio.ArxivId == "" { + ref.Biblio.ArxivId = vs[1] + } else { + vs = PatArxivAbs.FindStringSubmatch(uns) + if len(vs) != 0 && ref.Biblio.ArxivId == "" { + ref.Biblio.ArxivId = vs[1] + } + } + return nil +} -- cgit v1.2.3 From 2f584059a7ec85ac1977e90f5ffeae251f956eeb Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 5 May 2021 00:00:49 +0200 Subject: remove stub file --- skate/cmd/skate-bref-unmatched/main.go | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 skate/cmd/skate-bref-unmatched/main.go (limited to 'skate/cmd') diff --git a/skate/cmd/skate-bref-unmatched/main.go b/skate/cmd/skate-bref-unmatched/main.go deleted file mode 100644 index d8cb34f..0000000 --- a/skate/cmd/skate-bref-unmatched/main.go +++ /dev/null @@ -1,10 +0,0 @@ -// skate-bref-unmatched takes a bref TSV sorted by source_release_ident and a -// refs file sorted by release_ident and exports a bref file that will include -// unmatched references as well. -package main - -import "log" - -func main() { - log.Println("skate-bref-unmatched") -} -- cgit v1.2.3 From 13f89091ed93c5166e0fd969665e3e9f2c909ca9 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 5 May 2021 00:20:03 +0200 Subject: add test for ParseUnstructured --- skate/cmd/skate-wikipedia-doi/main.go | 1 + skate/schema.go | 34 +++++++++++----------- skate/unstructured.go | 4 +-- skate/unstructured_test.go | 53 +++++++++++++++++++++++++++++++++++ 4 files changed, 74 insertions(+), 18 deletions(-) create mode 100644 skate/unstructured_test.go (limited to 'skate/cmd') diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go index d1a21e9..c4fdb1e 100644 --- a/skate/cmd/skate-wikipedia-doi/main.go +++ b/skate/cmd/skate-wikipedia-doi/main.go @@ -1,3 +1,4 @@ +// skate-wikipedia-doi extracts DOI from wikipedia reference dataset. package main import ( diff --git a/skate/schema.go b/skate/schema.go index a9570b7..9f3af45 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -112,24 +112,26 @@ func parseIsbn(s string) []string { return valid.Slice() } +type Biblio struct { + ArxivId string `json:"arxiv_id,omitempty"` + ContainerName string `json:"container_name,omitempty"` + ContribRawNames []string `json:"contrib_raw_names,omitempty"` + DOI string `json:"doi,omitempty"` + Issue string `json:"issue,omitempty"` + PMCID string `json:"pmcid,omitempty"` + PMID string `json:"pmid,omitempty"` + Pages string `json:"pages,omitempty"` + Publisher string `json:"publisher,omitempty"` + Title string `json:"title,omitempty"` + Unstructured string `json:"unstructured,omitempty"` + Url string `json:"url,omitempty"` + Volume string `json:"volume,omitempty"` + Year int64 `json:"year,omitempty"` +} + // Ref is a reference document, can be very partial. type Ref struct { - Biblio struct { - ArxivId string `json:"arxiv_id,omitempty"` - ContainerName string `json:"container_name,omitempty"` - ContribRawNames []string `json:"contrib_raw_names,omitempty"` - DOI string `json:"doi,omitempty"` - Issue string `json:"issue,omitempty"` - PMCID string `json:"pmcid,omitempty"` - PMID string `json:"pmid,omitempty"` - Pages string `json:"pages,omitempty"` - Publisher string `json:"publisher,omitempty"` - Title string `json:"title,omitempty"` - Unstructured string `json:"unstructured,omitempty"` - Url string `json:"url,omitempty"` - Volume string `json:"volume,omitempty"` - Year int64 `json:"year,omitempty"` - } `json:"biblio"` + Biblio Biblio `json:"biblio"` Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` RefSource string `json:"ref_source,omitempty"` diff --git a/skate/unstructured.go b/skate/unstructured.go index 6a96bb0..082c685 100644 --- a/skate/unstructured.go +++ b/skate/unstructured.go @@ -8,8 +8,8 @@ import ( var ( PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) - PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) - PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivPDF = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivAbs = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) urlPrefixes = []string{ "http://doi.org/", diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go new file mode 100644 index 0000000..e6e9fbd --- /dev/null +++ b/skate/unstructured_test.go @@ -0,0 +1,53 @@ +package skate + +import ( + "reflect" + "testing" +) + +func TestParseUnstructured(t *testing.T) { + var cases = []struct { + ref *Ref + result *Ref + err error + }{ + { + &Ref{ + Biblio: Biblio{ + Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + &Ref{ + Biblio: Biblio{ + DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + nil, + }, + { + &Ref{ + Biblio: Biblio{ + Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + &Ref{ + Biblio: Biblio{ + ArxivId: "0808.3320", + DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + nil, + }, + } + for _, c := range cases { + err := ParseUnstructured(c.ref) + if err != c.err { + t.Fatalf("got %v, want %v", err, c.err) + } + if !reflect.DeepEqual(c.ref, c.result) { + t.Fatalf("got %#v, want %#v", c.ref, c.result) + } + } +} -- cgit v1.2.3 From 134752c2a160986c13d6c2b9428cb2720ed382d0 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 5 May 2021 00:27:32 +0200 Subject: update notes --- skate/cmd/skate-dot/main.go | 3 ++- skate/unstructured_test.go | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'skate/cmd') diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go index 5c11975..573209e 100644 --- a/skate/cmd/skate-dot/main.go +++ b/skate/cmd/skate-dot/main.go @@ -1,5 +1,6 @@ // [wip] skate-dot generates dot files from inbound and outbound citation -// links. Just a demo, replacement for a couple python scripts. +// links. Just a demo, replacement for a couple python scripts. We want things +// like: https://git.io/JObzq. package main import ( diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go index e6e9fbd..41ff471 100644 --- a/skate/unstructured_test.go +++ b/skate/unstructured_test.go @@ -6,6 +6,7 @@ import ( ) func TestParseUnstructured(t *testing.T) { + // XXX: add more cases, maybe move this into files. var cases = []struct { ref *Ref result *Ref -- cgit v1.2.3