aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
Diffstat (limited to 'skate')
-rw-r--r--skate/.gitignore2
-rw-r--r--skate/Makefile3
-rw-r--r--skate/README.md93
-rw-r--r--skate/cmd/skate-bref-unmatched/main.go10
-rw-r--r--skate/cmd/skate-cluster/main.go26
-rw-r--r--skate/cmd/skate-conv/main.go (renamed from skate/cmd/skate-ref-to-release/main.go)60
-rw-r--r--skate/cmd/skate-dot/main.go5
-rw-r--r--skate/cmd/skate-from-unstructured/main.go61
-rw-r--r--skate/cmd/skate-map/main.go65
-rw-r--r--skate/cmd/skate-wikipedia-doi/main.go1
-rw-r--r--skate/go.mod4
-rw-r--r--skate/go.sum9
-rw-r--r--skate/map.go52
-rw-r--r--skate/map_test.go40
-rw-r--r--skate/schema.go38
-rw-r--r--skate/set/set.go107
-rw-r--r--skate/set/set_test.go8
-rw-r--r--skate/unstructured.go66
-rw-r--r--skate/unstructured_test.go54
-rw-r--r--skate/verify.go4
-rw-r--r--skate/zipkey/zipkey.go4
21 files changed, 449 insertions, 263 deletions
diff --git a/skate/.gitignore b/skate/.gitignore
index 5ede85f..32a9ec1 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -14,7 +14,6 @@
# Dependency directories (remove the comment below to include it)
# vendor/
#
-/skate-ref-to-release
/skate-derive-key
/skate-cluster
/skate-verify
@@ -26,3 +25,4 @@ packaging/debian/skate/usr
skate_*_amd64.deb
/skate-dot
/skate-map
+/skate-conv
diff --git a/skate/Makefile b/skate/Makefile
index 9bc70c2..39858bb 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
+TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
PKGNAME := skate
.PHONY: test
@@ -13,6 +13,7 @@ generate:
.PHONY: all
all: generate $(TARGETS)
+ go mod tidy
%: cmd/%/main.go
go build -o $@ $<
diff --git a/skate/README.md b/skate/README.md
index 11f294b..68a3f64 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -1,35 +1,50 @@
# skate
-This suite of command line tools have been written for various parts of the
-citation graph pipeline.
+A small library and suite of command line tools related to generating a
+[citation graph](https://en.wikipedia.org/wiki/Citation_graph).
-Python was a bit too slow, even when parallelized, e.g. for generating clusters
-of similar documents or to do verification. An option for the future would be
-to resort to [Cython](https://cython.org/). Parts of
-[fuzzycat](https://git.archive.org/webgroup/fuzzycat) has been ported to Go for
-performance.
+> There is no standard format for the citations in bibliographies, and the
+> record linkage of citations can be a time-consuming and complicated process.
+
+## Background
+
+Python was a bit too slow, even when parallelized (with GNU parallel), e.g. for
+generating clusters of similar documents or to do verification. An option for
+the future would be to resort to [Cython](https://cython.org/). Parts of
+[fuzzycat](https://git.archive.org/webgroup/fuzzycat) have been ported into this
+project for performance (and we saw a 25x speedup for certain tasks).
![](static/zipkey.png)
-## Tools
+## Core Utils
-### skate-wikipedia-doi
+* `skate-derive-key` (will become `skate-map`)
+* `skate-cluster`
+* `skate-verify-*`
-TSV (page title, DOI, doc) from wikipedia refs.
+The `skate-derive-key` tool derives a key from release entity JSON documents.
```
-$ parquet-tools cat --json minimal_dataset.parquet | skate-wikipedia-doi
-Rational point 10.1515/crll.1988.386.32 {"type_of_citation" ...
-Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ...
+$ skate-derive-key < release_entities.jsonlines > docs.tsv
+```
+
+Result will be a three column TSV (ident, key, doc).
+
```
+---- ident --------------- ---- key --------- ---- doc ----------
-### skate-bref-id
+4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsus {"abstracts":[],...
+```
-Temporary helper to add a key to a biblioref document.
+After this step:
-### skate-cluster
+* sort by key, e.g. `LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd ...`
+* cluster, e.g. `skate-cluster ...`
-Converts a sorted key output into a jsonlines clusters.
+----
+
+The `skate-cluster` tool converts a sorted key output into jsonlines
+clusters.
For example, this:
@@ -42,46 +57,38 @@ would turn into (a single line containing all docs with the same key).
A single line cluster is easier to parallelize (e.g. for verification, etc.).
-### skate-derive-key
+----
-skate-derive-key derives a key from release entity JSON documents.
+The `skate-verify-*` tools run various matching and verification algorithms.
-```
-$ skate-derive-key < release_entities.jsonlines > docs.tsv
-```
+## Extra
-Result will be a three column TSV (ident, key, doc).
+* skate-wikipedia-doi
-```
----- ident --------------- ---- key --------- ---- doc ----------
+> TSV (page title, DOI, doc) from wikipedia refs.
-4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsus {"abstracts":[],...
+```
+$ parquet-tools cat --json minimal_dataset.parquet | skate-wikipedia-doi
+Rational point 10.1515/crll.1988.386.32 {"type_of_citation" ...
+Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ...
```
-After this step:
-
-* sort by key, e.g. `LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd ...`
-* cluster, e.g. `skate-cluster ...`
-
-### skate-from-unstructured
-
-Takes a refs file and plucks out identifiers from unstructured field.
-
-### skate-ref-to-release
+* skate-bref-id
-Converts a ref document to a release. Part of first run, merging refs and releases.
+> Temporary helper to add a key to a biblioref document.
-### skate-to-doi
+* skate-from-unstructured
-Sanitize DOI in tabular file.
+> Takes a refs file and plucks out identifiers from unstructured field.
-### skate-verify
+* skate-conv
-Run various matching and verification algorithms.
+> Converts a ref (or open library) document to a release. Part of first step,
+> merging refs and releases.
-### skate-map
+* skate-to-doi
-A more generic version of derive key.
+> Sanitize DOI in tabular file.
## Misc
diff --git a/skate/cmd/skate-bref-unmatched/main.go b/skate/cmd/skate-bref-unmatched/main.go
deleted file mode 100644
index d8cb34f..0000000
--- a/skate/cmd/skate-bref-unmatched/main.go
+++ /dev/null
@@ -1,10 +0,0 @@
-// skate-bref-unmatched takes a bref TSV sorted by source_release_ident and a
-// refs file sorted by release_ident and exports a bref file that will include
-// unmatched references as well.
-package main
-
-import "log"
-
-func main() {
- log.Println("skate-bref-unmatched")
-}
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 754eab8..de11de1 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,5 @@
-// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
+// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
// require refs and release docs in a single cluster).
//
// For example, this:
@@ -44,10 +44,12 @@ func main() {
batch, fields []string
keyIndex = *keyField - 1
docIndex = *docField - 1
+ line string
+ err error
)
defer bw.Flush()
for {
- line, err := br.ReadString('\n')
+ line, err = br.ReadString('\n')
if err == io.EOF {
break
}
@@ -79,16 +81,16 @@ func main() {
// containsBoth return true, if we have a ref and a non-ref item in the batch.
func containsBoth(batch []string) bool {
- var isRef int
+ var numRef int
for _, doc := range batch {
- // This is brittle. Most JSON should be in compact form, and there the
- // following chars are by convention added to distinguish a release
- // coming from a reference doc from other releases.
+ // This is brittle (but faster). Most JSON should be in compact form,
+ // and there the following chars are by convention added to distinguish
+ // a release coming from a reference doc from other releases.
if strings.Contains(doc, `"status":"ref"`) {
- isRef++
+ numRef++
}
}
- return isRef > 0 && isRef < len(batch)
+ return numRef > 0 && numRef < len(batch)
}
// writeBatch writes out a single line containing the key and the cluster values.
@@ -102,9 +104,9 @@ func writeBatch(w io.Writer, key string, batch []string) (err error) {
if *requireBoth && !containsBoth(batch) {
return nil
}
- // This is brittle, but all items in a batch are valid JSON objects, hence,
- // the following will be valid JSON as well, or will it? The key should not
- // contain a quote.
+ // This is brittle (and fast), but all items in a batch are valid JSON
+ // objects, hence, the following will be valid JSON as well, or will it?
+ // The key should not contain a quote.
_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
return
}
diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-conv/main.go
index d547e62..647472e 100644
--- a/skate/cmd/skate-ref-to-release/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -1,5 +1,9 @@
-// skate-ref-to-release converts a "ref" document to a "release" document.
+// skate-conv converts various schemas into releases. This should replace the
+// very specific skate-ref-to-release and the like.
//
+// $ skate-conv -f ref < FILE > FILE
+//
+// Currently supported source schemas: "ref", "ol", "rg".
package main
import (
@@ -10,19 +14,38 @@ import (
"strings"
"git.archive.org/martin/cgraph/skate"
- "github.com/miku/parallel"
-
+ "git.archive.org/martin/cgraph/skate/parallel"
json "github.com/segmentio/encoding/json"
)
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
- fromFormat = flag.String("f", "ref", "import data shape")
+ fromFormat = flag.String("f", "ref", "import schema")
bytesNewline = []byte("\n")
+ f func([]byte) ([]byte, error)
)
+func main() {
+ flag.Parse()
+ switch *fromFormat {
+ case "ref":
+ f = refToRelease
+ case "rg":
+ f = rgSitemapToRelease
+ case "ol":
+ f = openLibraryToRelease
+ }
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+}
+
+// refToRelease converts a ref document to a release.
func refToRelease(p []byte) ([]byte, error) {
var ref skate.Ref
if err := json.Unmarshal(p, &ref); err != nil {
@@ -60,22 +83,17 @@ func rgSitemapToRelease(p []byte) ([]byte, error) {
return b, err
}
-func main() {
- flag.Parse()
- switch *fromFormat {
- case "ref":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
- case "rg":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
+func openLibraryToRelease(p []byte) ([]byte, error) {
+ var w skate.OpenLibraryWork
+ if err := json.Unmarshal(p, &w); err != nil {
+ return nil, err
}
+ release, err := skate.OpenLibraryToRelease(&w)
+ if err != nil {
+ return nil, err
+ }
+ release.Extra.Skate.Status = "ol"
+ b, err := json.Marshal(release)
+ b = append(b, bytesNewline...)
+ return b, err
}
diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go
index 3ef99d5..573209e 100644
--- a/skate/cmd/skate-dot/main.go
+++ b/skate/cmd/skate-dot/main.go
@@ -1,5 +1,6 @@
-// skate-dot generates dot files from inbound and outbound citation links. Just
-// a demo, replacement for a couple python scripts.
+// [wip] skate-dot generates dot files from inbound and outbound citation
+// links. Just a demo, replacement for a couple python scripts. We want things
+// like: https://git.io/JObzq.
package main
import (
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index c2015e2..179057d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -6,9 +6,7 @@ import (
"flag"
"log"
"os"
- "regexp"
"runtime"
- "strings"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
@@ -19,11 +17,6 @@ var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
bytesNewline = []byte("\n")
-
- PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
- PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
- PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
- PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
)
func main() {
@@ -32,7 +25,7 @@ func main() {
if err := json.Unmarshal(p, &ref); err != nil {
return nil, err
}
- if err := parseUnstructured(&ref); err != nil {
+ if err := skate.ParseUnstructured(&ref); err != nil {
return nil, err
}
return skate.JsonMarshalLine(&ref)
@@ -43,55 +36,3 @@ func main() {
log.Fatal(err)
}
}
-
-// parseUnstructured will in-place augment missing DOI, arxiv id and so on.
-func parseUnstructured(ref *skate.Ref) error {
- uns := ref.Biblio.Unstructured
- var (
- v string
- vs []string
- )
- // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
- // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
- if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
- parts := strings.Split(strings.ToLower(ref.Key), "-bib")
- ref.Biblio.DOI = parts[0]
- }
- // DOI
- v = PatDOI.FindString(uns)
- if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // DOI in Key
- v = PatDOINoHyphen.FindString(ref.Key)
- if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // DOI in URL
- prefixes := []string{
- "http://doi.org/",
- "https://doi.org/",
- "http://dx.doi.org/",
- "https://dx.doi.org/",
- }
- for _, prefix := range prefixes {
- if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
- ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
- }
- }
- v = PatDOINoHyphen.FindString(ref.Key)
- if v != "" && ref.Biblio.DOI == "" {
- ref.Biblio.DOI = v
- }
- // Arxiv
- vs = PatArxivPDF.FindStringSubmatch(uns)
- if len(vs) != 0 && ref.Biblio.ArxivId == "" {
- ref.Biblio.ArxivId = vs[1]
- } else {
- vs = PatArxivAbs.FindStringSubmatch(uns)
- if len(vs) != 0 && ref.Biblio.ArxivId == "" {
- ref.Biblio.ArxivId = vs[1]
- }
- }
- return nil
-}
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index ee02875..227acf2 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -1,9 +1,10 @@
-// skate-map runs a given map function over input data. We mostly want to
+// skate-map runs a given "map" function over input data. Here, we mostly want to
// extract a key from a json document. For simple cases, you can use `jq` and
-// other tools. Some key derivations require a bit more.
+// other tools. Some key derivations require a bit more, hence a dedicated program.
//
-// An example with mostly unix tools. We want to extract the DOI and sort by
-// it; we also want to do this fast, hence parallel, LC_ALL, etc.
+// An example with mostly unix tools. We want to extract the DOI from newline
+// delimited JSON and sort by it; we also want to do this fast, hence parallel,
+// LC_ALL, etc.
//
// $ zstdcat -T0 file.zst | (1)
// LC_ALL=C tr -d '\t' | (2) *
@@ -21,15 +22,15 @@
// be skipped, if we limit number of splits)
// (3) we pass the data to jq, with a bit larger buffer (default is 1MB)
// (4) we want no "null" output
-// (5) tostring prints input as string, because we need to carry the document forward
-// (6) but we need some cleanup, too
+// (5) tostring prints the input as string, because we need to carry the document forward ...
+// (6) ... but we'll need some cleanup, too
// (7) we normalize the DOI to lowercase
// (8) a custom filter to normalize a DOI in a specific column
// (9) sorting by DOI
//
// This is reasonably fast, but some cleanup is ugly. We also want more complex
-// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8).
-
+// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2)
+// to (8) with `skate-map`.
package main
import (
@@ -45,21 +46,26 @@ import (
)
var (
- mapperName = flag.String("m", "", "mapper to run")
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 50000, "batch size")
- verbose = flag.Bool("verbose", false, "show progress")
- keyPrefix = flag.String("p", "", "a key prefix to use")
- extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+ mapperName = flag.String("m", "", "mapper to run")
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 50000, "batch size")
+ verbose = flag.Bool("verbose", false, "show progress")
+ keyPrefix = flag.String("p", "", "a key prefix to use")
+ extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+ bestEffort = flag.Bool("B", false, "best effort")
+ logFile = flag.String("log", "", "log filename")
+ skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given column (zero indexed)")
+
+ help = `skate-map available mappers
+
+ $ skate-map -m ts < file.ndj > file.tsv
+ `
)
func main() {
flag.Parse()
- // TODO
- // [ ] add prefixes and a way to derive multiple keys in one go
- // [ ] how to store multiple keys, sorted?
- // [ ] maybe wrap jq and parallel for arbitrary nested keys
availableMappers := map[string]skate.Mapper{
+ // Add new mapper functions here.
"id": skate.Identity,
"ff": skate.CreateFixedMapper(*extraValue),
"ti": skate.MapperTitle,
@@ -67,15 +73,29 @@ func main() {
"ty": skate.MapperTitleNysiis,
"ts": skate.MapperTitleSandcrawler,
}
+ if *logFile != "" {
+ f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ log.SetOutput(f)
+ }
switch {
case *mapperName != "":
- if f, ok := availableMappers[*mapperName]; !ok {
+ if mapf, ok := availableMappers[*mapperName]; !ok {
log.Fatalf("unknown mapper name: %v", *mapperName)
} else {
+ if *skipOnEmpty >= 0 {
+ mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty)
+ }
if *keyPrefix != "" {
- f = skate.WithPrefix(f, *keyPrefix)
+ mapf = skate.WithPrefix(mapf, *keyPrefix)
+ }
+ if *bestEffort {
+ mapf = skate.WithBestEffort(mapf)
}
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV)
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV)
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
pp.Verbose = *verbose
@@ -84,8 +104,7 @@ func main() {
}
}
default:
- fmt.Println("skate-map available mappers")
- fmt.Println()
+ fmt.Println(help)
w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
for k, v := range availableMappers {
fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v))
diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index d1a21e9..c4fdb1e 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -1,3 +1,4 @@
+// skate-wikipedia-doi extracts DOIs from a wikipedia reference dataset.
package main
import (
diff --git a/skate/go.mod b/skate/go.mod
index 49ef5d2..57ae586 100644
--- a/skate/go.mod
+++ b/skate/go.mod
@@ -5,10 +5,10 @@ go 1.15
require (
github.com/elastic/go-elasticsearch v0.0.0
github.com/elastic/go-elasticsearch/v7 v7.12.0
+ github.com/klauspost/cpuid/v2 v2.0.6 // indirect
github.com/matryer/is v1.4.0
- github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c
github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e
github.com/segmentio/encoding v0.2.17
github.com/tidwall/gjson v1.7.5
- golang.org/x/text v0.3.5
+ golang.org/x/text v0.3.6
)
diff --git a/skate/go.sum b/skate/go.sum
index a186bcd..96d323d 100644
--- a/skate/go.sum
+++ b/skate/go.sum
@@ -2,12 +2,11 @@ github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWn
github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg=
github.com/elastic/go-elasticsearch/v7 v7.12.0 h1:j4tvcMrZJLp39L2NYvBb7f+lHKPqPHSL3nvB8+/DV+s=
github.com/elastic/go-elasticsearch/v7 v7.12.0/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4=
-github.com/klauspost/cpuid/v2 v2.0.5 h1:qnfhwbFriwDIX51QncuNU5mEMf+6KE3t7O8V2KQl3Dg=
github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI=
+github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE=
github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
-github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c h1:w1k+oAL6cD9oNI2LXgyCHXKJzgD7WXn/09+cdkMgZJ4=
-github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c/go.mod h1:m4hVixrXwk3DUp5cQ1j661BsHpjqSc/SfXE0uUMxmAw=
github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY=
github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e/go.mod h1:uFMI8w+ref4v2r9jz+c9i1IfIttS/OkmLfrk1jne5hs=
github.com/segmentio/encoding v0.2.17 h1:cgfmPc44u1po1lz5bSgF00gLCROBjDNc7h+H7I20zpc=
@@ -18,6 +17,6 @@ github.com/tidwall/match v1.0.3 h1:FQUVvBImDutD8wJLN6c5eMzWtjgONK9MwIBCOrUJKeE=
github.com/tidwall/match v1.0.3/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.1.0 h1:K3hMW5epkdAVwibsQEfR/7Zj0Qgt4DxtNumTq/VloO8=
github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
-golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
-golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/skate/map.go b/skate/map.go
index 9d3c98d..d6e37be 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -25,10 +25,11 @@ type TitleDoc struct {
}
// PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699
-// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on.
+// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. Some examples: XXX
type PartialDoc struct {
ContainerName string `json:"container_name"`
Contribs []struct {
+ // XXX: Need a way to sensibly compare sets of author names.
RawName string `json:"raw_name"`
} `json:"contribs"`
Volume string `json:"volume"`
@@ -40,17 +41,29 @@ type PartialDoc struct {
// doc). We want fields, but we do not want to bake in TSV into each function.
type Mapper func([]byte) ([][]byte, error)
-// AsTSV serializes the result of a field mapper as TSV. This is a slim adapter,
-// e.g. to parallel.Processor, which expects this function signature.
+// AsTSV serializes the result of a field mapper as TSV. This is a slim
+// adapter, e.g. to parallel.Processor, which expects this function signature.
+// A newline will be appended, if not there already.
func (f Mapper) AsTSV(p []byte) ([]byte, error) {
- fields, err := f(p)
- if err != nil {
+ var (
+ fields [][]byte
+ err error
+ b []byte
+ )
+ if fields, err = f(p); err != nil {
return nil, err
}
- return bytes.Join(fields, bTab), nil
+ if len(fields) == 0 {
+ return nil, nil
+ }
+ b = bytes.Join(fields, bTab)
+ if len(b) > 0 && !bytes.HasSuffix(b, bNewline) {
+ b = append(b, bNewline...)
+ }
+ return b, nil
}
-// WithPrefix adds a given prefix to the first element.
+// WithPrefix is a "mapper middleware", adding a given prefix to the first field.
func WithPrefix(f Mapper, prefix string) Mapper {
return func(p []byte) ([][]byte, error) {
fields, err := f(p)
@@ -65,6 +78,31 @@ func WithPrefix(f Mapper, prefix string) Mapper {
}
}
+// WithBestEffort wraps a mapper so that an error yields an empty result (nil, nil) instead of failing.
+func WithBestEffort(f Mapper) Mapper {
+ return func(p []byte) ([][]byte, error) {
+ if fields, err := f(p); err != nil {
+ return nil, nil
+ } else {
+ return fields, err
+ }
+ }
+}
+
+// WithSkipOnEmpty ignores results where the value at a given field is empty.
+func WithSkipOnEmpty(f Mapper, index int) Mapper {
+ return func(p []byte) ([][]byte, error) {
+ fields, err := f(p)
+ if err != nil {
+ return nil, err
+ }
+ if index < len(fields) && len(fields[index]) == 0 {
+ return nil, nil
+ }
+ return fields, err
+ }
+}
+
// NameOf returns name of value, e.g. the name of a function.
func NameOf(f interface{}) string {
v := reflect.ValueOf(f)
diff --git a/skate/map_test.go b/skate/map_test.go
index a439d33..a81cb3d 100644
--- a/skate/map_test.go
+++ b/skate/map_test.go
@@ -149,6 +149,46 @@ func TestMapperTitleSandcrawler(t *testing.T) {
}
}
+func TestAsTSV(t *testing.T) {
+ var cases = []struct {
+ f Mapper
+ err error
+ want string
+ }{
+ {
+ f: Mapper(func(_ []byte) ([][]byte, error) {
+ return [][]byte{
+ []byte("a"),
+ []byte("b"),
+ []byte("c"),
+ }, nil
+ }),
+ err: nil,
+ want: "a\tb\tc\n",
+ },
+ {
+ f: Mapper(func(_ []byte) ([][]byte, error) {
+ return [][]byte{
+ []byte("a"),
+ []byte("b"),
+ []byte("c\n"),
+ }, nil
+ }),
+ err: nil,
+ want: "a\tb\tc\n",
+ },
+ }
+ for _, c := range cases {
+ got, err := c.f.AsTSV([]byte{})
+ if err != c.err {
+ t.Fatalf("got %v, want nil", got)
+ }
+ if string(got) != c.want {
+ t.Fatalf("got %v, want %v", string(got), c.want)
+ }
+ }
+}
+
func prettySlice(p [][]byte) (result []string) {
result = make([]string, len(p))
for i, v := range p {
diff --git a/skate/schema.go b/skate/schema.go
index d58d1e8..9f3af45 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -65,7 +65,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
return &release, nil
}
-// parseIsbn tries to find and validate ISBN from unstrucuted data.
+// parseIsbn tries to find and validate ISBN from unstructured data.
func parseIsbn(s string) []string {
// ISBN: 10: 0137822693, pp: 373
// Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec,
@@ -80,7 +80,7 @@ func parseIsbn(s string) []string {
u []rune
z string
)
- valid := setPool.Get().(*set.Set)
+ valid := setPool.Get().(set.Set)
valid.Clear()
defer setPool.Put(valid)
for _, v := range append(candidates10, candidates13...) {
@@ -112,24 +112,26 @@ func parseIsbn(s string) []string {
return valid.Slice()
}
+type Biblio struct {
+ ArxivId string `json:"arxiv_id,omitempty"`
+ ContainerName string `json:"container_name,omitempty"`
+ ContribRawNames []string `json:"contrib_raw_names,omitempty"`
+ DOI string `json:"doi,omitempty"`
+ Issue string `json:"issue,omitempty"`
+ PMCID string `json:"pmcid,omitempty"`
+ PMID string `json:"pmid,omitempty"`
+ Pages string `json:"pages,omitempty"`
+ Publisher string `json:"publisher,omitempty"`
+ Title string `json:"title,omitempty"`
+ Unstructured string `json:"unstructured,omitempty"`
+ Url string `json:"url,omitempty"`
+ Volume string `json:"volume,omitempty"`
+ Year int64 `json:"year,omitempty"`
+}
+
// Ref is a reference document, can be very partial.
type Ref struct {
- Biblio struct {
- ArxivId string `json:"arxiv_id,omitempty"`
- ContainerName string `json:"container_name,omitempty"`
- ContribRawNames []string `json:"contrib_raw_names,omitempty"`
- DOI string `json:"doi,omitempty"`
- Issue string `json:"issue,omitempty"`
- PMCID string `json:"pmcid,omitempty"`
- PMID string `json:"pmid,omitempty"`
- Pages string `json:"pages,omitempty"`
- Publisher string `json:"publisher,omitempty"`
- Title string `json:"title,omitempty"`
- Unstructured string `json:"unstructured,omitempty"`
- Url string `json:"url,omitempty"`
- Volume string `json:"volume,omitempty"`
- Year int64 `json:"year,omitempty"`
- } `json:"biblio"`
+ Biblio Biblio `json:"biblio"`
Index int64 `json:"index,omitempty"`
Key string `json:"key,omitempty"`
RefSource string `json:"ref_source,omitempty"`
diff --git a/skate/set/set.go b/skate/set/set.go
index 6bad47e..b762cb8 100644
--- a/skate/set/set.go
+++ b/skate/set/set.go
@@ -8,31 +8,47 @@ import (
// Set implements basic string set operations, not thread-safe.
type Set map[string]struct{}
-func (s *Set) Clear() {
- for k := range *s {
- delete(*s, k)
+// New creates a new set.
+func New() Set {
+ var s = make(Set)
+ return s
+}
+
+// FromSlice initializes a set from a slice.
+func FromSlice(vs []string) Set {
+ s := New()
+ for _, v := range vs {
+ s.Add(v)
+ }
+ return s
+}
+
+// Clear removes all elements.
+func (s Set) Clear() {
+ for k := range s {
+ delete(s, k)
}
}
// Add adds an element.
-func (s *Set) Add(v string) *Set {
- (*s)[v] = struct{}{}
+func (s Set) Add(v string) Set {
+ s[v] = struct{}{}
return s
}
// Len returns number of elements in set.
-func (s *Set) Len() int {
- return len(*s)
+func (s Set) Len() int {
+ return len(s)
}
// IsEmpty returns if set has zero elements.
-func (s *Set) IsEmpty() bool {
+func (s Set) IsEmpty() bool {
return s.Len() == 0
}
// Equals returns true, if sets contain the same elements.
-func (s *Set) Equals(t *Set) bool {
- for k := range *s {
+func (s Set) Equals(t Set) bool {
+ for k := range s {
if !t.Contains(k) {
return false
}
@@ -41,13 +57,13 @@ func (s *Set) Equals(t *Set) bool {
}
// Contains returns membership status.
-func (s *Set) Contains(v string) bool {
- _, ok := (*s)[v]
+func (s Set) Contains(v string) bool {
+ _, ok := (s)[v]
return ok
}
// Intersection returns a new set containing all elements found in both sets.
-func (s *Set) Intersection(t *Set) *Set {
+func (s Set) Intersection(t Set) Set {
u := New()
for _, v := range s.Slice() {
if t.Contains(v) {
@@ -58,38 +74,38 @@ func (s *Set) Intersection(t *Set) *Set {
}
// Union returns the union of two sets.
-func (s *Set) Union(t *Set) *Set {
+func (s Set) Union(t Set) Set {
u := New()
- for _, v := range s.Slice() {
- u.Add(v)
+ for k := range s {
+ u.Add(k)
}
- for _, v := range t.Slice() {
- u.Add(v)
+ for k := range t {
+ u.Add(k)
}
return u
}
// Slice returns all elements as a slice.
-func (s *Set) Slice() (result []string) {
- for k := range *s {
+func (s Set) Slice() (result []string) {
+ for k := range s {
result = append(result, k)
}
return
}
-// SortedSlice returns all elements as a slice, sorted.
-func (s *Set) SortedSlice() (result []string) {
- for k := range *s {
+// Sorted returns all elements as a slice, sorted.
+func (s Set) Sorted() (result []string) {
+ for k := range s {
result = append(result, k)
}
sort.Strings(result)
return
}
-// TopK returns at most k elements.
-func (s *Set) TopK(k int) *Set {
+// TopK returns a set of at most k elements, namely the first k in sorted order.
+func (s Set) TopK(k int) Set {
var top []string
- for i, v := range s.SortedSlice() {
+ for i, v := range s.Sorted() {
if i < k {
top = append(top, v)
}
@@ -97,17 +113,19 @@ func (s *Set) TopK(k int) *Set {
return FromSlice(top)
}
-func (s *Set) Product(t *Set) (result [][]string) {
- for k := range *s {
- for l := range *t {
+// Product returns a slice of pairs, representing the cartesian product of two sets.
+func (s Set) Product(t Set) (result [][]string) {
+ for k := range s {
+ for l := range t {
result = append(result, []string{k, l})
}
}
return
}
-// Jaccard returns the jaccard index of sets s and t.
-func (s *Set) Jaccard(t *Set) float64 {
+// Jaccard returns the jaccard index of sets s and t, between 0 and 1, where 1
+// means equality.
+func (s Set) Jaccard(t Set) float64 {
if s.IsEmpty() && t.IsEmpty() {
return 1
}
@@ -118,12 +136,13 @@ func (s *Set) Jaccard(t *Set) float64 {
}
}
-func (s *Set) Join(sep string) string {
+// Join joins elements from a set with given separator.
+func (s Set) Join(sep string) string {
return strings.Join(s.Slice(), sep)
}
// Max returns the size of the largest set.
-func Max(ss ...*Set) (max int) {
+func Max(ss ...Set) (max int) {
for _, s := range ss {
if s.Len() > max {
max = s.Len()
@@ -133,7 +152,7 @@ func Max(ss ...*Set) (max int) {
}
// Min returns the size of the smallest set.
-func Min(ss ...*Set) (min int) {
+func Min(ss ...Set) (min int) {
min = 2 << 30
for _, s := range ss {
if s.Len() < min {
@@ -143,27 +162,13 @@ func Min(ss ...*Set) (min int) {
return
}
-func Filter(s *Set, f func(string) bool) *Set {
+// Filter returns a set containing all elements that satisfy a given predicate.
+func Filter(s Set, f func(string) bool) Set {
t := New()
- for v := range *s {
+ for v := range s {
if f(v) {
t.Add(v)
}
}
return t
}
-
-// New creates a new set.
-func New() *Set {
- s := make(Set)
- return &s
-}
-
-// FromSlice initializes a set from a slice.
-func FromSlice(vs []string) *Set {
- s := New()
- for _, v := range vs {
- s.Add(v)
- }
- return s
-}
diff --git a/skate/set/set_test.go b/skate/set/set_test.go
index 403b6df..dffb3e3 100644
--- a/skate/set/set_test.go
+++ b/skate/set/set_test.go
@@ -22,9 +22,9 @@ func TestSet(t *testing.T) {
r := make(Set)
r.Add("2")
- is.True(s.Intersection(&r).IsEmpty())
- is.Equal(s.Union(&r).Len(), 2)
- is.Equal(s.Union(&r).SortedSlice(), []string{"1", "2"})
+ is.True(s.Intersection(r).IsEmpty())
+ is.Equal(s.Union(r).Len(), 2)
+ is.Equal(s.Union(r).Sorted(), []string{"1", "2"})
r.Add("3")
r.Add("4")
@@ -35,7 +35,7 @@ func TestSet(t *testing.T) {
top := make(Set)
top.Add("2")
top.Add("3")
- is.Equal(r.TopK(2), &top)
+ is.Equal(r.TopK(2), top)
r.Clear()
is.Equal(r.Len(), 0)
diff --git a/skate/unstructured.go b/skate/unstructured.go
new file mode 100644
index 0000000..082c685
--- /dev/null
+++ b/skate/unstructured.go
@@ -0,0 +1,66 @@
+package skate
+
+import (
+ "regexp"
+ "strings"
+)
+
+var (
+ PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) // DOI-like token: "10.", 1-8 digits, "/", then non-space characters ending in a word character.
+ PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) // Like PatDOI, but the part after the slash may contain neither space nor hyphen.
+ PatArxivPDF = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) // arxiv.org PDF link; group 1 is the id without version. NOTE(review): the dots in "arxiv.org" and ".pdf" are unescaped and match any character.
+ PatArxivAbs = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) // arxiv.org abstract link; group 1 is the id without version. NOTE(review): same unescaped dots as in PatArxivPDF.
+
+ urlPrefixes = []string{ // URL prefixes that ParseUnstructured strips from a URL to obtain a DOI.
+ "http://doi.org/",
+ "https://doi.org/",
+ "http://dx.doi.org/",
+ "https://dx.doi.org/",
+ }
+)
+
+// ParseUnstructured augments ref in place: a missing DOI is filled in from the
+// key, the unstructured string or the URL; a missing arXiv id from an arXiv link
+// in the unstructured string. Set fields are kept. The error is always nil.
+func ParseUnstructured(ref *Ref) error {
+	var (
+		uns = ref.Biblio.Unstructured
+		v   string
+		vs  []string
+	)
+	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
+	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
+	if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
+		parts := strings.Split(strings.ToLower(ref.Key), "-bib")
+		ref.Biblio.DOI = parts[0]
+	}
+	// DOI
+	v = PatDOI.FindString(uns)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// DOI in Key
+	v = PatDOINoHyphen.FindString(ref.Key)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// DOI in URL; like the other sources, this only fills in a missing DOI and
+	// never overwrites one that has already been found.
+	for _, prefix := range urlPrefixes {
+		if ref.Biblio.DOI == "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
+			ref.Biblio.DOI = strings.TrimPrefix(ref.Biblio.Url, prefix)
+			break
+		}
+	}
+	// Arxiv
+	vs = PatArxivPDF.FindStringSubmatch(uns)
+	if len(vs) != 0 && ref.Biblio.ArxivId == "" {
+		ref.Biblio.ArxivId = vs[1]
+	} else {
+		vs = PatArxivAbs.FindStringSubmatch(uns)
+		if len(vs) != 0 && ref.Biblio.ArxivId == "" {
+			ref.Biblio.ArxivId = vs[1]
+		}
+	}
+	return nil
+}
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
new file mode 100644
index 0000000..41ff471
--- /dev/null
+++ b/skate/unstructured_test.go
@@ -0,0 +1,54 @@
+package skate
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestParseUnstructured(t *testing.T) {
+ // XXX: add more cases, maybe move this into files.
+ var cases = []struct {
+ ref *Ref // input; ParseUnstructured modifies it in place
+ result *Ref // expected state of ref after the call
+ err error // expected return value
+ }{
+ { // DOI present only in the unstructured string
+ &Ref{
+ Biblio: Biblio{
+ Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ &Ref{
+ Biblio: Biblio{
+ DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ nil,
+ },
+ { // arXiv PDF link and DOI in the same unstructured string
+ &Ref{
+ Biblio: Biblio{
+ Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ &Ref{
+ Biblio: Biblio{
+ ArxivId: "0808.3320",
+ DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+ },
+ },
+ nil,
+ },
+ }
+ for _, c := range cases {
+ err := ParseUnstructured(c.ref)
+ if err != c.err {
+ t.Fatalf("got %v, want %v", err, c.err)
+ }
+ if !reflect.DeepEqual(c.ref, c.result) { // c.ref was mutated by the call above; compare it with the expected value
+ t.Fatalf("got %#v, want %#v", c.ref, c.result)
+ }
+ }
+}
diff --git a/skate/verify.go b/skate/verify.go
index 914f6a4..e6ab03e 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -505,7 +505,7 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult {
return MatchResult{StatusStrong, ReasonVersionedDOI}
}
if len(a.Extra.DataCite.Relations) > 0 || len(b.Extra.DataCite.Relations) > 0 {
- getRelatedDOI := func(rel *Release) *set.Set {
+ getRelatedDOI := func(rel *Release) set.Set {
ss := set.New()
for _, rel := range rel.Extra.DataCite.Relations {
if strings.ToLower(rel.RelatedIdentifierType) != "doi" {
@@ -737,7 +737,7 @@ func parsePageString(s string) *ParsedPages {
// averageScore take a limited set of authors and calculates pairwise
// similarity scores, then returns the average of the best scores; between 0
// and 1.
-func averageScore(a, b *set.Set) float64 {
+func averageScore(a, b set.Set) float64 {
aTrimmed := a.TopK(5)
bTrimmed := b.TopK(5)
maxScores := make(map[string]float64) // For each a, keep the max.
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index a9f5c04..eb3dc55 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -1,3 +1,5 @@
+// Package zipkey implements ZipRun, a type that allows attaching a callback to
+// a group of elements taken from two streams.
package zipkey
import (
@@ -14,7 +16,7 @@ type Group struct {
}
type (
- keyFunc func(string) (string, error)
+ keyFunc func(string) (string, error) // Given a line, extract the key.
groupFunc func(*Group) error
)