diff options
-rw-r--r-- | README.md | 8 | ||||
-rw-r--r-- | notes/overview.md | 52 | ||||
-rw-r--r-- | python/refcat/tasks.py | 2 | ||||
-rw-r--r-- | skate/.gitignore | 2 | ||||
-rw-r--r-- | skate/Makefile | 3 | ||||
-rw-r--r-- | skate/README.md | 93 | ||||
-rw-r--r-- | skate/cmd/skate-bref-unmatched/main.go | 10 | ||||
-rw-r--r-- | skate/cmd/skate-cluster/main.go | 26 | ||||
-rw-r--r-- | skate/cmd/skate-conv/main.go (renamed from skate/cmd/skate-ref-to-release/main.go) | 60 | ||||
-rw-r--r-- | skate/cmd/skate-dot/main.go | 5 | ||||
-rw-r--r-- | skate/cmd/skate-from-unstructured/main.go | 61 | ||||
-rw-r--r-- | skate/cmd/skate-map/main.go | 65 | ||||
-rw-r--r-- | skate/cmd/skate-wikipedia-doi/main.go | 1 | ||||
-rw-r--r-- | skate/go.mod | 4 | ||||
-rw-r--r-- | skate/go.sum | 9 | ||||
-rw-r--r-- | skate/map.go | 52 | ||||
-rw-r--r-- | skate/map_test.go | 40 | ||||
-rw-r--r-- | skate/schema.go | 38 | ||||
-rw-r--r-- | skate/set/set.go | 107 | ||||
-rw-r--r-- | skate/set/set_test.go | 8 | ||||
-rw-r--r-- | skate/unstructured.go | 66 | ||||
-rw-r--r-- | skate/unstructured_test.go | 54 | ||||
-rw-r--r-- | skate/verify.go | 4 | ||||
-rw-r--r-- | skate/zipkey/zipkey.go | 4 |
24 files changed, 506 insertions, 268 deletions
@@ -1,7 +1,5 @@ # cgraph ----- - Scholarly citation graph related code; maintained by [martin@archive.org](mailto:martin@archive.org); multiple subprojects to keep all relevant code close. @@ -10,9 +8,11 @@ all relevant code close. [shiv](https://github.com/linkedin/shiv) for single-file deployments) * skate: various Go command line tools (packaged as deb) -Context: [fatcat](https://fatcat.wiki), "Mellon Grant" (20/21). +Context: [fatcat](https://fatcat.wiki), "Mellon Grant" (20/21) + +We use informal, internal versioning for the graph currently v2, next will be v3. -We use informal, internal versioning, currently v2, next will be v3. +![](https://i.imgur.com/6dSaW2q.png) # Grant related tasks diff --git a/notes/overview.md b/notes/overview.md new file mode 100644 index 0000000..8cb1200 --- /dev/null +++ b/notes/overview.md @@ -0,0 +1,52 @@ +# Overview + +## Data inputs + +Mostly JSON, but each one different in form and quality. + +Core inputs: + +* refs schema, from metadata or grobid (1-4B) +* fatcat release entities (100-200M) +* open library solr export (10-50M) + +Other inputs: + +* researchgate sitemap, titles (10-30M) +* oai-pmh harvest metadata (50-200M) +* sim (serials in microfilm, "microfilm") metadata + +Inputs related to evaluation: + +* BASE md dump (200-300M) +* Microsoft Academic, MAG (100-300M) + +Casually: + +* a single title, e.g. ILL related (1) +* lists of titles (1-1M) + +## Targets + +### BiblioRef + +Most important high level target; basic schema for current setup; elasticsearch +indexable, small JSON docs, allowing basic aggregations and lookups. + +This is not just a conversion, but may involve clustering, verification, etc. + +## Approach + +We may call it "local map-reduce", and we try to do it all in a single MR setup, e.g. + +* extract relevant fields and sort (map) +* apply computation on groups (reduce) + +As we want performance and sometimes custom code (e.g. 
for finding information +in unstructured data), we try to group code into a Go library with a suite of +command line tools. Easy to build and deploy. + +If the scaffolding is good, we can plug in mappers and reducers as we go, and +expose them in the tools. + + diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index d532cd4..182a51f 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -969,7 +969,7 @@ class RefsToRelease(Refcat): def run(self): output = shellout(""" zstdcat -T0 {input} | - skate-ref-to-release -w 24 -b 100000 | + skate-conv -f ref -w 24 -b 100000 | zstd -T0 -c > {output} """, input=self.input().path) diff --git a/skate/.gitignore b/skate/.gitignore index 5ede85f..32a9ec1 100644 --- a/skate/.gitignore +++ b/skate/.gitignore @@ -14,7 +14,6 @@ # Dependency directories (remove the comment below to include it) # vendor/ # -/skate-ref-to-release /skate-derive-key /skate-cluster /skate-verify @@ -26,3 +25,4 @@ packaging/debian/skate/usr skate_*_amd64.deb /skate-dot /skate-map +/skate-conv diff --git a/skate/Makefile b/skate/Makefile index 9bc70c2..39858bb 100644 --- a/skate/Makefile +++ b/skate/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map +TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map PKGNAME := skate .PHONY: test @@ -13,6 +13,7 @@ generate: .PHONY: all all: generate $(TARGETS) + go mod tidy %: cmd/%/main.go go build -o $@ $< diff --git a/skate/README.md b/skate/README.md index 11f294b..68a3f64 100644 --- a/skate/README.md +++ b/skate/README.md @@ -1,35 +1,50 @@ # skate -This suite of command line tools have been written for various parts of the -citation graph pipeline. 
+A small library and suite of command line tools related to generating a +[citation graph](https://en.wikipedia.org/wiki/Citation_graph). -Python was a bit too slow, even when parallelized, e.g. for generating clusters -of similar documents or to do verification. An option for the future would be -to resort to [Cython](https://cython.org/). Parts of -[fuzzycat](https://git.archive.org/webgroup/fuzzycat) has been ported to Go for -performance. +> There is no standard format for the citations in bibliographies, and the +> record linkage of citations can be a time-consuming and complicated process. + +## Background + +Python was a bit too slow, even when parallelized (with GNU parallel), e.g. for +generating clusters of similar documents or to do verification. An option for +the future would be to resort to [Cython](https://cython.org/). Parts of +[fuzzycat](https://git.archive.org/webgroup/fuzzycat) has been ported into this +project for performance (and we saw a 25x speedup for certain tasks). ![](static/zipkey.png) -## Tools +## Core Utils -### skate-wikipedia-doi +* `skate-derive-key`, will be: `skate-map` +* `skate-cluster` +* `skate-verify-*` -TSV (page title, DOI, doc) from wikipedia refs. +The `skate-derive-key` tool derives a key from release entity JSON documents. ``` -$ parquet-tools cat --json minimal_dataset.parquet | skate-wikipedia-doi -Rational point 10.1515/crll.1988.386.32 {"type_of_citation" ... -Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ... +$ skate-derive-key < release_entities.jsonlines > docs.tsv +``` + +Result will be a three column TSV (ident, key, doc). + ``` +---- ident --------------- ---- key --------- ---- doc ---------- -### skate-bref-id +4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsus {"abstracts":[],... +``` -Temporary helper to add a key to a biblioref document. +After this step: -### skate-cluster +* sort by key, e.g. `LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd ...` +* cluster, e.g. 
`skate-cluster ...` -Converts a sorted key output into a jsonlines clusters. +---- + +The `skate-cluster` tool converts a sorted key output into a jsonlines +clusters. For example, this: @@ -42,46 +57,38 @@ would turn into (a single line containing all docs with the same key). A single line cluster is easier to parallelize (e.g. for verification, etc.). -### skate-derive-key +---- -skate-derive-key derives a key from release entity JSON documents. +The `skate-verify-*` tools run various matching and verification algorithms. -``` -$ skate-derive-key < release_entities.jsonlines > docs.tsv -``` +## Extra -Result will be a three column TSV (ident, key, doc). +* skate-wikipedia-doi -``` ----- ident --------------- ---- key --------- ---- doc ---------- +> TSV (page title, DOI, doc) from wikipedia refs. -4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsus {"abstracts":[],... +``` +$ parquet-tools cat --json minimal_dataset.parquet | skate-wikipedia-doi +Rational point 10.1515/crll.1988.386.32 {"type_of_citation" ... +Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ... ``` -After this step: - -* sort by key, e.g. `LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd ...` -* cluster, e.g. `skate-cluster ...` - -### skate-from-unstructured - -Takes a refs file and plucks out identifiers from unstructured field. - -### skate-ref-to-release +* skate-bref-id -Converts a ref document to a release. Part of first run, merging refs and releases. +> Temporary helper to add a key to a biblioref document. -### skate-to-doi +* skate-from-unstructured -Sanitize DOI in tabular file. +> Takes a refs file and plucks out identifiers from unstructured field. -### skate-verify +* skate-conv -Run various matching and verification algorithms. +> Converts a ref (or open library) document to a release. Part of first step, +> merging refs and releases. -### skate-map +* skate-to-doi -A more generic version of derive key. +> Sanitize DOI in tabular file. 
## Misc diff --git a/skate/cmd/skate-bref-unmatched/main.go b/skate/cmd/skate-bref-unmatched/main.go deleted file mode 100644 index d8cb34f..0000000 --- a/skate/cmd/skate-bref-unmatched/main.go +++ /dev/null @@ -1,10 +0,0 @@ -// skate-bref-unmatched takes a bref TSV sorted by source_release_ident and a -// refs file sorted by release_ident and exports a bref file that will include -// unmatched references as well. -package main - -import "log" - -func main() { - log.Println("skate-bref-unmatched") -} diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go index 754eab8..de11de1 100644 --- a/skate/cmd/skate-cluster/main.go +++ b/skate/cmd/skate-cluster/main.go @@ -1,5 +1,5 @@ -// skate-cluster takes the (tab) output of skate-sorted-keys and generates a -// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. +// skate-cluster takes the (tab) output of skate-map (plus sort) and generates +// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. // require refs and release docs in a single cluster). // // For example, this: @@ -44,10 +44,12 @@ func main() { batch, fields []string keyIndex = *keyField - 1 docIndex = *docField - 1 + line string + err error ) defer bw.Flush() for { - line, err := br.ReadString('\n') + line, err = br.ReadString('\n') if err == io.EOF { break } @@ -79,16 +81,16 @@ func main() { // containsBoth return true, if we have a ref and a non-ref item in the batch. func containsBoth(batch []string) bool { - var isRef int + var numRef int for _, doc := range batch { - // This is brittle. Most JSON should be in compact form, and there the - // following chars are by convention added to distinguish a release - // coming from a reference doc from other releases. + // This is brittle (but faster). Most JSON should be in compact form, + // and there the following chars are by convention added to distinguish + // a release coming from a reference doc from other releases. 
if strings.Contains(doc, `"status":"ref"`) { - isRef++ + numRef++ } } - return isRef > 0 && isRef < len(batch) + return numRef > 0 && numRef < len(batch) } // writeBatch writes out a single line containing the key and the cluster values. @@ -102,9 +104,9 @@ func writeBatch(w io.Writer, key string, batch []string) (err error) { if *requireBoth && !containsBoth(batch) { return nil } - // This is brittle, but all items in a batch are valid JSON objects, hence, - // the following will be valid JSON as well, or will it? The key should not - // contain a quote. + // This is brittle (and fast), but all items in a batch are valid JSON + // objects, hence, the following will be valid JSON as well, or will it? + // The key should not contain a quote. _, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ",")) return } diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-conv/main.go index d547e62..647472e 100644 --- a/skate/cmd/skate-ref-to-release/main.go +++ b/skate/cmd/skate-conv/main.go @@ -1,5 +1,9 @@ -// skate-ref-to-release converts a "ref" document to a "release" document. +// skate-conv converts various schemas into releases. This should replace the +// very specific skate-ref-to-release and the like. 
// +// $ skate-conv -f ref < FILE > FILE +// +// Currently source schemas: "ref", "ol", "rg" package main import ( @@ -10,19 +14,38 @@ import ( "strings" "git.archive.org/martin/cgraph/skate" - "github.com/miku/parallel" - + "git.archive.org/martin/cgraph/skate/parallel" json "github.com/segmentio/encoding/json" ) var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") - fromFormat = flag.String("f", "ref", "import data shape") + fromFormat = flag.String("f", "ref", "import schema") bytesNewline = []byte("\n") + f func([]byte) ([]byte, error) ) +func main() { + flag.Parse() + switch *fromFormat { + case "ref": + f = refToRelease + case "rg": + f = rgSitemapToRelease + case "ol": + f = openLibraryToRelease + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} + +// refToRelease converts a ref document to a release. func refToRelease(p []byte) ([]byte, error) { var ref skate.Ref if err := json.Unmarshal(p, &ref); err != nil { @@ -60,22 +83,17 @@ func rgSitemapToRelease(p []byte) ([]byte, error) { return b, err } -func main() { - flag.Parse() - switch *fromFormat { - case "ref": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - case "rg": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } +func openLibraryToRelease(p []byte) ([]byte, error) { + var w skate.OpenLibraryWork + if err := json.Unmarshal(p, &w); err != nil { + return nil, err } + release, err := skate.OpenLibraryToRelease(&w) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ol" + b, err := json.Marshal(release) + b = append(b, bytesNewline...) 
+ return b, err } diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go index 3ef99d5..573209e 100644 --- a/skate/cmd/skate-dot/main.go +++ b/skate/cmd/skate-dot/main.go @@ -1,5 +1,6 @@ -// skate-dot generates dot files from inbound and outbound citation links. Just -// a demo, replacement for a couple python scripts. +// [wip] skate-dot generates dot files from inbound and outbound citation +// links. Just a demo, replacement for a couple python scripts. We want things +// like: https://git.io/JObzq. package main import ( diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index c2015e2..179057d 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -6,9 +6,7 @@ import ( "flag" "log" "os" - "regexp" "runtime" - "strings" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" @@ -19,11 +17,6 @@ var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") bytesNewline = []byte("\n") - - PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) - PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) - PatArxivPDF = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) - PatArxivAbs = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) ) func main() { @@ -32,7 +25,7 @@ func main() { if err := json.Unmarshal(p, &ref); err != nil { return nil, err } - if err := parseUnstructured(&ref); err != nil { + if err := skate.ParseUnstructured(&ref); err != nil { return nil, err } return skate.JsonMarshalLine(&ref) @@ -43,55 +36,3 @@ func main() { log.Fatal(err) } } - -// parseUnstructured will in-place augment missing DOI, arxiv id and so on. 
-func parseUnstructured(ref *skate.Ref) error { - uns := ref.Biblio.Unstructured - var ( - v string - vs []string - ) - // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5, - // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ... - if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" { - parts := strings.Split(strings.ToLower(ref.Key), "-bib") - ref.Biblio.DOI = parts[0] - } - // DOI - v = PatDOI.FindString(uns) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // DOI in Key - v = PatDOINoHyphen.FindString(ref.Key) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // DOI in URL - prefixes := []string{ - "http://doi.org/", - "https://doi.org/", - "http://dx.doi.org/", - "https://dx.doi.org/", - } - for _, prefix := range prefixes { - if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { - ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) - } - } - v = PatDOINoHyphen.FindString(ref.Key) - if v != "" && ref.Biblio.DOI == "" { - ref.Biblio.DOI = v - } - // Arxiv - vs = PatArxivPDF.FindStringSubmatch(uns) - if len(vs) != 0 && ref.Biblio.ArxivId == "" { - ref.Biblio.ArxivId = vs[1] - } else { - vs = PatArxivAbs.FindStringSubmatch(uns) - if len(vs) != 0 && ref.Biblio.ArxivId == "" { - ref.Biblio.ArxivId = vs[1] - } - } - return nil -} diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index ee02875..227acf2 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -1,9 +1,10 @@ -// skate-map runs a given map function over input data. We mostly want to +// skate-map runs a given "map" function over input data. Here, we mostly want to // extract a key from a json document. For simple cases, you can use `jq` and -// other tools. Some key derivations require a bit more. +// other tools. Some key derivations require a bit more, hence a dedicated program. // -// An example with mostly unix tools. 
We want to extract the DOI and sort by -// it; we also want to do this fast, hence parallel, LC_ALL, etc. +// An example with mostly unix tools. We want to extract the DOI from newline +// delimited JSON and sort by it; we also want to do this fast, hence parallel, +// LC_ALL, etc. // // $ zstdcat -T0 file.zst | (1) // LC_ALL=C tr -d '\t' | (2) * @@ -21,15 +22,15 @@ // be skipped, if we limit number of splits) // (3) we pass the data to jq, with a bit larger buffer (default is 1MB) // (4) we want no "null" output -// (5) tostring prints input as string, because we need to carry the document forward -// (6) but we need some cleanup, too +// (5) tostring prints the input as string, because we need to carry the document forward ... +// (6) ... but we'll need some cleanup, too // (7) we normalize the DOI to lowercase // (8) a custom filter to normalize a DOI in a specific column // (9) sorting by DOI // // This is reasonably fast, but some cleanup is ugly. We also want more complex -// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8). - +// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2) +// to (8) with `skate-map`. 
package main import ( @@ -45,21 +46,26 @@ import ( ) var ( - mapperName = flag.String("m", "", "mapper to run") - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 50000, "batch size") - verbose = flag.Bool("verbose", false, "show progress") - keyPrefix = flag.String("p", "", "a key prefix to use") - extraValue = flag.String("x", "", "extra value to pass to configurable mappers") + mapperName = flag.String("m", "", "mapper to run") + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 50000, "batch size") + verbose = flag.Bool("verbose", false, "show progress") + keyPrefix = flag.String("p", "", "a key prefix to use") + extraValue = flag.String("x", "", "extra value to pass to configurable mappers") + bestEffort = flag.Bool("B", false, "best effort") + logFile = flag.String("log", "", "log filename") + skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given column (zero indexed)") + + help = `skate-map available mappers + + $ skate-map -m ts < file.ndj > file.tsv + ` ) func main() { flag.Parse() - // TODO - // [ ] add prefixes and a way to derive multiple keys in one go - // [ ] how to store multiple keys, sorted? - // [ ] maybe wrap jq and parallel for arbitrary nested keys availableMappers := map[string]skate.Mapper{ + // Add new mapper functions here. 
"id": skate.Identity, "ff": skate.CreateFixedMapper(*extraValue), "ti": skate.MapperTitle, @@ -67,15 +73,29 @@ func main() { "ty": skate.MapperTitleNysiis, "ts": skate.MapperTitleSandcrawler, } + if *logFile != "" { + f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + log.Fatal(err) + } + defer f.Close() + log.SetOutput(f) + } switch { case *mapperName != "": - if f, ok := availableMappers[*mapperName]; !ok { + if mapf, ok := availableMappers[*mapperName]; !ok { log.Fatalf("unknown mapper name: %v", *mapperName) } else { + if *skipOnEmpty >= 0 { + mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty) + } if *keyPrefix != "" { - f = skate.WithPrefix(f, *keyPrefix) + mapf = skate.WithPrefix(mapf, *keyPrefix) + } + if *bestEffort { + mapf = skate.WithBestEffort(mapf) } - pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV) + pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize pp.Verbose = *verbose @@ -84,8 +104,7 @@ func main() { } } default: - fmt.Println("skate-map available mappers") - fmt.Println() + fmt.Println(help) w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0) for k, v := range availableMappers { fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v)) diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go index d1a21e9..c4fdb1e 100644 --- a/skate/cmd/skate-wikipedia-doi/main.go +++ b/skate/cmd/skate-wikipedia-doi/main.go @@ -1,3 +1,4 @@ +// skate-wikipedia-doi extracts DOI from wikipedia reference dataset. 
package main import ( diff --git a/skate/go.mod b/skate/go.mod index 49ef5d2..57ae586 100644 --- a/skate/go.mod +++ b/skate/go.mod @@ -5,10 +5,10 @@ go 1.15 require ( github.com/elastic/go-elasticsearch v0.0.0 github.com/elastic/go-elasticsearch/v7 v7.12.0 + github.com/klauspost/cpuid/v2 v2.0.6 // indirect github.com/matryer/is v1.4.0 - github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e github.com/segmentio/encoding v0.2.17 github.com/tidwall/gjson v1.7.5 - golang.org/x/text v0.3.5 + golang.org/x/text v0.3.6 ) diff --git a/skate/go.sum b/skate/go.sum index a186bcd..96d323d 100644 --- a/skate/go.sum +++ b/skate/go.sum @@ -2,12 +2,11 @@ github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWn github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg= github.com/elastic/go-elasticsearch/v7 v7.12.0 h1:j4tvcMrZJLp39L2NYvBb7f+lHKPqPHSL3nvB8+/DV+s= github.com/elastic/go-elasticsearch/v7 v7.12.0/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4= -github.com/klauspost/cpuid/v2 v2.0.5 h1:qnfhwbFriwDIX51QncuNU5mEMf+6KE3t7O8V2KQl3Dg= github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI= +github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE= github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU= -github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c h1:w1k+oAL6cD9oNI2LXgyCHXKJzgD7WXn/09+cdkMgZJ4= -github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c/go.mod h1:m4hVixrXwk3DUp5cQ1j661BsHpjqSc/SfXE0uUMxmAw= github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY= github.com/nsf/jsondiff 
v0.0.0-20210303162244-6ea32392771e/go.mod h1:uFMI8w+ref4v2r9jz+c9i1IfIttS/OkmLfrk1jne5hs= github.com/segmentio/encoding v0.2.17 h1:cgfmPc44u1po1lz5bSgF00gLCROBjDNc7h+H7I20zpc= @@ -18,6 +17,6 @@ github.com/tidwall/match v1.0.3 h1:FQUVvBImDutD8wJLN6c5eMzWtjgONK9MwIBCOrUJKeE= github.com/tidwall/match v1.0.3/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/pretty v1.1.0 h1:K3hMW5epkdAVwibsQEfR/7Zj0Qgt4DxtNumTq/VloO8= github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= -golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ= -golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/skate/map.go b/skate/map.go index 9d3c98d..d6e37be 100644 --- a/skate/map.go +++ b/skate/map.go @@ -25,10 +25,11 @@ type TitleDoc struct { } // PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699 -// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. +// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. Some examples: XXX type PartialDoc struct { ContainerName string `json:"container_name"` Contribs []struct { + // XXX: Need a way to sensibly compare sets of author names. RawName string `json:"raw_name"` } `json:"contribs"` Volume string `json:"volume"` @@ -40,17 +41,29 @@ type PartialDoc struct { // doc). We want fields, but we do not want to bake in TSV into each function. type Mapper func([]byte) ([][]byte, error) -// AsTSV serializes the result of a field mapper as TSV. This is a slim adapter, -// e.g. to parallel.Processor, which expects this function signature. +// AsTSV serializes the result of a field mapper as TSV. This is a slim +// adapter, e.g. 
to parallel.Processor, which expects this function signature. +// A newline will be appended, if not there already. func (f Mapper) AsTSV(p []byte) ([]byte, error) { - fields, err := f(p) - if err != nil { + var ( + fields [][]byte + err error + b []byte + ) + if fields, err = f(p); err != nil { return nil, err } - return bytes.Join(fields, bTab), nil + if len(fields) == 0 { + return nil, nil + } + b = bytes.Join(fields, bTab) + if len(b) > 0 && !bytes.HasSuffix(b, bNewline) { + b = append(b, bNewline...) + } + return b, nil } -// WithPrefix adds a given prefix to the first element. +// WithPrefix is a "mapper middleware", adding a given prefix to the first field. func WithPrefix(f Mapper, prefix string) Mapper { return func(p []byte) ([][]byte, error) { fields, err := f(p) @@ -65,6 +78,31 @@ func WithPrefix(f Mapper, prefix string) Mapper { } } +// WithBestEffort will not fail on an error. +func WithBestEffort(f Mapper) Mapper { + return func(p []byte) ([][]byte, error) { + if fields, err := f(p); err != nil { + return nil, nil + } else { + return fields, err + } + } +} + +// WithSkipOnEmpty ignores results where the value at a given field is empty. +func WithSkipOnEmpty(f Mapper, index int) Mapper { + return func(p []byte) ([][]byte, error) { + fields, err := f(p) + if err != nil { + return nil, err + } + if index < len(fields) && len(fields[index]) == 0 { + return nil, nil + } + return fields, err + } +} + // NameOf returns name of value, e.g. the name of a function. 
func NameOf(f interface{}) string { v := reflect.ValueOf(f) diff --git a/skate/map_test.go b/skate/map_test.go index a439d33..a81cb3d 100644 --- a/skate/map_test.go +++ b/skate/map_test.go @@ -149,6 +149,46 @@ func TestMapperTitleSandcrawler(t *testing.T) { } } +func TestAsTSV(t *testing.T) { + var cases = []struct { + f Mapper + err error + want string + }{ + { + f: Mapper(func(_ []byte) ([][]byte, error) { + return [][]byte{ + []byte("a"), + []byte("b"), + []byte("c"), + }, nil + }), + err: nil, + want: "a\tb\tc\n", + }, + { + f: Mapper(func(_ []byte) ([][]byte, error) { + return [][]byte{ + []byte("a"), + []byte("b"), + []byte("c\n"), + }, nil + }), + err: nil, + want: "a\tb\tc\n", + }, + } + for _, c := range cases { + got, err := c.f.AsTSV([]byte{}) + if err != c.err { + t.Fatalf("got %v, want nil", got) + } + if string(got) != c.want { + t.Fatalf("got %v, want %v", string(got), c.want) + } + } +} + func prettySlice(p [][]byte) (result []string) { result = make([]string, len(p)) for i, v := range p { diff --git a/skate/schema.go b/skate/schema.go index d58d1e8..9f3af45 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -65,7 +65,7 @@ func RefToRelease(ref *Ref) (*Release, error) { return &release, nil } -// parseIsbn tries to find and validate ISBN from unstrucuted data. +// parseIsbn tries to find and validate ISBN from unstructured data. func parseIsbn(s string) []string { // ISBN: 10: 0137822693, pp: 373 // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec, @@ -80,7 +80,7 @@ func parseIsbn(s string) []string { u []rune z string ) - valid := setPool.Get().(*set.Set) + valid := setPool.Get().(set.Set) valid.Clear() defer setPool.Put(valid) for _, v := range append(candidates10, candidates13...) 
{ @@ -112,24 +112,26 @@ func parseIsbn(s string) []string { return valid.Slice() } +type Biblio struct { + ArxivId string `json:"arxiv_id,omitempty"` + ContainerName string `json:"container_name,omitempty"` + ContribRawNames []string `json:"contrib_raw_names,omitempty"` + DOI string `json:"doi,omitempty"` + Issue string `json:"issue,omitempty"` + PMCID string `json:"pmcid,omitempty"` + PMID string `json:"pmid,omitempty"` + Pages string `json:"pages,omitempty"` + Publisher string `json:"publisher,omitempty"` + Title string `json:"title,omitempty"` + Unstructured string `json:"unstructured,omitempty"` + Url string `json:"url,omitempty"` + Volume string `json:"volume,omitempty"` + Year int64 `json:"year,omitempty"` +} + // Ref is a reference document, can be very partial. type Ref struct { - Biblio struct { - ArxivId string `json:"arxiv_id,omitempty"` - ContainerName string `json:"container_name,omitempty"` - ContribRawNames []string `json:"contrib_raw_names,omitempty"` - DOI string `json:"doi,omitempty"` - Issue string `json:"issue,omitempty"` - PMCID string `json:"pmcid,omitempty"` - PMID string `json:"pmid,omitempty"` - Pages string `json:"pages,omitempty"` - Publisher string `json:"publisher,omitempty"` - Title string `json:"title,omitempty"` - Unstructured string `json:"unstructured,omitempty"` - Url string `json:"url,omitempty"` - Volume string `json:"volume,omitempty"` - Year int64 `json:"year,omitempty"` - } `json:"biblio"` + Biblio Biblio `json:"biblio"` Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` RefSource string `json:"ref_source,omitempty"` diff --git a/skate/set/set.go b/skate/set/set.go index 6bad47e..b762cb8 100644 --- a/skate/set/set.go +++ b/skate/set/set.go @@ -8,31 +8,47 @@ import ( // Set implements basic string set operations, not thread-safe. type Set map[string]struct{} -func (s *Set) Clear() { - for k := range *s { - delete(*s, k) +// New creates a new set. 
+func New() Set { + var s = make(Set) + return s +} + +// FromSlice initializes a set from a slice. +func FromSlice(vs []string) Set { + s := New() + for _, v := range vs { + s.Add(v) + } + return s +} + +// Clear removes all elements. +func (s Set) Clear() { + for k := range s { + delete(s, k) } } // Add adds an element. -func (s *Set) Add(v string) *Set { - (*s)[v] = struct{}{} +func (s Set) Add(v string) Set { + s[v] = struct{}{} return s } // Len returns number of elements in set. -func (s *Set) Len() int { - return len(*s) +func (s Set) Len() int { + return len(s) } // IsEmpty returns if set has zero elements. -func (s *Set) IsEmpty() bool { +func (s Set) IsEmpty() bool { return s.Len() == 0 } // Equals returns true, if sets contain the same elements. -func (s *Set) Equals(t *Set) bool { - for k := range *s { +func (s Set) Equals(t Set) bool { + for k := range s { if !t.Contains(k) { return false } @@ -41,13 +57,13 @@ func (s *Set) Equals(t *Set) bool { } // Contains returns membership status. -func (s *Set) Contains(v string) bool { - _, ok := (*s)[v] +func (s Set) Contains(v string) bool { + _, ok := (s)[v] return ok } // Intersection returns a new set containing all elements found in both sets. -func (s *Set) Intersection(t *Set) *Set { +func (s Set) Intersection(t Set) Set { u := New() for _, v := range s.Slice() { if t.Contains(v) { @@ -58,38 +74,38 @@ func (s *Set) Intersection(t *Set) *Set { } // Union returns the union of two sets. -func (s *Set) Union(t *Set) *Set { +func (s Set) Union(t Set) Set { u := New() - for _, v := range s.Slice() { - u.Add(v) + for k := range s { + u.Add(k) } - for _, v := range t.Slice() { - u.Add(v) + for k := range t { + u.Add(k) } return u } // Slice returns all elements as a slice. -func (s *Set) Slice() (result []string) { - for k := range *s { +func (s Set) Slice() (result []string) { + for k := range s { result = append(result, k) } return } -// SortedSlice returns all elements as a slice, sorted. 
-func (s *Set) SortedSlice() (result []string) { - for k := range *s { +// Sorted returns all elements as a slice, sorted. +func (s Set) Sorted() (result []string) { + for k := range s { result = append(result, k) } sort.Strings(result) return } -// TopK returns at most k elements. -func (s *Set) TopK(k int) *Set { +// TopK returns at most k sorted elements. +func (s Set) TopK(k int) Set { var top []string - for i, v := range s.SortedSlice() { + for i, v := range s.Sorted() { if i < k { top = append(top, v) } @@ -97,17 +113,19 @@ func (s *Set) TopK(k int) *Set { return FromSlice(top) } -func (s *Set) Product(t *Set) (result [][]string) { - for k := range *s { - for l := range *t { +// Product returns a slice of pairs, representing the cartesian product of two sets. +func (s Set) Product(t Set) (result [][]string) { + for k := range s { + for l := range t { result = append(result, []string{k, l}) } } return } -// Jaccard returns the jaccard index of sets s and t. -func (s *Set) Jaccard(t *Set) float64 { +// Jaccard returns the jaccard index of sets s and t, between 0 and 1, where 1 +// means equality. +func (s Set) Jaccard(t Set) float64 { if s.IsEmpty() && t.IsEmpty() { return 1 } @@ -118,12 +136,13 @@ func (s *Set) Jaccard(t *Set) float64 { } } -func (s *Set) Join(sep string) string { +// Join joins elements from a set with given separator. +func (s Set) Join(sep string) string { return strings.Join(s.Slice(), sep) } // Max returns the size of the largest set. -func Max(ss ...*Set) (max int) { +func Max(ss ...Set) (max int) { for _, s := range ss { if s.Len() > max { max = s.Len() @@ -133,7 +152,7 @@ func Max(ss ...*Set) (max int) { } // Min returns the size of the smallest set. 
-func Min(ss ...*Set) (min int) { +func Min(ss ...Set) (min int) { min = 2 << 30 for _, s := range ss { if s.Len() < min { @@ -143,27 +162,13 @@ func Min(ss ...*Set) (min int) { return } -func Filter(s *Set, f func(string) bool) *Set { +// Filter returns a set containing all elements, which satisfy a given predicate. +func Filter(s Set, f func(string) bool) Set { t := New() - for v := range *s { + for v := range s { if f(v) { t.Add(v) } } return t } - -// New creates a new set. -func New() *Set { - s := make(Set) - return &s -} - -// FromSlice initializes a set from a slice. -func FromSlice(vs []string) *Set { - s := New() - for _, v := range vs { - s.Add(v) - } - return s -} diff --git a/skate/set/set_test.go b/skate/set/set_test.go index 403b6df..dffb3e3 100644 --- a/skate/set/set_test.go +++ b/skate/set/set_test.go @@ -22,9 +22,9 @@ func TestSet(t *testing.T) { r := make(Set) r.Add("2") - is.True(s.Intersection(&r).IsEmpty()) - is.Equal(s.Union(&r).Len(), 2) - is.Equal(s.Union(&r).SortedSlice(), []string{"1", "2"}) + is.True(s.Intersection(r).IsEmpty()) + is.Equal(s.Union(r).Len(), 2) + is.Equal(s.Union(r).Sorted(), []string{"1", "2"}) r.Add("3") r.Add("4") @@ -35,7 +35,7 @@ func TestSet(t *testing.T) { top := make(Set) top.Add("2") top.Add("3") - is.Equal(r.TopK(2), &top) + is.Equal(r.TopK(2), top) r.Clear() is.Equal(r.Len(), 0) diff --git a/skate/unstructured.go b/skate/unstructured.go new file mode 100644 index 0000000..082c685 --- /dev/null +++ b/skate/unstructured.go @@ -0,0 +1,66 @@ +package skate + +import ( + "regexp" + "strings" +) + +var ( + PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) + PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`) + PatArxivPDF = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + PatArxivAbs = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`) + + urlPrefixes = []string{ + "http://doi.org/", + "https://doi.org/", + 
"http://dx.doi.org/", + "https://dx.doi.org/", + } +) + +// ParseUnstructured will in-place augment missing DOI, arxiv id and so on. +func ParseUnstructured(ref *Ref) error { + var ( + uns = ref.Biblio.Unstructured + v string + vs []string + ) + // Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5, + // 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ... + if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" { + parts := strings.Split(strings.ToLower(ref.Key), "-bib") + ref.Biblio.DOI = parts[0] + } + // DOI + v = PatDOI.FindString(uns) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // DOI in Key + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // DOI in URL + for _, prefix := range urlPrefixes { + if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) { + ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1) + } + } + v = PatDOINoHyphen.FindString(ref.Key) + if v != "" && ref.Biblio.DOI == "" { + ref.Biblio.DOI = v + } + // Arxiv + vs = PatArxivPDF.FindStringSubmatch(uns) + if len(vs) != 0 && ref.Biblio.ArxivId == "" { + ref.Biblio.ArxivId = vs[1] + } else { + vs = PatArxivAbs.FindStringSubmatch(uns) + if len(vs) != 0 && ref.Biblio.ArxivId == "" { + ref.Biblio.ArxivId = vs[1] + } + } + return nil +} diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go new file mode 100644 index 0000000..41ff471 --- /dev/null +++ b/skate/unstructured_test.go @@ -0,0 +1,54 @@ +package skate + +import ( + "reflect" + "testing" +) + +func TestParseUnstructured(t *testing.T) { + // XXX: add more cases, maybe move this into files. 
+ var cases = []struct { + ref *Ref + result *Ref + err error + }{ + { + &Ref{ + Biblio: Biblio{ + Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + &Ref{ + Biblio: Biblio{ + DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + nil, + }, + { + &Ref{ + Biblio: Biblio{ + Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + &Ref{ + Biblio: Biblio{ + ArxivId: "0808.3320", + DOI: "10.1111/j.1550-7408.1968.tb02138.x-BIB5", + Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5", + }, + }, + nil, + }, + } + for _, c := range cases { + err := ParseUnstructured(c.ref) + if err != c.err { + t.Fatalf("got %v, want %v", err, c.err) + } + if !reflect.DeepEqual(c.ref, c.result) { + t.Fatalf("got %#v, want %#v", c.ref, c.result) + } + } +} diff --git a/skate/verify.go b/skate/verify.go index 914f6a4..e6ab03e 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -505,7 +505,7 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult { return MatchResult{StatusStrong, ReasonVersionedDOI} } if len(a.Extra.DataCite.Relations) > 0 || len(b.Extra.DataCite.Relations) > 0 { - getRelatedDOI := func(rel *Release) *set.Set { + getRelatedDOI := func(rel *Release) set.Set { ss := set.New() for _, rel := range rel.Extra.DataCite.Relations { if strings.ToLower(rel.RelatedIdentifierType) != "doi" { @@ -737,7 +737,7 @@ func parsePageString(s string) *ParsedPages { // averageScore take a limited set of authors and calculates pairwise // similarity scores, then returns the average of the best scores; between 0 // and 1. -func averageScore(a, b *set.Set) float64 { +func averageScore(a, b set.Set) float64 { aTrimmed := a.TopK(5) bTrimmed := b.TopK(5) maxScores := make(map[string]float64) // For each a, keep the max. 
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go index a9f5c04..eb3dc55 100644 --- a/skate/zipkey/zipkey.go +++ b/skate/zipkey/zipkey.go @@ -1,3 +1,5 @@ +// Package zipkey implements ZipRun, a type that allows to attach a callback to +// a group of elements taken from two streams. package zipkey import ( @@ -14,7 +16,7 @@ type Group struct { } type ( - keyFunc func(string) (string, error) + keyFunc func(string) (string, error) // Given a line, extract the key. groupFunc func(*Group) error ) |