From 0c7485486eaa62e8b7673949e09d546b78649ab8 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 03:29:27 +0200
Subject: fix newline handling

---
 skate/map.go      | 21 ++++++++++++++++-----
 skate/map_test.go | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/skate/map.go b/skate/map.go
index 9d3c98d..094d3e2 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -40,14 +40,25 @@ type PartialDoc struct {
 // doc). We want fields, but we do not want to bake in TSV into each function.
 type Mapper func([]byte) ([][]byte, error)
 
-// AsTSV serializes the result of a field mapper as TSV. This is a slim adapter,
-// e.g. to parallel.Processor, which expects this function signature.
+// AsTSV serializes the result of a field mapper as TSV. This is a slim
+// adapter, e.g. to parallel.Processor, which expects this function signature.
+// If the last byte of the last field is not a newline, it will be appended.
 func (f Mapper) AsTSV(p []byte) ([]byte, error) {
-	fields, err := f(p)
-	if err != nil {
+	var (
+		fields [][]byte
+		err    error
+	)
+	if fields, err = f(p); err != nil {
 		return nil, err
 	}
-	return bytes.Join(fields, bTab), nil
+	if len(fields) == 0 {
+		return nil, nil
+	}
+	b := bytes.Join(fields, bTab)
+	if len(b) > 0 && !bytes.HasSuffix(b, bNewline) {
+		b = append(b, bNewline...)
+	}
+	return b, nil
 }
 
 // WithPrefix adds a given prefix to the first element.
diff --git a/skate/map_test.go b/skate/map_test.go
index a439d33..a81cb3d 100644
--- a/skate/map_test.go
+++ b/skate/map_test.go
@@ -149,6 +149,46 @@ func TestMapperTitleSandcrawler(t *testing.T) {
 	}
 }
 
+func TestAsTSV(t *testing.T) {
+	var cases = []struct {
+		f    Mapper
+		err  error
+		want string
+	}{
+		{
+			f: Mapper(func(_ []byte) ([][]byte, error) {
+				return [][]byte{
+					[]byte("a"),
+					[]byte("b"),
+					[]byte("c"),
+				}, nil
+			}),
+			err:  nil,
+			want: "a\tb\tc\n",
+		},
+		{
+			f: Mapper(func(_ []byte) ([][]byte, error) {
+				return [][]byte{
+					[]byte("a"),
+					[]byte("b"),
+					[]byte("c\n"),
+				}, nil
+			}),
+			err:  nil,
+			want: "a\tb\tc\n",
+		},
+	}
+	for _, c := range cases {
+		got, err := c.f.AsTSV([]byte{})
+		if err != c.err {
+			t.Fatalf("got %v, want %v", err, c.err) // report the errors, not the output bytes
+		}
+		if string(got) != c.want {
+			t.Fatalf("got %v, want %v", string(got), c.want)
+		}
+	}
+}
+
 func prettySlice(p [][]byte) (result []string) {
 	result = make([]string, len(p))
 	for i, v := range p {
-- 
cgit v1.2.3


From 3d61eac8c023a7f9509e0371baef40c00b0132f2 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 03:38:16 +0200
Subject: update docs

---
 skate/cmd/skate-map/main.go | 4 ++--
 skate/map.go                | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index ee02875..2517878 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -21,8 +21,8 @@
 //     be skipped, if we limit number of splits)
 // (3) we pass the data to jq, with a bit larger buffer (default is 1MB)
 // (4) we want no "null" output
-// (5) tostring prints input as string, because we need to carry the document forward
-// (6) but we need some cleanup, too
+// (5) tostring prints the input as string, because we need to carry the document forward ...
+// (6) ... but we'll need some cleanup, too
 // (7) we normalize the DOI to lowercase
 // (8) a custom filter to normalize a DOI in a specific column
 // (9) sorting by DOI
diff --git a/skate/map.go b/skate/map.go
index 094d3e2..1d6bc0b 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -42,11 +42,12 @@ type Mapper func([]byte) ([][]byte, error)
 
 // AsTSV serializes the result of a field mapper as TSV. This is a slim
 // adapter, e.g. to parallel.Processor, which expects this function signature.
-// If the last byte of the last field is not a newline, it will be appended.
+// A newline will be appended, if not there already.
 func (f Mapper) AsTSV(p []byte) ([]byte, error) {
 	var (
 		fields [][]byte
 		err    error
+		b      []byte
 	)
 	if fields, err = f(p); err != nil {
 		return nil, err
@@ -54,14 +55,14 @@ func (f Mapper) AsTSV(p []byte) ([]byte, error) {
 	if len(fields) == 0 {
 		return nil, nil
 	}
-	b := bytes.Join(fields, bTab)
+	b = bytes.Join(fields, bTab)
 	if len(b) > 0 && !bytes.HasSuffix(b, bNewline) {
 		b = append(b, bNewline...)
 	}
 	return b, nil
 }
 
-// WithPrefix adds a given prefix to the first element.
+// WithPrefix is a "mapper middleware", adding a given prefix to the first field.
 func WithPrefix(f Mapper, prefix string) Mapper {
 	return func(p []byte) ([][]byte, error) {
 		fields, err := f(p)
-- 
cgit v1.2.3


From e717cee6c561891a5de2836821d793086806db9b Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 04:16:02 +0200
Subject: update docs

---
 skate/map.go           | 3 ++-
 skate/zipkey/zipkey.go | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/skate/map.go b/skate/map.go
index 1d6bc0b..90d8c05 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -25,10 +25,11 @@ type TitleDoc struct {
 }
 
 // PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699
-// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on.
+// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. Some examples: XXX
 type PartialDoc struct {
 	ContainerName string `json:"container_name"`
 	Contribs      []struct {
+		// XXX: Need a way to sensibly compare sets of author names.
 		RawName string `json:"raw_name"`
 	} `json:"contribs"`
 	Volume       string `json:"volume"`
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index a9f5c04..9394734 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -14,7 +14,7 @@ type Group struct {
 }
 
 type (
-	keyFunc   func(string) (string, error)
+	keyFunc   func(string) (string, error) // Given a line, extract the key.
 	groupFunc func(*Group) error
 )
 
-- 
cgit v1.2.3


From 6628731b1531435ceb4151ed87cf483ee3134119 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 18:34:00 +0200
Subject: wip: update README

---
 skate/README.md | 84 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/skate/README.md b/skate/README.md
index 11f294b..8c05c67 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -1,35 +1,48 @@
 # skate
 
-This suite of command line tools have been written for various parts of the
-citation graph pipeline.
+This a small library and suite of command line tools related to generating a
+citation graph.
+
+## Why?
 
 Python was a bit too slow, even when parallelized, e.g. for generating clusters
 of similar documents or to do verification. An option for the future would be
 to resort to [Cython](https://cython.org/). Parts of
-[fuzzycat](https://git.archive.org/webgroup/fuzzycat) has been ported to Go for
-performance.
+[fuzzycat](https://git.archive.org/webgroup/fuzzycat) have been ported into this
+project for performance.
 
 ![](static/zipkey.png)
 
-## Tools
+## Core Utils
+
+* `skate-derive-key`, `skate-map`
+* `skate-cluster`
+* `skate-verify-*`
 
-### skate-wikipedia-doi
 
-TSV (page title, DOI, doc) from wikipedia refs.
+The `skate-derive-key` tool derives a key from release entity JSON documents.
 
 ```
-$ parquet-tools cat --json minimal_dataset.parquet | skate-wikipedia-doi
-Rational point  10.1515/crll.1988.386.32        {"type_of_citation" ...
-Cubic surface   10.2140/ant.2007.1.393          {"type_of_citation" ...
+$ skate-derive-key < release_entities.jsonlines > docs.tsv
+```
+
+Result will be a three column TSV (ident, key, doc).
+
 ```
+---- ident --------------- ---- key --------- ---- doc ----------
 
-### skate-bref-id
+4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsus {"abstracts":[],...
+```
 
-Temporary helper to add a key to a biblioref document.
+After this step:
 
-### skate-cluster
+* sort by key, e.g. `LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd ...`
+* cluster, e.g. `skate-cluster ...`
 
-Converts a sorted key output into a jsonlines clusters.
+----
+
+The `skate-cluster` tool converts sorted key output into jsonlines
+clusters.
 
 For example, this:
 
@@ -42,46 +55,37 @@ would turn into (a single line containing all docs with the same key).
 
 A single line cluster is easier to parallelize (e.g. for verification, etc.).
 
-### skate-derive-key
+----
 
-skate-derive-key derives a key from release entity JSON documents.
+The `skate-verify-*` tools run various matching and verification algorithms.
 
-```
-$ skate-derive-key < release_entities.jsonlines > docs.tsv
-```
+## Extra
 
-Result will be a three column TSV (ident, key, doc).
+* skate-wikipedia-doi
 
-```
----- ident --------------- ---- key --------- ---- doc ----------
+> TSV (page title, DOI, doc) from wikipedia refs.
 
-4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsus {"abstracts":[],...
+```
+$ parquet-tools cat --json minimal_dataset.parquet | skate-wikipedia-doi
+Rational point  10.1515/crll.1988.386.32        {"type_of_citation" ...
+Cubic surface   10.2140/ant.2007.1.393          {"type_of_citation" ...
 ```
 
-After this step:
-
-* sort by key, e.g. `LC_ALL=C sort -k2,2 -S 35% --parallel 6 --compress-program pzstd ...`
-* cluster, e.g. `skate-cluster ...`
-
-### skate-from-unstructured
-
-Takes a refs file and plucks out identifiers from unstructured field.
-
-### skate-ref-to-release
+* skate-bref-id
 
-Converts a ref document to a release. Part of first run, merging refs and releases.
+> Temporary helper to add a key to a biblioref document.
 
-### skate-to-doi
+* skate-from-unstructured
 
-Sanitize DOI in tabular file.
+> Takes a refs file and plucks out identifiers from unstructured field.
 
-### skate-verify
+* skate-ref-to-release
 
-Run various matching and verification algorithms.
+> Converts a ref document to a release. Part of first run, merging refs and releases.
 
-### skate-map
+* skate-to-doi
 
-A more generic version of derive key.
+> Sanitize DOI in tabular file.
 
 ## Misc
 
-- 
cgit v1.2.3


From 403df865aa7f86ce10e19eb8a7ed67bf772c901b Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 22:41:01 +0200
Subject: update README

---
 skate/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/skate/README.md b/skate/README.md
index 8c05c67..7effb89 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -1,6 +1,6 @@
 # skate
 
-This a small library and suite of command line tools related to generating a
+A small library and suite of command line tools related to generating a
 citation graph.
 
 ## Why?
@@ -19,7 +19,6 @@ project for performance.
 * `skate-cluster`
 * `skate-verify-*`
 
-
 The `skate-derive-key` tool derives a key from release entity JSON documents.
 
 ```
-- 
cgit v1.2.3


From 77ca4cd924993188e0e9f8dd072af9f173eaad91 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 22:43:07 +0200
Subject: rename skate-ref-to-release to skate-conv

---
 python/refcat/tasks.py                 |  2 +-
 skate/Makefile                         |  2 +-
 skate/README.md                        |  5 +-
 skate/cmd/skate-conv/main.go           | 99 ++++++++++++++++++++++++++++++++++
 skate/cmd/skate-ref-to-release/main.go | 81 ----------------------------
 5 files changed, 104 insertions(+), 85 deletions(-)
 create mode 100644 skate/cmd/skate-conv/main.go
 delete mode 100644 skate/cmd/skate-ref-to-release/main.go

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index df2245f..bb2685d 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -965,7 +965,7 @@ class RefsToRelease(Refcat):
     def run(self):
         output = shellout("""
                           zstdcat -T0 {input} |
-                          skate-ref-to-release -w 24 -b 100000 |
+                          skate-conv -f ref -w 24 -b 100000 |
                           zstd -T0 -c > {output}
                           """,
                           input=self.input().path)
diff --git a/skate/Makefile b/skate/Makefile
index 9bc70c2..255bc28 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
+TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
 PKGNAME := skate
 
 .PHONY: test
diff --git a/skate/README.md b/skate/README.md
index 7effb89..d3a361c 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -78,9 +78,10 @@ Cubic surface   10.2140/ant.2007.1.393          {"type_of_citation" ...
 
 > Takes a refs file and plucks out identifiers from unstructured field.
 
-* skate-ref-to-release
+* skate-conv
 
-> Converts a ref document to a release. Part of first run, merging refs and releases.
+> Converts a ref (or open library) document to a release. Part of first step,
+> merging refs and releases.
 
 * skate-to-doi
 
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
new file mode 100644
index 0000000..647472e
--- /dev/null
+++ b/skate/cmd/skate-conv/main.go
@@ -0,0 +1,99 @@
+// skate-conv converts various schemas into releases. This should replace the
+// very specific skate-ref-to-release and the like.
+//
+// $ skate-conv -f ref < FILE > FILE
+//
+// Currently supported source schemas: "ref", "ol", "rg"
+package main
+
+import (
+	"flag"
+	"log"
+	"os"
+	"runtime"
+	"strings"
+
+	"git.archive.org/martin/cgraph/skate"
+	"git.archive.org/martin/cgraph/skate/parallel"
+	json "github.com/segmentio/encoding/json"
+)
+
+var (
+	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+	batchSize  = flag.Int("b", 100000, "batch size")
+	fromFormat = flag.String("f", "ref", "import schema")
+
+	bytesNewline = []byte("\n")
+	f            func([]byte) ([]byte, error)
+)
+
+func main() {
+	flag.Parse()
+	switch *fromFormat {
+	case "ref":
+		f = refToRelease
+	case "rg":
+		f = rgSitemapToRelease
+	case "ol":
+		f = openLibraryToRelease
+	default: // without this, f stays nil for an unknown schema
+		log.Fatalf("unknown source schema: %v", *fromFormat)
+	}
+	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
+	pp.NumWorkers, pp.BatchSize = *numWorkers, *batchSize
+	if err := pp.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// refToRelease converts a ref document to a release.
+func refToRelease(p []byte) ([]byte, error) {
+	var ref skate.Ref
+	if err := json.Unmarshal(p, &ref); err != nil {
+		return nil, err
+	}
+	release, err := skate.RefToRelease(&ref)
+	if err != nil {
+		return nil, err
+	}
+	release.Extra.Skate.Status = "ref" // means: converted from ref
+	release.Extra.Skate.Ref.Index = ref.Index
+	release.Extra.Skate.Ref.Key = ref.Key
+	b, err := json.Marshal(release)
+	b = append(b, bytesNewline...)
+	return b, err
+}
+
+func rgSitemapToRelease(p []byte) ([]byte, error) {
+	var (
+		s       skate.Sitemap
+		release skate.Release
+	)
+	if err := json.Unmarshal(p, &s); err != nil {
+		return nil, err
+	}
+	release.Title = s.Title
+	if len(s.URL) > 41 {
+		// XXX: A pseudo ident, maybe irritating.
+		release.Ident = strings.Split(s.URL[41:], "_")[0]
+	}
+	release.Extra.Skate.Status = "rg"
+	release.Extra.Skate.ResearchGate.URL = s.URL
+	b, err := json.Marshal(release)
+	b = append(b, bytesNewline...)
+	return b, err
+}
+
+func openLibraryToRelease(p []byte) ([]byte, error) {
+	var w skate.OpenLibraryWork
+	if err := json.Unmarshal(p, &w); err != nil {
+		return nil, err
+	}
+	release, err := skate.OpenLibraryToRelease(&w)
+	if err != nil {
+		return nil, err
+	}
+	release.Extra.Skate.Status = "ol"
+	b, err := json.Marshal(release)
+	b = append(b, bytesNewline...)
+	return b, err
+}
diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-ref-to-release/main.go
deleted file mode 100644
index d547e62..0000000
--- a/skate/cmd/skate-ref-to-release/main.go
+++ /dev/null
@@ -1,81 +0,0 @@
-// skate-ref-to-release converts a "ref" document to a "release" document.
-//
-package main
-
-import (
-	"flag"
-	"log"
-	"os"
-	"runtime"
-	"strings"
-
-	"git.archive.org/martin/cgraph/skate"
-	"github.com/miku/parallel"
-
-	json "github.com/segmentio/encoding/json"
-)
-
-var (
-	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
-	batchSize  = flag.Int("b", 100000, "batch size")
-	fromFormat = flag.String("f", "ref", "import data shape")
-
-	bytesNewline = []byte("\n")
-)
-
-func refToRelease(p []byte) ([]byte, error) {
-	var ref skate.Ref
-	if err := json.Unmarshal(p, &ref); err != nil {
-		return nil, err
-	}
-	release, err := skate.RefToRelease(&ref)
-	if err != nil {
-		return nil, err
-	}
-	release.Extra.Skate.Status = "ref" // means: converted from ref
-	release.Extra.Skate.Ref.Index = ref.Index
-	release.Extra.Skate.Ref.Key = ref.Key
-	b, err := json.Marshal(release)
-	b = append(b, bytesNewline...)
-	return b, err
-}
-
-func rgSitemapToRelease(p []byte) ([]byte, error) {
-	var (
-		s       skate.Sitemap
-		release skate.Release
-	)
-	if err := json.Unmarshal(p, &s); err != nil {
-		return nil, err
-	}
-	release.Title = s.Title
-	if len(s.URL) > 41 {
-		// XXX: A pseudo ident, maybe irritating.
-		release.Ident = strings.Split(s.URL[41:], "_")[0]
-	}
-	release.Extra.Skate.Status = "rg"
-	release.Extra.Skate.ResearchGate.URL = s.URL
-	b, err := json.Marshal(release)
-	b = append(b, bytesNewline...)
-	return b, err
-}
-
-func main() {
-	flag.Parse()
-	switch *fromFormat {
-	case "ref":
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
-	case "rg":
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
-	}
-}
-- 
cgit v1.2.3


From ae61e8918f585b27dbcc51fd6c7d622b6650520b Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 22:45:41 +0200
Subject: update deps

---
 skate/go.mod | 1 -
 skate/go.sum | 2 --
 2 files changed, 3 deletions(-)

diff --git a/skate/go.mod b/skate/go.mod
index 49ef5d2..4d0ffed 100644
--- a/skate/go.mod
+++ b/skate/go.mod
@@ -6,7 +6,6 @@ require (
 	github.com/elastic/go-elasticsearch v0.0.0
 	github.com/elastic/go-elasticsearch/v7 v7.12.0
 	github.com/matryer/is v1.4.0
-	github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c
 	github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e
 	github.com/segmentio/encoding v0.2.17
 	github.com/tidwall/gjson v1.7.5
diff --git a/skate/go.sum b/skate/go.sum
index a186bcd..bb3392d 100644
--- a/skate/go.sum
+++ b/skate/go.sum
@@ -6,8 +6,6 @@ github.com/klauspost/cpuid/v2 v2.0.5 h1:qnfhwbFriwDIX51QncuNU5mEMf+6KE3t7O8V2KQl
 github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
 github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE=
 github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
-github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c h1:w1k+oAL6cD9oNI2LXgyCHXKJzgD7WXn/09+cdkMgZJ4=
-github.com/miku/parallel v0.0.0-20210205190127-d1fa15dcea0c/go.mod h1:m4hVixrXwk3DUp5cQ1j661BsHpjqSc/SfXE0uUMxmAw=
 github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY=
 github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e/go.mod h1:uFMI8w+ref4v2r9jz+c9i1IfIttS/OkmLfrk1jne5hs=
 github.com/segmentio/encoding v0.2.17 h1:cgfmPc44u1po1lz5bSgF00gLCROBjDNc7h+H7I20zpc=
-- 
cgit v1.2.3


From be4c76e139551f56be9b7bcb96997904ed161075 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 22:45:49 +0200
Subject: update ignore files

---
 skate/.gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skate/.gitignore b/skate/.gitignore
index 5ede85f..32a9ec1 100644
--- a/skate/.gitignore
+++ b/skate/.gitignore
@@ -14,7 +14,6 @@
 # Dependency directories (remove the comment below to include it)
 # vendor/
 #
-/skate-ref-to-release
 /skate-derive-key
 /skate-cluster
 /skate-verify
@@ -26,3 +25,4 @@ packaging/debian/skate/usr
 skate_*_amd64.deb
 /skate-dot
 /skate-map
+/skate-conv
-- 
cgit v1.2.3


From 45eed4462d234f8502e38b0e98e205e341188072 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 23:23:32 +0200
Subject: implement a few flags as mapper middleware

---
 skate/cmd/skate-map/main.go | 40 ++++++++++++++++++++++++++++++----------
 skate/map.go                | 25 +++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 2517878..67fc62b 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -2,6 +2,10 @@
 // extract a key from a json document. For simple cases, you can use `jq` and
 // other tools.  Some key derivations require a bit more.
 //
+// This tool helps us to find similar things in billions of items by mapping
+// docs to keys. All docs that share a key are considered match candidates and can be
+// post-processed, e.g. to verify matches or to generate output schemas.
+//
 // An example with mostly unix tools. We want to extract the DOI and sort by
 // it; we also want to do this fast, hence parallel, LC_ALL, etc.
 //
@@ -29,7 +33,6 @@
 //
 // This is reasonably fast, but some cleanup is ugly. We also want more complex
 // keys, e.g. more normalizations, etc.  We'd like to encapsulate (2) to (8).
-
 package main
 
 import (
@@ -45,12 +48,15 @@ import (
 )
 
 var (
-	mapperName = flag.String("m", "", "mapper to run")
-	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
-	batchSize  = flag.Int("b", 50000, "batch size")
-	verbose    = flag.Bool("verbose", false, "show progress")
-	keyPrefix  = flag.String("p", "", "a key prefix to use")
-	extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+	mapperName  = flag.String("m", "", "mapper to run")
+	numWorkers  = flag.Int("w", runtime.NumCPU(), "number of workers")
+	batchSize   = flag.Int("b", 50000, "batch size")
+	verbose     = flag.Bool("verbose", false, "show progress")
+	keyPrefix   = flag.String("p", "", "a key prefix to use")
+	extraValue  = flag.String("x", "", "extra value to pass to configurable mappers")
+	bestEffort  = flag.Bool("B", false, "best effort")
+	logFile     = flag.String("log", "", "log filename")
+	skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given field, zero indexed")
 )
 
 func main() {
@@ -67,15 +73,29 @@ func main() {
 		"ty": skate.MapperTitleNysiis,
 		"ts": skate.MapperTitleSandcrawler,
 	}
+	if *logFile != "" { // redirect log output to the given file, appending
+		f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer f.Close()
+		log.SetOutput(f)
+	}
 	switch {
 	case *mapperName != "":
-		if f, ok := availableMappers[*mapperName]; !ok {
+		if mapf, ok := availableMappers[*mapperName]; !ok {
 			log.Fatalf("unknown mapper name: %v", *mapperName)
 		} else {
+			if *skipOnEmpty >= 0 {
+				mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty)
+			}
 			if *keyPrefix != "" {
-				f = skate.WithPrefix(f, *keyPrefix)
+				mapf = skate.WithPrefix(mapf, *keyPrefix)
+			}
+			if *bestEffort {
+				mapf = skate.WithBestEffort(mapf)
 			}
-			pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV)
+			pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV)
 			pp.NumWorkers = *numWorkers
 			pp.BatchSize = *batchSize
 			pp.Verbose = *verbose
diff --git a/skate/map.go b/skate/map.go
index 90d8c05..d6e37be 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -78,6 +78,31 @@ func WithPrefix(f Mapper, prefix string) Mapper {
 	}
 }
 
+// WithBestEffort will not fail on an error.
+func WithBestEffort(f Mapper) Mapper {
+	return func(p []byte) ([][]byte, error) {
+		if fields, err := f(p); err != nil {
+			return nil, nil
+		} else {
+			return fields, err
+		}
+	}
+}
+
+// WithSkipOnEmpty ignores results where the value at a given field is empty.
+func WithSkipOnEmpty(f Mapper, index int) Mapper {
+	return func(p []byte) ([][]byte, error) {
+		fields, err := f(p)
+		if err != nil {
+			return nil, err
+		}
+		if index < len(fields) && len(fields[index]) == 0 {
+			return nil, nil
+		}
+		return fields, err
+	}
+}
+
 // NameOf returns name of value, e.g. the name of a function.
 func NameOf(f interface{}) string {
 	v := reflect.ValueOf(f)
-- 
cgit v1.2.3


From 737a51adaf021a2bd9aaa4d53dc3c564503a11b6 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 30 Apr 2021 23:50:44 +0200
Subject: fix a typo

---
 skate/schema.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skate/schema.go b/skate/schema.go
index d58d1e8..14397e9 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -65,7 +65,7 @@ func RefToRelease(ref *Ref) (*Release, error) {
 	return &release, nil
 }
 
-// parseIsbn tries to find and validate ISBN from unstrucuted data.
+// parseIsbn tries to find and validate ISBN from unstructured data.
 func parseIsbn(s string) []string {
 	// ISBN: 10: 0137822693, pp: 373
 	// Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec,
-- 
cgit v1.2.3


From 528b7c0d9f07cfb0ef7a7db5dfe4e3d61b9faa6f Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 1 May 2021 00:44:48 +0200
Subject: map is a reference type

---
 skate/schema.go       |  2 +-
 skate/set/set.go      | 94 +++++++++++++++++++++++++++------------------------
 skate/set/set_test.go |  8 ++---
 skate/verify.go       |  4 +--
 4 files changed, 56 insertions(+), 52 deletions(-)

diff --git a/skate/schema.go b/skate/schema.go
index 14397e9..a9570b7 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -80,7 +80,7 @@ func parseIsbn(s string) []string {
 		u            []rune
 		z            string
 	)
-	valid := setPool.Get().(*set.Set)
+	valid := setPool.Get().(set.Set)
 	valid.Clear()
 	defer setPool.Put(valid)
 	for _, v := range append(candidates10, candidates13...) {
diff --git a/skate/set/set.go b/skate/set/set.go
index 6bad47e..29cd3ef 100644
--- a/skate/set/set.go
+++ b/skate/set/set.go
@@ -8,31 +8,47 @@ import (
 // Set implements basic string set operations, not thread-safe.
 type Set map[string]struct{}
 
-func (s *Set) Clear() {
-	for k := range *s {
-		delete(*s, k)
+// New creates a new set.
+func New() Set {
+	var s = make(Set)
+	return s
+}
+
+// FromSlice initializes a set from a slice.
+func FromSlice(vs []string) Set {
+	s := New()
+	for _, v := range vs {
+		s.Add(v)
+	}
+	return s
+}
+
+// Clear removes all elements.
+func (s Set) Clear() {
+	for k := range s {
+		delete(s, k)
 	}
 }
 
 // Add adds an element.
-func (s *Set) Add(v string) *Set {
-	(*s)[v] = struct{}{}
+func (s Set) Add(v string) Set {
+	s[v] = struct{}{}
 	return s
 }
 
 // Len returns number of elements in set.
-func (s *Set) Len() int {
-	return len(*s)
+func (s Set) Len() int {
+	return len(s)
 }
 
 // IsEmpty returns if set has zero elements.
-func (s *Set) IsEmpty() bool {
+func (s Set) IsEmpty() bool {
 	return s.Len() == 0
 }
 
 // Equals returns true, if sets contain the same elements.
-func (s *Set) Equals(t *Set) bool {
-	for k := range *s {
+func (s Set) Equals(t Set) bool {
+	for k := range s {
 		if !t.Contains(k) {
 			return false
 		}
@@ -41,13 +57,13 @@ func (s *Set) Equals(t *Set) bool {
 }
 
 // Contains returns membership status.
-func (s *Set) Contains(v string) bool {
-	_, ok := (*s)[v]
+func (s Set) Contains(v string) bool {
+	_, ok := (s)[v]
 	return ok
 }
 
 // Intersection returns a new set containing all elements found in both sets.
-func (s *Set) Intersection(t *Set) *Set {
+func (s Set) Intersection(t Set) Set {
 	u := New()
 	for _, v := range s.Slice() {
 		if t.Contains(v) {
@@ -58,7 +74,7 @@ func (s *Set) Intersection(t *Set) *Set {
 }
 
 // Union returns the union of two sets.
-func (s *Set) Union(t *Set) *Set {
+func (s Set) Union(t Set) Set {
 	u := New()
 	for _, v := range s.Slice() {
 		u.Add(v)
@@ -70,16 +86,16 @@ func (s *Set) Union(t *Set) *Set {
 }
 
 // Slice returns all elements as a slice.
-func (s *Set) Slice() (result []string) {
-	for k := range *s {
+func (s Set) Slice() (result []string) {
+	for k := range s {
 		result = append(result, k)
 	}
 	return
 }
 
-// SortedSlice returns all elements as a slice, sorted.
-func (s *Set) SortedSlice() (result []string) {
-	for k := range *s {
+// Sorted returns all elements as a slice, sorted.
+func (s Set) Sorted() (result []string) {
+	for k := range s {
 		result = append(result, k)
 	}
 	sort.Strings(result)
@@ -87,9 +103,9 @@ func (s *Set) SortedSlice() (result []string) {
 }
 
 // TopK returns at most k elements.
-func (s *Set) TopK(k int) *Set {
+func (s Set) TopK(k int) Set {
 	var top []string
-	for i, v := range s.SortedSlice() {
+	for i, v := range s.Sorted() {
 		if i < k {
 			top = append(top, v)
 		}
@@ -97,9 +113,10 @@ func (s *Set) TopK(k int) *Set {
 	return FromSlice(top)
 }
 
-func (s *Set) Product(t *Set) (result [][]string) {
-	for k := range *s {
-		for l := range *t {
+// Product returns a slice of pairs, representing the cartesian product.
+func (s Set) Product(t Set) (result [][]string) {
+	for k := range s {
+		for l := range t {
 			result = append(result, []string{k, l})
 		}
 	}
@@ -107,7 +124,7 @@ func (s *Set) Product(t *Set) (result [][]string) {
 }
 
 // Jaccard returns the jaccard index of sets s and t.
-func (s *Set) Jaccard(t *Set) float64 {
+func (s Set) Jaccard(t Set) float64 {
 	if s.IsEmpty() && t.IsEmpty() {
 		return 1
 	}
@@ -118,12 +135,13 @@ func (s *Set) Jaccard(t *Set) float64 {
 	}
 }
 
-func (s *Set) Join(sep string) string {
+// Join joins elements from a set with given separator.
+func (s Set) Join(sep string) string {
 	return strings.Join(s.Slice(), sep)
 }
 
 // Max returns the size of the largest set.
-func Max(ss ...*Set) (max int) {
+func Max(ss ...Set) (max int) {
 	for _, s := range ss {
 		if s.Len() > max {
 			max = s.Len()
@@ -133,7 +151,7 @@ func Max(ss ...*Set) (max int) {
 }
 
 // Min returns the size of the smallest set.
-func Min(ss ...*Set) (min int) {
+func Min(ss ...Set) (min int) {
 	min = 2 << 30
 	for _, s := range ss {
 		if s.Len() < min {
@@ -143,27 +161,13 @@ func Min(ss ...*Set) (min int) {
 	return
 }
 
-func Filter(s *Set, f func(string) bool) *Set {
+// Filter returns a set containing all elements, which satisfy a given predicate.
+func Filter(s Set, f func(string) bool) Set {
 	t := New()
-	for v := range *s {
+	for v := range s {
 		if f(v) {
 			t.Add(v)
 		}
 	}
 	return t
 }
-
-// New creates a new set.
-func New() *Set {
-	s := make(Set)
-	return &s
-}
-
-// FromSlice initializes a set from a slice.
-func FromSlice(vs []string) *Set {
-	s := New()
-	for _, v := range vs {
-		s.Add(v)
-	}
-	return s
-}
diff --git a/skate/set/set_test.go b/skate/set/set_test.go
index 403b6df..dffb3e3 100644
--- a/skate/set/set_test.go
+++ b/skate/set/set_test.go
@@ -22,9 +22,9 @@ func TestSet(t *testing.T) {
 
 	r := make(Set)
 	r.Add("2")
-	is.True(s.Intersection(&r).IsEmpty())
-	is.Equal(s.Union(&r).Len(), 2)
-	is.Equal(s.Union(&r).SortedSlice(), []string{"1", "2"})
+	is.True(s.Intersection(r).IsEmpty())
+	is.Equal(s.Union(r).Len(), 2)
+	is.Equal(s.Union(r).Sorted(), []string{"1", "2"})
 
 	r.Add("3")
 	r.Add("4")
@@ -35,7 +35,7 @@ func TestSet(t *testing.T) {
 	top := make(Set)
 	top.Add("2")
 	top.Add("3")
-	is.Equal(r.TopK(2), &top)
+	is.Equal(r.TopK(2), top)
 
 	r.Clear()
 	is.Equal(r.Len(), 0)
diff --git a/skate/verify.go b/skate/verify.go
index 914f6a4..e6ab03e 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -505,7 +505,7 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult {
 		return MatchResult{StatusStrong, ReasonVersionedDOI}
 	}
 	if len(a.Extra.DataCite.Relations) > 0 || len(b.Extra.DataCite.Relations) > 0 {
-		getRelatedDOI := func(rel *Release) *set.Set {
+		getRelatedDOI := func(rel *Release) set.Set {
 			ss := set.New()
 			for _, rel := range rel.Extra.DataCite.Relations {
 				if strings.ToLower(rel.RelatedIdentifierType) != "doi" {
@@ -737,7 +737,7 @@ func parsePageString(s string) *ParsedPages {
 // averageScore take a limited set of authors and calculates pairwise
 // similarity scores, then returns the average of the best scores; between 0
 // and 1.
-func averageScore(a, b *set.Set) float64 {
+func averageScore(a, b set.Set) float64 {
 	aTrimmed := a.TopK(5)
 	bTrimmed := b.TopK(5)
 	maxScores := make(map[string]float64) // For each a, keep the max.
-- 
cgit v1.2.3


From 55647ea29aff9a942816e7d858c37d7e37e598da Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 1 May 2021 01:19:49 +0200
Subject: update docs

---
 skate/cmd/skate-dot/main.go | 4 ++--
 skate/zipkey/zipkey.go      | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go
index 3ef99d5..5c11975 100644
--- a/skate/cmd/skate-dot/main.go
+++ b/skate/cmd/skate-dot/main.go
@@ -1,5 +1,5 @@
-// skate-dot generates dot files from inbound and outbound citation links. Just
-// a demo, replacement for a couple python scripts.
+// [wip] skate-dot generates dot files from inbound and outbound citation
+// links. Just a demo, replacement for a couple python scripts.
 package main
 
 import (
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index 9394734..eb3dc55 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -1,3 +1,5 @@
+// Package zipkey implements ZipRun, a type that allows attaching a callback to
+// a group of elements taken from two streams.
 package zipkey
 
 import (
-- 
cgit v1.2.3


From 8599a6c01b93f5590e77c2fcd1f41c1e170f1575 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 1 May 2021 01:21:44 +0200
Subject: update README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 434aa6f..528dd7d 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,8 @@ Context: [fatcat](https://fatcat.wiki), "Mellon Grant" (20/21).
 
 We use informal, internal versioning, currently v2, next will be v3.
 
+![](https://i.imgur.com/6dSaW2q.png)
+
 # Grant related tasks
 
 3/4 phases of the grant contain citation graph related tasks.
-- 
cgit v1.2.3


From f5dafe6e3ceb588d7ab89bf3cbb11c5a579b6678 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 1 May 2021 14:23:06 +0200
Subject: start overview docs

---
 README.md         |  6 ++----
 notes/overview.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 4 deletions(-)
 create mode 100644 notes/overview.md

diff --git a/README.md b/README.md
index 528dd7d..b32e565 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,5 @@
 # cgraph
 
-----
-
 Scholarly citation graph related code; maintained by
 [martin@archive.org](mailto:martin@archive.org); multiple subprojects to keep
 all relevant code close.
@@ -10,9 +8,9 @@ all relevant code close.
   [shiv](https://github.com/linkedin/shiv) for single-file deployments)
 * skate: various Go command line tools (packaged as deb)
 
-Context: [fatcat](https://fatcat.wiki), "Mellon Grant" (20/21).
+Context: [fatcat](https://fatcat.wiki), "Mellon Grant" (20/21)
 
-We use informal, internal versioning, currently v2, next will be v3.
+We use informal, internal versioning for the graph: currently v2, next will be v3.
 
 ![](https://i.imgur.com/6dSaW2q.png)
 
diff --git a/notes/overview.md b/notes/overview.md
new file mode 100644
index 0000000..8cb1200
--- /dev/null
+++ b/notes/overview.md
@@ -0,0 +1,52 @@
+# Overview
+
+## Data inputs
+
+Mostly JSON, but each one different in form and quality.
+
+Core inputs:
+
+* refs schema, from metadata or grobid (1-4B)
+* fatcat release entities (100-200M)
+* open library solr export (10-50M)
+
+Other inputs:
+
+* researchgate sitemap, titles (10-30M)
+* oai-pmh harvest metadata (50-200M)
+* sim (serials in microfilm, "microfilm") metadata
+
+Inputs related to evaluation:
+
+* BASE md dump (200-300M)
+* Microsoft Academic, MAG (100-300M)
+
+Casually:
+
+* a single title, e.g. ILL related (1)
+* lists of titles (1-1M)
+
+## Targets
+
+### BiblioRef
+
+Most important high level target; basic schema for current setup; elasticsearch
+indexable, small JSON docs, allowing basic aggregations and lookups.
+
+This is not just a conversion, but may involve clustering, verification, etc.
+
+## Approach
+
+We may call it "local map-reduce", and we try to do it all in a single MR setup, e.g.
+
+* extract relevant fields and sort (map)
+* apply computation on groups (reduce)
+
+As we want performance and sometimes custom code (e.g. for finding information
+in unstructured data), we try to group code into a Go library with a suite of
+command line tools. Easy to build and deploy.
+
+If the scaffolding is good, we can plug in mappers and reducers as we go, and
+expose them in the tools.
+
+
-- 
cgit v1.2.3


From c8cbfe24b9a8ab83d0536f9594412a86eedd992a Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Sat, 1 May 2021 22:41:54 +0200
Subject: update deps

---
 skate/go.mod | 3 ++-
 skate/go.sum | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/skate/go.mod b/skate/go.mod
index 4d0ffed..57ae586 100644
--- a/skate/go.mod
+++ b/skate/go.mod
@@ -5,9 +5,10 @@ go 1.15
 require (
 	github.com/elastic/go-elasticsearch v0.0.0
 	github.com/elastic/go-elasticsearch/v7 v7.12.0
+	github.com/klauspost/cpuid/v2 v2.0.6 // indirect
 	github.com/matryer/is v1.4.0
 	github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e
 	github.com/segmentio/encoding v0.2.17
 	github.com/tidwall/gjson v1.7.5
-	golang.org/x/text v0.3.5
+	golang.org/x/text v0.3.6
 )
diff --git a/skate/go.sum b/skate/go.sum
index bb3392d..96d323d 100644
--- a/skate/go.sum
+++ b/skate/go.sum
@@ -2,8 +2,9 @@ github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWn
 github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg=
 github.com/elastic/go-elasticsearch/v7 v7.12.0 h1:j4tvcMrZJLp39L2NYvBb7f+lHKPqPHSL3nvB8+/DV+s=
 github.com/elastic/go-elasticsearch/v7 v7.12.0/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4=
-github.com/klauspost/cpuid/v2 v2.0.5 h1:qnfhwbFriwDIX51QncuNU5mEMf+6KE3t7O8V2KQl3Dg=
 github.com/klauspost/cpuid/v2 v2.0.5/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
+github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI=
+github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
 github.com/matryer/is v1.4.0 h1:sosSmIWwkYITGrxZ25ULNDeKiMNzFSr4V/eqBQP0PeE=
 github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
 github.com/nsf/jsondiff v0.0.0-20210303162244-6ea32392771e h1:S+/ptYdZtpK/MDstwCyt+ZHdXEpz86RJZ5gyZU4txJY=
@@ -16,6 +17,6 @@ github.com/tidwall/match v1.0.3 h1:FQUVvBImDutD8wJLN6c5eMzWtjgONK9MwIBCOrUJKeE=
 github.com/tidwall/match v1.0.3/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
 github.com/tidwall/pretty v1.1.0 h1:K3hMW5epkdAVwibsQEfR/7Zj0Qgt4DxtNumTq/VloO8=
 github.com/tidwall/pretty v1.1.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
-golang.org/x/text v0.3.5 h1:i6eZZ+zk0SOf0xgBpEpPD18qWcJda6q1sxt3S0kzyUQ=
-golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.6 h1:aRYxNxv6iGQlyVaZmk6ZgYEDa+Jg18DxebPSrd6bg1M=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-- 
cgit v1.2.3


From 74667c6b466932daeddb2ab66131dfae1a74cb97 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 4 May 2021 22:16:50 +0200
Subject: update README

---
 skate/README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/skate/README.md b/skate/README.md
index d3a361c..8e2d7d1 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -1,15 +1,18 @@
 # skate
 
 A small library and suite of command line tools related to generating a
-citation graph.
+[citation graph](https://en.wikipedia.org/wiki/Citation_graph).
 
-## Why?
+> There is no standard format for the citations in bibliographies, and the
+> record linkage of citations can be a time-consuming and complicated process.
 
-Python was a bit too slow, even when parallelized, e.g. for generating clusters
-of similar documents or to do verification. An option for the future would be
-to resort to [Cython](https://cython.org/). Parts of
+## Background
+
+Python was a bit too slow, even when parallelized (with GNU parallel), e.g. for
+generating clusters of similar documents or to do verification. An option for
+the future would be to resort to [Cython](https://cython.org/). Parts of
 [fuzzycat](https://git.archive.org/webgroup/fuzzycat) has been ported into this
-project for performance.
+project for performance (and we saw a 25x speedup for certain tasks).
 
 ![](static/zipkey.png)
 
-- 
cgit v1.2.3


From 223d1d5ba445c38c287da43c0599d2b2b03ecd87 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 4 May 2021 22:30:59 +0200
Subject: set: some tweaks

---
 skate/set/set.go | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/skate/set/set.go b/skate/set/set.go
index 29cd3ef..b762cb8 100644
--- a/skate/set/set.go
+++ b/skate/set/set.go
@@ -76,11 +76,11 @@ func (s Set) Intersection(t Set) Set {
 // Union returns the union of two sets.
 func (s Set) Union(t Set) Set {
 	u := New()
-	for _, v := range s.Slice() {
-		u.Add(v)
+	for k := range s {
+		u.Add(k)
 	}
-	for _, v := range t.Slice() {
-		u.Add(v)
+	for k := range t {
+		u.Add(k)
 	}
 	return u
 }
@@ -102,7 +102,7 @@ func (s Set) Sorted() (result []string) {
 	return
 }
 
-// TopK returns at most k elements.
+// TopK returns at most k sorted elements.
 func (s Set) TopK(k int) Set {
 	var top []string
 	for i, v := range s.Sorted() {
@@ -113,7 +113,7 @@ func (s Set) TopK(k int) Set {
 	return FromSlice(top)
 }
 
-// Product returns a slice of pairs, representing the cartesian product.
+// Product returns a slice of pairs, representing the cartesian product of two sets.
 func (s Set) Product(t Set) (result [][]string) {
 	for k := range s {
 		for l := range t {
@@ -123,7 +123,8 @@ func (s Set) Product(t Set) (result [][]string) {
 	return
 }
 
-// Jaccard returns the jaccard index of sets s and t.
+// Jaccard returns the jaccard index of sets s and t, between 0 and 1, where 1
+// means equality.
 func (s Set) Jaccard(t Set) float64 {
 	if s.IsEmpty() && t.IsEmpty() {
 		return 1
-- 
cgit v1.2.3


From 3a43e67238f5acc96a36265f78b70425d078d579 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 4 May 2021 22:48:47 +0200
Subject: update docs

---
 skate/README.md             |  2 +-
 skate/cmd/skate-map/main.go | 16 +++++++---------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/skate/README.md b/skate/README.md
index 8e2d7d1..68a3f64 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -18,7 +18,7 @@ project for performance (and we saw a 25x speedup for certain tasks).
 
 ## Core Utils
 
-* `skate-derive-key`, `skate-map`
+* `skate-derive-key`, will be: `skate-map`
 * `skate-cluster`
 * `skate-verify-*`
 
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 67fc62b..d5f22fd 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -1,13 +1,10 @@
-// skate-map runs a given map function over input data. We mostly want to
+// skate-map runs a given "map" function over input data. Here, we mostly want to
 // extract a key from a json document. For simple cases, you can use `jq` and
-// other tools.  Some key derivations require a bit more.
+// other tools. Some key derivations require a bit more, hence a dedicated program.
 //
-// This tool helps us to find similar things in billions of items by mapping
-// docs to key. All docs that share a key are considered match candidates and can be
-// post-processed, e.g. to verify matches or to generate output schemas.
-//
-// An example with mostly unix tools. We want to extract the DOI and sort by
-// it; we also want to do this fast, hence parallel, LC_ALL, etc.
+// An example with mostly unix tools. We want to extract the DOI from newline
+// delimited JSON and sort by it; we also want to do this fast, hence parallel,
+// LC_ALL, etc.
 //
 // $ zstdcat -T0 file.zst |                                  (1)
 //     LC_ALL=C tr -d '\t' |                                 (2) *
@@ -32,7 +29,8 @@
 // (9) sorting by DOI
 //
 // This is reasonably fast, but some cleanup is ugly. We also want more complex
-// keys, e.g. more normalizations, etc.  We'd like to encapsulate (2) to (8).
+// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2)
+// to (8) with `skate-map`.
 package main
 
 import (
-- 
cgit v1.2.3


From 6462e64ce8e61f54e1c3b1247c2039a2eddd5875 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 4 May 2021 23:18:28 +0200
Subject: skate-map: a bit more help output

---
 skate/cmd/skate-map/main.go | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index d5f22fd..227acf2 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -54,16 +54,18 @@ var (
 	extraValue  = flag.String("x", "", "extra value to pass to configurable mappers")
 	bestEffort  = flag.Bool("B", false, "best effort")
 	logFile     = flag.String("log", "", "log filename")
-	skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given field, zero indexed")
+	skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given column (zero indexed)")
+
+	help = `skate-map available mappers
+
+    $ skate-map -m ts < file.ndj > file.tsv
+	`
 )
 
 func main() {
 	flag.Parse()
-	// TODO
-	// [ ] add prefixes and a way to derive multiple keys in one go
-	// [ ] how to store multiple keys, sorted?
-	// [ ] maybe wrap jq and parallel for arbitrary nested keys
 	availableMappers := map[string]skate.Mapper{
+		// Add new mapper functions here.
 		"id": skate.Identity,
 		"ff": skate.CreateFixedMapper(*extraValue),
 		"ti": skate.MapperTitle,
@@ -102,8 +104,7 @@ func main() {
 			}
 		}
 	default:
-		fmt.Println("skate-map available mappers")
-		fmt.Println()
+		fmt.Println(help)
 		w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
 		for k, v := range availableMappers {
 			fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v))
-- 
cgit v1.2.3


From a63d76e3fc3c59c2eec2de4e538b45e41e1f8aa9 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 4 May 2021 23:59:53 +0200
Subject: tweaks; move parsing out of command

---
 skate/cmd/skate-cluster/main.go           | 26 ++++++------
 skate/cmd/skate-from-unstructured/main.go | 61 +---------------------------
 skate/unstructured.go                     | 66 +++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 72 deletions(-)
 create mode 100644 skate/unstructured.go

diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 754eab8..de11de1 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,5 @@
-// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
+// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
 // require refs and release docs in a single cluster).
 //
 // For example, this:
@@ -44,10 +44,12 @@ func main() {
 		batch, fields  []string
 		keyIndex       = *keyField - 1
 		docIndex       = *docField - 1
+		line           string
+		err            error
 	)
 	defer bw.Flush()
 	for {
-		line, err := br.ReadString('\n')
+		line, err = br.ReadString('\n')
 		if err == io.EOF {
 			break
 		}
@@ -79,16 +81,16 @@ func main() {
 
 // containsBoth return true, if we have a ref and a non-ref item in the batch.
 func containsBoth(batch []string) bool {
-	var isRef int
+	var numRef int
 	for _, doc := range batch {
-		// This is brittle. Most JSON should be in compact form, and there the
-		// following chars are by convention added to distinguish a release
-		// coming from a reference doc from other releases.
+		// This is brittle (but faster). Most JSON should be in compact form,
+		// and there the following chars are by convention added to distinguish
+		// a release coming from a reference doc from other releases.
 		if strings.Contains(doc, `"status":"ref"`) {
-			isRef++
+			numRef++
 		}
 	}
-	return isRef > 0 && isRef < len(batch)
+	return numRef > 0 && numRef < len(batch)
 }
 
 // writeBatch writes out a single line containing the key and the cluster values.
@@ -102,9 +104,9 @@ func writeBatch(w io.Writer, key string, batch []string) (err error) {
 	if *requireBoth && !containsBoth(batch) {
 		return nil
 	}
-	// This is brittle, but all items in a batch are valid JSON objects, hence,
-	// the following will be valid JSON as well, or will it? The key should not
-	// contain a quote.
+	// This is brittle (and fast), but all items in a batch are valid JSON
+	// objects, hence, the following will be valid JSON as well, or will it?
+	// The key should not contain a quote.
 	_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
 	return
 }
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index c2015e2..179057d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -6,9 +6,7 @@ import (
 	"flag"
 	"log"
 	"os"
-	"regexp"
 	"runtime"
-	"strings"
 
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
@@ -19,11 +17,6 @@ var (
 	numWorkers   = flag.Int("w", runtime.NumCPU(), "number of workers")
 	batchSize    = flag.Int("b", 100000, "batch size")
 	bytesNewline = []byte("\n")
-
-	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
-	PatArxivPDF    = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
-	PatArxivAbs    = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
 )
 
 func main() {
@@ -32,7 +25,7 @@ func main() {
 		if err := json.Unmarshal(p, &ref); err != nil {
 			return nil, err
 		}
-		if err := parseUnstructured(&ref); err != nil {
+		if err := skate.ParseUnstructured(&ref); err != nil {
 			return nil, err
 		}
 		return skate.JsonMarshalLine(&ref)
@@ -43,55 +36,3 @@ func main() {
 		log.Fatal(err)
 	}
 }
-
-// parseUnstructured will in-place augment missing DOI, arxiv id and so on.
-func parseUnstructured(ref *skate.Ref) error {
-	uns := ref.Biblio.Unstructured
-	var (
-		v  string
-		vs []string
-	)
-	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
-	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
-	if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
-		parts := strings.Split(strings.ToLower(ref.Key), "-bib")
-		ref.Biblio.DOI = parts[0]
-	}
-	// DOI
-	v = PatDOI.FindString(uns)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// DOI in Key
-	v = PatDOINoHyphen.FindString(ref.Key)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// DOI in URL
-	prefixes := []string{
-		"http://doi.org/",
-		"https://doi.org/",
-		"http://dx.doi.org/",
-		"https://dx.doi.org/",
-	}
-	for _, prefix := range prefixes {
-		if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
-			ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
-		}
-	}
-	v = PatDOINoHyphen.FindString(ref.Key)
-	if v != "" && ref.Biblio.DOI == "" {
-		ref.Biblio.DOI = v
-	}
-	// Arxiv
-	vs = PatArxivPDF.FindStringSubmatch(uns)
-	if len(vs) != 0 && ref.Biblio.ArxivId == "" {
-		ref.Biblio.ArxivId = vs[1]
-	} else {
-		vs = PatArxivAbs.FindStringSubmatch(uns)
-		if len(vs) != 0 && ref.Biblio.ArxivId == "" {
-			ref.Biblio.ArxivId = vs[1]
-		}
-	}
-	return nil
-}
diff --git a/skate/unstructured.go b/skate/unstructured.go
new file mode 100644
index 0000000..6a96bb0
--- /dev/null
+++ b/skate/unstructured.go
@@ -0,0 +1,66 @@
+package skate
+
+import (
+	"regexp"
+	"strings"
+)
+
+var (
+	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
+	PatArxivPDF    = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+	PatArxivAbs    = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+
+	urlPrefixes = []string{
+		"http://doi.org/",
+		"https://doi.org/",
+		"http://dx.doi.org/",
+		"https://dx.doi.org/",
+	}
+)
+
+// ParseUnstructured will in-place augment missing DOI, arxiv id and so on.
+func ParseUnstructured(ref *Ref) error {
+	var (
+		uns = ref.Biblio.Unstructured
+		v   string
+		vs  []string
+	)
+	// Handle things like: 10.1111/j.1550-7408.1968.tb02138.x-BIB5|cit5,
+	// 10.1111/j.1558-5646.1997.tb02431.x-BIB0008|evo02431-cit-0008, ...
+	if strings.Contains(strings.ToLower(ref.Key), "-bib") && ref.Biblio.DOI == "" {
+		parts := strings.Split(strings.ToLower(ref.Key), "-bib")
+		ref.Biblio.DOI = parts[0]
+	}
+	// DOI
+	v = PatDOI.FindString(uns)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// DOI in Key
+	v = PatDOINoHyphen.FindString(ref.Key)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// DOI in URL
+	for _, prefix := range urlPrefixes {
+		if ref.Biblio.DOI != "" && strings.HasPrefix(ref.Biblio.Url, prefix) {
+			ref.Biblio.DOI = strings.Replace(ref.Biblio.Url, prefix, "", -1)
+		}
+	}
+	v = PatDOINoHyphen.FindString(ref.Key)
+	if v != "" && ref.Biblio.DOI == "" {
+		ref.Biblio.DOI = v
+	}
+	// Arxiv
+	vs = PatArxivPDF.FindStringSubmatch(uns)
+	if len(vs) != 0 && ref.Biblio.ArxivId == "" {
+		ref.Biblio.ArxivId = vs[1]
+	} else {
+		vs = PatArxivAbs.FindStringSubmatch(uns)
+		if len(vs) != 0 && ref.Biblio.ArxivId == "" {
+			ref.Biblio.ArxivId = vs[1]
+		}
+	}
+	return nil
+}
-- 
cgit v1.2.3


From 2f584059a7ec85ac1977e90f5ffeae251f956eeb Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 5 May 2021 00:00:49 +0200
Subject: remove stub file

---
 skate/cmd/skate-bref-unmatched/main.go | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 skate/cmd/skate-bref-unmatched/main.go

diff --git a/skate/cmd/skate-bref-unmatched/main.go b/skate/cmd/skate-bref-unmatched/main.go
deleted file mode 100644
index d8cb34f..0000000
--- a/skate/cmd/skate-bref-unmatched/main.go
+++ /dev/null
@@ -1,10 +0,0 @@
-// skate-bref-unmatched takes a bref TSV sorted by source_release_ident and a
-// refs file sorted by release_ident and exports a bref file that will include
-// unmatched references as well.
-package main
-
-import "log"
-
-func main() {
-	log.Println("skate-bref-unmatched")
-}
-- 
cgit v1.2.3


From 13f89091ed93c5166e0fd969665e3e9f2c909ca9 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 5 May 2021 00:20:03 +0200
Subject: add test for ParseUnstructured

---
 skate/cmd/skate-wikipedia-doi/main.go |  1 +
 skate/schema.go                       | 34 +++++++++++-----------
 skate/unstructured.go                 |  4 +--
 skate/unstructured_test.go            | 53 +++++++++++++++++++++++++++++++++++
 4 files changed, 74 insertions(+), 18 deletions(-)
 create mode 100644 skate/unstructured_test.go

diff --git a/skate/cmd/skate-wikipedia-doi/main.go b/skate/cmd/skate-wikipedia-doi/main.go
index d1a21e9..c4fdb1e 100644
--- a/skate/cmd/skate-wikipedia-doi/main.go
+++ b/skate/cmd/skate-wikipedia-doi/main.go
@@ -1,3 +1,4 @@
+// skate-wikipedia-doi extracts DOIs from a Wikipedia reference dataset.
 package main
 
 import (
diff --git a/skate/schema.go b/skate/schema.go
index a9570b7..9f3af45 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -112,24 +112,26 @@ func parseIsbn(s string) []string {
 	return valid.Slice()
 }
 
+type Biblio struct {
+	ArxivId         string   `json:"arxiv_id,omitempty"`
+	ContainerName   string   `json:"container_name,omitempty"`
+	ContribRawNames []string `json:"contrib_raw_names,omitempty"`
+	DOI             string   `json:"doi,omitempty"`
+	Issue           string   `json:"issue,omitempty"`
+	PMCID           string   `json:"pmcid,omitempty"`
+	PMID            string   `json:"pmid,omitempty"`
+	Pages           string   `json:"pages,omitempty"`
+	Publisher       string   `json:"publisher,omitempty"`
+	Title           string   `json:"title,omitempty"`
+	Unstructured    string   `json:"unstructured,omitempty"`
+	Url             string   `json:"url,omitempty"`
+	Volume          string   `json:"volume,omitempty"`
+	Year            int64    `json:"year,omitempty"`
+}
+
 // Ref is a reference document, can be very partial.
 type Ref struct {
-	Biblio struct {
-		ArxivId         string   `json:"arxiv_id,omitempty"`
-		ContainerName   string   `json:"container_name,omitempty"`
-		ContribRawNames []string `json:"contrib_raw_names,omitempty"`
-		DOI             string   `json:"doi,omitempty"`
-		Issue           string   `json:"issue,omitempty"`
-		PMCID           string   `json:"pmcid,omitempty"`
-		PMID            string   `json:"pmid,omitempty"`
-		Pages           string   `json:"pages,omitempty"`
-		Publisher       string   `json:"publisher,omitempty"`
-		Title           string   `json:"title,omitempty"`
-		Unstructured    string   `json:"unstructured,omitempty"`
-		Url             string   `json:"url,omitempty"`
-		Volume          string   `json:"volume,omitempty"`
-		Year            int64    `json:"year,omitempty"`
-	} `json:"biblio"`
+	Biblio       Biblio `json:"biblio"`
 	Index        int64  `json:"index,omitempty"`
 	Key          string `json:"key,omitempty"`
 	RefSource    string `json:"ref_source,omitempty"`
diff --git a/skate/unstructured.go b/skate/unstructured.go
index 6a96bb0..082c685 100644
--- a/skate/unstructured.go
+++ b/skate/unstructured.go
@@ -8,8 +8,8 @@ import (
 var (
 	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
 	PatDOINoHyphen = regexp.MustCompile(`10[.][0-9]{1,8}/[^ -]*[\w]`)
-	PatArxivPDF    = regexp.MustCompile(`http://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
-	PatArxivAbs    = regexp.MustCompile(`http://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+	PatArxivPDF    = regexp.MustCompile(`https?://arxiv.org/pdf/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
+	PatArxivAbs    = regexp.MustCompile(`https?://arxiv.org/abs/([0-9]{4,4}[.][0-9]{1,8})(v[0-9]{1,2})?(.pdf)?`)
 
 	urlPrefixes = []string{
 		"http://doi.org/",
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
new file mode 100644
index 0000000..e6e9fbd
--- /dev/null
+++ b/skate/unstructured_test.go
@@ -0,0 +1,53 @@
+package skate
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParseUnstructured(t *testing.T) {
+	var cases = []struct {
+		ref    *Ref
+		result *Ref
+		err    error
+	}{
+		{
+			&Ref{
+				Biblio: Biblio{
+					Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+				},
+			},
+			&Ref{
+				Biblio: Biblio{
+					DOI:          "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+					Unstructured: "Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+				},
+			},
+			nil,
+		},
+		{
+			&Ref{
+				Biblio: Biblio{
+					Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+				},
+			},
+			&Ref{
+				Biblio: Biblio{
+					ArxivId:      "0808.3320",
+					DOI:          "10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+					Unstructured: "https://arxiv.org/pdf/0808.3320v3.pdf Hello 10.1111/j.1550-7408.1968.tb02138.x-BIB5",
+				},
+			},
+			nil,
+		},
+	}
+	for _, c := range cases {
+		err := ParseUnstructured(c.ref)
+		if err != c.err {
+			t.Fatalf("got %v, want %v", err, c.err)
+		}
+		if !reflect.DeepEqual(c.ref, c.result) {
+			t.Fatalf("got %#v, want %#v", c.ref, c.result)
+		}
+	}
+}
-- 
cgit v1.2.3


From b3d54e661e0f1fc6bec23e06a16f26b2dd73a257 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 5 May 2021 00:21:19 +0200
Subject: make: run go mod tidy after build

---
 skate/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/skate/Makefile b/skate/Makefile
index 255bc28..39858bb 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -13,6 +13,7 @@ generate:
 
 .PHONY: all
 all: generate $(TARGETS)
+	go mod tidy
 
 %: cmd/%/main.go
 	go build -o $@ $<
-- 
cgit v1.2.3


From 134752c2a160986c13d6c2b9428cb2720ed382d0 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Wed, 5 May 2021 00:27:32 +0200
Subject: update notes

---
 skate/cmd/skate-dot/main.go | 3 ++-
 skate/unstructured_test.go  | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/skate/cmd/skate-dot/main.go b/skate/cmd/skate-dot/main.go
index 5c11975..573209e 100644
--- a/skate/cmd/skate-dot/main.go
+++ b/skate/cmd/skate-dot/main.go
@@ -1,5 +1,6 @@
 // [wip] skate-dot generates dot files from inbound and outbound citation
-// links. Just a demo, replacement for a couple python scripts.
+// links. Just a demo, replacement for a couple python scripts. We want things
+// like: https://git.io/JObzq.
 package main
 
 import (
diff --git a/skate/unstructured_test.go b/skate/unstructured_test.go
index e6e9fbd..41ff471 100644
--- a/skate/unstructured_test.go
+++ b/skate/unstructured_test.go
@@ -6,6 +6,7 @@ import (
 )
 
 func TestParseUnstructured(t *testing.T) {
+	// XXX: add more cases, maybe move this into files.
 	var cases = []struct {
 		ref    *Ref
 		result *Ref
-- 
cgit v1.2.3