update docs

author: Martin Czygan <martin.czygan@gmail.com> 2021-04-29 15:03:02 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-04-29 15:03:02 +0200
commit: aa54475d6972d972a2179c1244450adc5d56a477 (patch)
tree: f8f637600d71ab8057075bd5a8f7e97b8488ce9b
parent: 2aaf3b533d1a21f4bfe620f75642108407f4b3a2 (diff)
download: refcat-aa54475d6972d972a2179c1244450adc5d56a477.tar.gz
refcat-aa54475d6972d972a2179c1244450adc5d56a477.zip
2 files changed, 36 insertions, 2 deletions
diff --git a/skate/README.md b/skate/README.md
index 323e786..11f294b 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -79,7 +79,11 @@ Sanitize DOI in tabular file.
 
 Run various matching and verification algorithms.
 
-## Problem
+### skate-map
+
+A more generic version of derive key.
+
+## Misc
 
 Handling a TB of JSON and billions of documents, especially for the following
 use case:
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 5b69993..259e1aa 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -1,5 +1,35 @@
 // skate-map runs a given map function over input data. We mostly want to
-// extract a key from a json document.
+// extract a key from a json document. For simple cases, you can use `jq` and
+// other tools.  Some key derivations require a bit more.
+//
+// An example with mostly unix tools. We want to extract the DOI and sort by
+// it; we also want to do this fast, hence parallel, LC_ALL, etc.
+//
+// $ zstdcat -T0 file.zst |                                  (1)
+//     LC_ALL=C tr -d '\t' |                                 (2) *
+//     parallel -j 16 --block 10M --pipe                     (3) *
+//         "jq -rc 'select(.biblio.doi != null) |            (4) *
+//             [.biblio.doi, (.|tostring)] | @tsv'" |        (5) *
+//     LC_ALL=C sed 's/\\\\/\\/g' |                          (6) *
+//     LC_ALL=C awk -F $'\t' -v OFS='\t' '$1=tolower($1)' |  (7) *
+//     skate-to-doi -B -S -f 1 |                             (8) *
+//     LC_ALL=C sort -S 30% --parallel 6 -k1,1 |             (9)
+//     zstd -c -T0 > skate.out
+//
+// (1) zstd is fast! "~4x faster than zlib" (https://is.gd/HT1DUs)
+// (2) we use tab as column separator and we want to clean this up beforehand
+//     (could be skipped, if we limit the number of splits)
+// (3) we pass the data to jq in parallel, using a larger block size (default is 1MB)
+// (4) we want no "null" output
+// (5) tostring serializes the input document as a string, because we need to carry the document forward
+// (6) but we need some cleanup, too
+// (7) we normalize the DOI to lowercase
+// (8) a custom filter to normalize a DOI in a specific column
+// (9) sorting by DOI
+//
+// This is reasonably fast, but some of the cleanup is ugly. We also want more
+// complex keys, e.g. with more normalizations. We'd like to encapsulate (2) to (8).
+
 package main
 
 import (
author	Martin Czygan <martin.czygan@gmail.com>	2021-04-29 15:03:02 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-04-29 15:03:02 +0200
commit	aa54475d6972d972a2179c1244450adc5d56a477 (patch)
tree	f8f637600d71ab8057075bd5a8f7e97b8488ce9b
parent	2aaf3b533d1a21f4bfe620f75642108407f4b3a2 (diff)
download	refcat-aa54475d6972d972a2179c1244450adc5d56a477.tar.gz refcat-aa54475d6972d972a2179c1244450adc5d56a477.zip