aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-04-29 15:03:02 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-04-29 15:03:02 +0200
commitaa54475d6972d972a2179c1244450adc5d56a477 (patch)
treef8f637600d71ab8057075bd5a8f7e97b8488ce9b
parent2aaf3b533d1a21f4bfe620f75642108407f4b3a2 (diff)
downloadrefcat-aa54475d6972d972a2179c1244450adc5d56a477.tar.gz
refcat-aa54475d6972d972a2179c1244450adc5d56a477.zip
update docs
-rw-r--r--skate/README.md6
-rw-r--r--skate/cmd/skate-map/main.go32
2 files changed, 36 insertions, 2 deletions
diff --git a/skate/README.md b/skate/README.md
index 323e786..11f294b 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -79,7 +79,11 @@ Sanitize DOI in tabular file.
Run various matching and verification algorithms.
-## Problem
+### skate-map
+
+A more generic version of derive key.
+
+## Misc
Handling a TB of JSON and billions of documents, especially for the following
use case:
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 5b69993..259e1aa 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -1,5 +1,35 @@
// skate-map runs a given map function over input data. We mostly want to
-// extract a key from a json document.
+// extract a key from a json document. For simple cases, you can use `jq` and
+// other tools. Some key derivations require a bit more.
+//
+// An example with mostly unix tools. We want to extract the DOI and sort by
+// it; we also want to do this fast, hence parallel, LC_ALL, etc.
+//
+// $ zstdcat -T0 file.zst | (1)
+// LC_ALL=C tr -d '\t' | (2) *
+// parallel -j 16 --block 10M --pipe (3) *
+// "jq -rc 'select(.biblio.doi != null) | (4) *
+// [.biblio.doi, (.|tostring)] | @tsv'" | (5) *
+// LC_ALL=C sed 's/\\\\/\\/g' | (6) *
+// LC_ALL=C awk -F $'\t' -v OFS='\t' '$1=tolower($1)' | (7) *
+// skate-to-doi -B -S -f 1 | (8) *
+// LC_ALL=C sort -S 30% --parallel 6 -k1,1 | (9)
+// zstd -c -T0 > skate.out
+//
+// (1) zstd is fast! "~4x faster than zlib" (https://is.gd/HT1DUs)
+// (2) we use tab as column separator, so we want to clean tabs out of the
+// input beforehand (could be skipped, if we limit the number of splits)
+// (3) we pass the data to jq, with a somewhat larger block size (--block; default is 1MB)
+// (4) we want no "null" output
+// (5) tostring prints input as string, because we need to carry the document forward
+// (6) but we need some cleanup, too
+// (7) we normalize the DOI to lowercase
+// (8) a custom filter to normalize a DOI in a specific column
+// (9) sorting by DOI
+//
+// This is reasonably fast, but some cleanup is ugly. We also want more complex
+// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8).
+
package main
import (