From aa54475d6972d972a2179c1244450adc5d56a477 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 29 Apr 2021 15:03:02 +0200 Subject: update docs --- skate/README.md | 6 +++++- skate/cmd/skate-map/main.go | 32 +++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/skate/README.md b/skate/README.md index 323e786..11f294b 100644 --- a/skate/README.md +++ b/skate/README.md @@ -79,7 +79,11 @@ Sanitize DOI in tabular file. Run various matching and verification algorithms. -## Problem +### skate-map + +A more generic version of derive key. + +## Misc Handling a TB of JSON and billions of documents, especially for the following use case: diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index 5b69993..259e1aa 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -1,5 +1,35 @@ // skate-map runs a given map function over input data. We mostly want to -// extract a key from a json document. +// extract a key from a json document. For simple cases, you can use `jq` and +// other tools. Some key derivations require a bit more. +// +// An example with mostly unix tools. We want to extract the DOI and sort by +// it; we also want to do this fast, hence parallel, LC_ALL, etc. +// +// $ zstdcat -T0 file.zst | (1) +// LC_ALL=C tr -d '\t' | (2) * +// parallel -j 16 --block 10M --pipe (3) * +// "jq -rc 'select(.biblio.doi != null) | (4) * +// [.biblio.doi, (.|tostring)] | @tsv'" | (5) * +// LC_ALL=C sed 's/\\\\/\\/g' | (6) * +// LC_ALL=C awk -F $'\t' -v OFS='\t' '$1=tolower($1)' | (7) * +// skate-to-doi -B -S -f 1 | (8) * +// LC_ALL=C sort -S 30% --parallel 6 -k1,1 | (9) +// zstd -c -T0 > skate.out +// +// (1) zstd is fast! 
"~4x faster than zlib" (https://is.gd/HT1DUs) +// (2) we use tab as column separator and we want to clean this up before (could +// be skipped, if we limit number of splits) +// (3) we pass the data to jq, with a bit larger buffer (default is 1MB) +// (4) we want no "null" output +// (5) tostring prints input as string, because we need to carry the document forward +// (6) but we need some cleanup, too +// (7) we normalize the DOI to lowercase +// (8) a custom filter to normalize a DOI in a specific column +// (9) sorting by DOI +// +// This is reasonably fast, but some cleanup is ugly. We also want more complex +// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8). + package main import ( -- cgit v1.2.3