From 730612615d6c3919f98cbb5aeaa9956b8b1a65c7 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 3 Aug 2021 00:46:12 +0200 Subject: update docs --- docs/TR-20210730212057-IA-WDS-CG/main.pdf | Bin 98138 -> 99346 bytes docs/TR-20210730212057-IA-WDS-CG/main.tex | 13 +++++++++++++ skate/map.go | 2 ++ 3 files changed, 15 insertions(+) diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.pdf b/docs/TR-20210730212057-IA-WDS-CG/main.pdf index ddff7fe..c8bb5a3 100644 Binary files a/docs/TR-20210730212057-IA-WDS-CG/main.pdf and b/docs/TR-20210730212057-IA-WDS-CG/main.pdf differ diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.tex b/docs/TR-20210730212057-IA-WDS-CG/main.tex index faeab73..a7edac3 100644 --- a/docs/TR-20210730212057-IA-WDS-CG/main.tex +++ b/docs/TR-20210730212057-IA-WDS-CG/main.tex @@ -246,6 +246,19 @@ time limited. Map and reduce operations are parallelized and certain processing steps can process 100K documents per second or even more on commodity hardware with spinning disks. +\section{Quality Assurance} + +Understanding data quality plays a role, as the data is coming from a myriad of +sources, each with possible idiosyncratic features or missing values. We employ +a few QA measures during the process. First, we try to pass each data item +through only one processing pipeline (e.g. items matched by any identifier +should not even be considered for fuzzy matching). If duplicate links appear in +the final dataset nonetheless, we remove them, preferring exact over fuzzy matches. + +We employ a couple of data cleaning techniques, e.g. to find and verify +identifiers like ISBN or to sanitize URLs found in the data. Many of these +artifacts stem from the fact that large chunks of the raw data come from +heuristic data extraction from PDF documents. 
\section{Discussion} diff --git a/skate/map.go b/skate/map.go index 34e2f2c..ca98186 100644 --- a/skate/map.go +++ b/skate/map.go @@ -113,6 +113,8 @@ type Mapper func([]byte) ([][]byte, error) // AsTSV serializes the result of a field mapper as TSV. This is a slim // adapter, e.g. to parallel.Processor, which expects this function signature. // A newline will be appended, if not there already. +// +// Anecdotally a parallelized implementation of a mapper can process around 300MiB/s. func (f Mapper) AsTSV(p []byte) ([]byte, error) { var ( fields [][]byte -- cgit v1.2.3