From 0f9f1386d9ad66040d5a546bf037f41c173a84ab Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 5 Aug 2021 18:40:29 +0200
Subject: wip: paper

---
 docs/Simple/main.pdf | Bin 99828 -> 88783 bytes
 docs/Simple/main.tex |  31 +++++++++++++++++++++----------
 2 files changed, 21 insertions(+), 10 deletions(-)
 (limited to 'docs/Simple')

diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf
index 469fa95..3a66061 100644
Binary files a/docs/Simple/main.pdf and b/docs/Simple/main.pdf differ
diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex
index 36f2074..b0fbaed 100644
--- a/docs/Simple/main.tex
+++ b/docs/Simple/main.tex
@@ -10,6 +10,7 @@
 \usepackage{booktabs}      % professional-quality tables
 \usepackage{amsfonts}      % blackboard math symbols
 \usepackage{nicefrac}      % compact symbols for 1/2, etc.
+\usepackage{caption}
 \usepackage{datetime}
 
 \providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1}
@@ -158,14 +159,15 @@ TODO: how matches are established and a short note on overlap with COCI DOI.
 \section{System Design}
 
 The constraints for the system design are informed by the volume and the
-variety of the data. In total, the raw inputs amount to a few TB of textual
-content, mostly newline delimited JSON. More importantly, while the number of
-data fields is low, certain schemas are very partial with hundreds of different
-combinations of available field values found in the raw reference data. This is
-most likely caused by aggregators passing on reference data coming from
-hundreds of sources, each of which not necessarily agreeing on a common
-granularity for citation data and from artifacts of machine learning based
-structured data extraction tools.
+variety of the data. The capability to run the whole graph derivation on a
+single machine (commodity hardware) was a secondary goal as well. In total,
+the raw inputs amount to a few TB of textual content, mostly newline-delimited
+JSON.
+More importantly, while
+the number of data fields is low, certain schemas are only sparsely populated,
+with hundreds of different combinations of available field values found in the
+raw reference data. This is most likely caused by aggregators passing on
+reference data from hundreds of sources, not all of which agree on a common
+granularity for citation data, and by artifacts of machine-learning-based
+structured data extraction tools.
 
 Each combination of fields may require a slightly different processing path.
 For example, references with an arXiv identifier can be processed differently
@@ -262,8 +264,17 @@ harder or not possible at all.
 
 \section{Appendix A}
 
+
+A note on data quality: while we implement various data quality measures,
+real-world data, especially data coming from many different sources, will
+contain errors and bugs. Among other measures, we keep track of match reasons,
+especially for fuzzy matching, to be able to zoom in on systematic errors more
+easily (see~Table~\ref{table:matches}).
+
 \begin{table}[]
     \footnotesize
+    \captionsetup{font=normalsize}
     \begin{center}
         \begin{tabular}{@{}rlll@{}}
             \toprule
@@ -319,11 +330,11 @@ harder or not possible at all.
             1 & fatcat & exact & doi \\ \bottomrule
         \end{tabular}
         \vspace*{2mm}
-        \caption{Table of match counts, reference provenance, match status and
+        \caption{Table of match counts, reference provenance, match status, and
         match reason. The match reason identifiers encode specific rules in the
        domain-dependent verification process and are included for completeness;
        we do not include the details of each rule in this report.}
-        \label{table:fields}
+        \label{table:matches}
    \end{center}
 \end{table}
-- 
cgit v1.2.3
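The schema-variability observation in the patch above (hundreds of distinct
combinations of populated fields in the raw newline-delimited JSON) can be
measured with a short sketch. This is an illustration only, not code from the
repository; the record field names used in the example are hypothetical.

```python
import json
from collections import Counter

def field_combinations(lines):
    """Count distinct combinations of non-empty fields in NDJSON records.

    A field counts as present when its value is not None, "", or [].
    """
    combos = Counter()
    for line in lines:
        record = json.loads(line)
        present = tuple(sorted(
            key for key, value in record.items()
            if value not in (None, "", [])
        ))
        combos[present] += 1
    return combos

# Made-up reference records (field names are assumptions):
sample = [
    '{"doi": "10.1/x", "title": "A", "year": 2001}',
    '{"title": "B", "year": null, "arxiv": "2101.00001"}',
    '{"doi": "10.1/y", "title": "C", "year": 1999}',
]
combos = field_combinations(sample)
# Two combinations here: ("doi", "title", "year") and ("arxiv", "title").
```

On real reference dumps, the size of the resulting counter would directly
report how many field combinations a processing pipeline has to handle.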