From eb057b6e2a1f74b9a74a164e13c9042332bb1244 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 8 Aug 2021 10:16:43 +0200 Subject: wip: paper --- docs/Simple/main.pdf | Bin 91537 -> 92494 bytes docs/Simple/main.tex | 14 ++++++-------- docs/Simple/refs.bib | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'docs') diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf index d279395..9545257 100644 Binary files a/docs/Simple/main.pdf and b/docs/Simple/main.pdf differ diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex index e754d98..2c3001d 100644 --- a/docs/Simple/main.tex +++ b/docs/Simple/main.tex @@ -201,7 +201,10 @@ issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any va \end{center} \end{table} -Overall, a map-reduce style approach is followed, which allows for some +Overall, a map-reduce style~\citep{dean2010mapreduce} approach is +followed\footnote{While the operations are similar, the processing is not +distributed but runs on a single machine. For space efficiency, zstd~\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows +for some uniformity in the overall processing. We extract (key, document) tuples (as TSV) from the raw JSON data and sort by key. We then group documents with the same key and apply a function on each group in order to generate @@ -243,7 +246,7 @@ As other dataset in this field we expect this dataset to be iterated upon. processing based on updated data as it becomes available. \item Metadata extraction from PDFs depends on supervised machine learning - models, which in turn depends training sets. With additional crawls and + models, which in turn depends on available training sets. With additional crawls and metadata available we hope to improve models used for metadata extraction, improving yield and reducing data extraction artifacts in the process. 
@@ -260,12 +263,7 @@ As other dataset in this field we expect this dataset to be iterated upon. \section{Acknowledgements} This work is partially supported by a grant from the \emph{Andrew W. Mellon -Foundation}. We like to thanks various teams at the Internet Archive for -providing necessary infrastructure, and also data processing expertise. We are -also indebted to various open source software tools and their maintainers as -well as open scholarly data projects - without those this work would be much -harder if possible at all. - +Foundation}. \section{Appendix A} diff --git a/docs/Simple/refs.bib b/docs/Simple/refs.bib index 599a386..5ae3fc8 100644 --- a/docs/Simple/refs.bib +++ b/docs/Simple/refs.bib @@ -160,3 +160,21 @@ note = {Accessed: 2021-07-30} year={2016} } +@article{dean2010mapreduce, + title={MapReduce: a flexible data processing tool}, + author={Dean, Jeffrey and Ghemawat, Sanjay}, + journal={Communications of the ACM}, + volume={53}, + number={1}, + pages={72--77}, + year={2010}, + publisher={ACM New York, NY, USA} +} + +@article{collet2018zstandard, + title={Zstandard Compression and the application/zstd Media Type}, + author={Collet, Yann and Kucherawy, Murray}, + journal={RFC 8478}, + year={2018} +} + -- cgit v1.2.3