From eb057b6e2a1f74b9a74a164e13c9042332bb1244 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 8 Aug 2021 10:16:43 +0200 Subject: wip: paper --- docs/Simple/main.pdf | Bin 91537 -> 92494 bytes docs/Simple/main.tex | 14 ++++++-------- docs/Simple/refs.bib | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'docs') diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf index d279395..9545257 100644 Binary files a/docs/Simple/main.pdf and b/docs/Simple/main.pdf differ diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex index e754d98..2c3001d 100644 --- a/docs/Simple/main.tex +++ b/docs/Simple/main.tex @@ -201,7 +201,10 @@ issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any va \end{center} \end{table} -Overall, a map-reduce style approach is followed, which allows for some +Overall, a map-reduce style~\citep{dean2010mapreduce} approach is +followed\footnote{While the operations are similar, the processing is not +distributed but runs on a single machine. For space efficiency, zstd~\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows +for some uniformity in the overall processing. We extract (key, document) tuples (as TSV) from the raw JSON data and sort by key. We then group documents with the same key and apply a function on each group in order to generate @@ -243,7 +246,7 @@ As other dataset in this field we expect this dataset to be iterated upon. processing based on updated data as it becomes available. \item Metadata extraction from PDFs depends on supervised machine learning - models, which in turn depends training sets. With additional crawls and + models, which in turn depends on available training sets. With additional crawls and metadata available we hope to improve models used for metadata extraction, improving yield and reducing data extraction artifacts in the process. 
@@ -260,12 +263,7 @@ As other dataset in this field we expect this dataset to be iterated upon. \section{Acknowledgements} This work is partially supported by a grant from the \emph{Andrew W. Mellon -Foundation}. We like to thanks various teams at the Internet Archive for -providing necessary infrastructure, and also data processing expertise. We are -also indebted to various open source software tools and their maintainers as -well as open scholarly data projects - without those this work would be much -harder if possible at all. - +Foundation}. \section{Appendix A} diff --git a/docs/Simple/refs.bib b/docs/Simple/refs.bib index 599a386..5ae3fc8 100644 --- a/docs/Simple/refs.bib +++ b/docs/Simple/refs.bib @@ -160,3 +160,21 @@ note = {Accessed: 2021-07-30} year={2016} } +@article{dean2010mapreduce, + title={MapReduce: a flexible data processing tool}, + author={Dean, Jeffrey and Ghemawat, Sanjay}, + journal={Communications of the ACM}, + volume={53}, + number={1}, + pages={72--77}, + year={2010}, + publisher={ACM New York, NY, USA} +} + +@article{collet2018zstandard, + title={Zstandard Compression and the application/zstd Media Type}, + author={Collet, Yann and Kucherawy, Murray}, + journal={RFC 8478}, + year={2018} +} + -- cgit v1.2.3