From 9a75e6d549d36b68e7f58c9c1494a6d89071bf90 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 7 Sep 2021 20:45:35 +0200
Subject: doc: tweaks

---
 docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 104442 -> 104537 bytes
 docs/TR-20210808100000-IA-WDS-REFCAT/main.tex |  18 ++++++++----------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
index 933338a..e4c1361 100644
Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index 682a3bc..e99ddc3 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -98,7 +98,7 @@ strings partially describing a scholarly artifact.
 
 \section{Related Work}
 
-Two typical problems that arise in the process of compiling a citation graph
+Two typical problems which arise in the process of compiling a citation graph
-dataset are related to data aquisition and citation matching. Data acquisition
+dataset are related to data acquisition and citation matching. Data acquisition
 itself can take different forms: bibliographic metadata can contain explicit
 reference data as provided by publishers and aggregators; this data can be
@@ -127,14 +127,10 @@ Projects centered around citations or containing citation data as a core
 component are COCI, the ``OpenCitations Index of Crossref open DOI-to-DOI
 citations'', which was first released
 2018-07-29\footnote{\url{https://opencitations.net/download}} and has been
-regularly updated~\citep{peroni2020opencitations}.
-
-The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
+regularly updated~\citep{peroni2020opencitations}. The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
 ``a Wikimedia initiative to develop open citations and linked bibliographic
-data to serve free knowledge'' continously adds citations to its
+data to serve free knowledge'' continuously adds citations to its
-database\footnote{\url{http://wikicite.org/statistics.html}}.
-
-Microsoft Academic Graph~\citep{sinha2015overview} is comprised of a number of
+database\footnote{\url{http://wikicite.org/statistics.html}}. Microsoft Academic Graph~\citep{sinha2015overview} is comprised of a number of
 entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
 with \emph{PaperReferences} being one relation among many others.
 
@@ -225,6 +221,8 @@ seen in~Table~\ref{table:cocicmp}.
 % zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
 % find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
 
+% TODO: some more numbers on the structure
+
 
 \section{System Design}
 
@@ -278,7 +276,7 @@ PDF extraction. The bibliographic metadata is taken from fatcat, which itself
 harvests and imports web accessible sources such as Crossref, Pubmed, Arxiv,
 Datacite, DOAJ, dblp and others into its catalog (as the source permits, data
-is processed continously or in batches). Reference data from PDF documents has
+is processed continuously or in batches). Reference data from PDF documents has
-been extracted with GROBID\footnote{GROBID v0.5.5}, with the TEI-XML results
+been extracted with GROBID\footnote{GROBID \href{https://github.com/kermitt2/grobid/releases/tag/0.5.5}{v0.5.5}}, with the TEI-XML results
 being cached locally in a key-value store accessible with an S3 API. Archived
 PDF documents result from dedicated web-scale crawls of scholarly domains
 conducted with
@@ -321,8 +319,8 @@ framework\footnote{\url{https://github.com/spotify/luigi}~\citep{bernhardsson201
 	application, like~\citep{schulz2016use},~\citep{erdmann2017design},~\citep{lampa2019scipipe},~\citep{czygan2014design}
 	and others.} allows for experimentation in the pipeline and for single command
 derivations, as data dependencies are encoded with the help of the
-orchestrator. Within the tasks, we also utilize classic platfrom tools such as
-sort~\citep{mcilroy1971research}.
+orchestrator. Within the tasks, we also utilize classic platform tools such as
+\emph{sort}~\citep{mcilroy1971research}.
 
 With a few schema conversions, fuzzy matching can be applied to Wikipedia
 articles and Open Library (edition) records as well. The aspect of precision
-- 
cgit v1.2.3