From d616b41778cad391d6f71cf909def982ac2775b3 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 8 Oct 2021 21:23:13 +0200
Subject: doc: report tweaks

---
 docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 140827 -> 139904 bytes
 docs/TR-20210808100000-IA-WDS-REFCAT/main.tex |  79 ++++++++++++++------------
 2 files changed, 42 insertions(+), 37 deletions(-)

(limited to 'docs')

diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
index 0ec16c7..e820aca 100644
Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index e1f985c..76fd5d3 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -267,19 +267,22 @@ references: after an initial cleaning procedure we currently find 25,405,592
 web links\footnote{The cleaning process is necessary because OCR artifacts and
 	other metadata issues exist in the data. Unfortunately, even after cleaning not
 	all links will be in the form as originally intended by the authors.} in the
-reference corpus, of which 4,827,688 have been preserved as of August 2021 with an HTTP 200
+reference corpus, of which 4,828,283 (19\%) have been preserved as of August 2021 with an HTTP 200
 status code in the Wayback
 Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of
-the Internet Archive.
-
-In a random sample of 8000 links we find only 6138 responding
-with an HTTP 200 OK, whereas the rest of the links yield a variety of HTTP status
-codes, like 404, 403, 500 and others - resulting in 23\% of the links
-in the reference corpus preserved at the Internet Archive being currently inaccessible on
-the web\footnote{We used the
-	\href{https://github.com/miku/clinker}{https://github.com/miku/clinker} command
-	line link checking tool.} - making targeted web crawling and preservation of
-scholarly references a key activity for maintaining citation integrity.
+the Internet Archive. As an upper bound --- if we include all redirection (HTTP
+3XX) and server error (HTTP 5XX) status codes --- we find a total of 14,306,019 (56.3\%) links preserved.
+
+We ran a live URL check over a sample of 364,415 links found in the reference
+corpus. Of these 364,415 links, we find 305,476 (83.8\%) responding with an HTTP
+200 OK, whereas the rest of the links yield a variety of HTTP status codes,
+like 404, 403, 500 and others --- resulting in about 16\% of the links in the
+reference corpus preserved at the Internet Archive being currently inaccessible
+on the web\footnote{We used the
+	\href{https://github.com/miku/clinker}{https://github.com/miku/clinker}
+	command line link checking tool.} --- making targeted web crawling and
+preservation of scholarly references a key activity for maintaining
+citation integrity.
 
 % unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W"
 
@@ -303,33 +306,35 @@ extraction tools.
 
 Each combination of fields may require a slightly different processing path.
 For example, references with an Arxiv identifier can be processed differently
-from references with only a title. Over 50\% of the raw reference data comes
-from a set of eight field set variants, as listed in
-Table~\ref{table:fields}.
+from references with only a title.
 
-\begin{table}[]
-	\begin{center}
-		\begin{tabular}{ll}
-			\toprule
-			\bf{Fields}                                                                                     & \bf{Percentage} \\
-			\midrule
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$  U $\cdot$  V $\cdot$ Y}          & 14\%            \\
-			\multicolumn{1}{l}{\textbf{DOI}}                                                                & 14\%            \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\%             \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y}           & 4\%             \\
-			\multicolumn{1}{l}{\textbf{PMID} $\cdot$ U}                                                     & 4\%             \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y}           & 4\%             \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y}                                                    & 4\%             \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y}                     & 4\%             \\
-		\end{tabular}
-		\vspace*{2mm}
-		\caption{Top 8 combinations of available fields in raw reference data
-			accounting for about 53\% of the total data (CN = container name, CRN =
-			contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
-			issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
-		\label{table:fields}
-	\end{center}
-\end{table}
+% Over 50\% of the raw reference data comes
+% from a set of eight field set variants, as listed in
+% Table~\ref{table:fields}.
+%
+% \begin{table}[]
+% 	\begin{center}
+% 		\begin{tabular}{ll}
+% 			\toprule
+% 			\bf{Fields}                                                                                     & \bf{Percentage} \\
+% 			\midrule
+% 			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$  U $\cdot$  V $\cdot$ Y}          & 14\%            \\
+% 			\multicolumn{1}{l}{\textbf{DOI}}                                                                & 14\%            \\
+% 			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\%             \\
+% 			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y}           & 4\%             \\
+% 			\multicolumn{1}{l}{\textbf{PMID} $\cdot$ U}                                                     & 4\%             \\
+% 			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y}           & 4\%             \\
+% 			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y}                                                    & 4\%             \\
+% 			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y}                     & 4\%             \\
+% 		\end{tabular}
+% 		\vspace*{2mm}
+% 		\caption{Top 8 combinations of available fields in raw reference data
+% 			accounting for about 53\% of the total data (CN = container name, CRN =
+% 			contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+% 			issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
+% 		\label{table:fields}
+% 	\end{center}
+% \end{table}
 
 \subsection{Data Sources}
 
-- 
cgit v1.2.3