aboutsummaryrefslogtreecommitdiffstats
path: root/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
diff options
context:
space:
mode:
Diffstat (limited to 'docs/TR-20210808100000-IA-WDS-REFCAT/main.tex')
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.tex79
1 files changed, 42 insertions, 37 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index e1f985c..76fd5d3 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -267,19 +267,22 @@ references: after an initial cleaning procedure we currently find 25,405,592
web links\footnote{The cleaning process is necessary because OCR artifacts and
other metadata issues exist in the data. Unfortunately, even after cleaning not
all links will be in the form as originally intended by the authors.} in the
-reference corpus, of which 4,827,688 have been preserved as of August 2021 with an HTTP 200
+reference corpus, of which 4,828,283 (19\%) have been preserved as of August 2021 with an HTTP 200
status code in the Wayback
Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of
-the Internet Archive.
-
-In a random sample of 8000 links we find only 6138 responding
-with an HTTP 200 OK, whereas the rest of the links yield a variety of HTTP status
-codes, like 404, 403, 500 and others - resulting in 23\% of the links
-in the reference corpus preserved at the Internet Archive being currently inaccessible on
-the web\footnote{We used the
- \href{https://github.com/miku/clinker}{https://github.com/miku/clinker} command
- line link checking tool.} - making targeted web crawling and preservation of
-scholarly references a key activity for maintaining citation integrity.
+the Internet Archive. As an upper bound---if we include all redirection (HTTP
+3XX) and server error (HTTP 5XX) status codes---we find a total of 14,306,019 (56.3\%) links preserved.
+
+We ran a live URL check over a sample of 364,415 links found in the reference
+corpus. Of these 364,415 links, we find 305,476 (83.8\%) responding with an HTTP
+200 OK, whereas the rest of the links yield a variety of HTTP status codes,
+like 404, 403, 500 and others---resulting in about 16\% of the links in the
+reference corpus preserved at the Internet Archive being currently inaccessible
+on the web\footnote{We used the
+ \href{https://github.com/miku/clinker}{https://github.com/miku/clinker}
+ command line link checking tool.}---making targeted web crawling and
+preservation of scholarly references a key activity for maintaining
+citation integrity.
% unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W"
@@ -303,33 +306,35 @@ extraction tools.
Each combination of fields may require a slightly different processing path.
For example, references with an Arxiv identifier can be processed differently
-from references with only a title. Over 50\% of the raw reference data comes
-from a set of eight field set variants, as listed in
-Table~\ref{table:fields}.
+from references with only a title.
-\begin{table}[]
- \begin{center}
- \begin{tabular}{ll}
- \toprule
- \bf{Fields} & \bf{Percentage} \\
- \midrule
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
- \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
- \end{tabular}
- \vspace*{2mm}
- \caption{Top 8 combinations of available fields in raw reference data
- accounting for about 53\% of the total data (CN = container name, CRN =
- contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
- issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
- \label{table:fields}
- \end{center}
-\end{table}
+% Over 50\% of the raw reference data comes
+% from a set of eight field set variants, as listed in
+% Table~\ref{table:fields}.
+%
+% \begin{table}[]
+% \begin{center}
+% \begin{tabular}{ll}
+% \toprule
+% \bf{Fields} & \bf{Percentage} \\
+% \midrule
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
+% \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
+% \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
+% \end{tabular}
+% \vspace*{2mm}
+% \caption{Top 8 combinations of available fields in raw reference data
+% accounting for about 53\% of the total data (CN = container name, CRN =
+% contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+% issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
+% \label{table:fields}
+% \end{center}
+% \end{table}
\subsection{Data Sources}