From d616b41778cad391d6f71cf909def982ac2775b3 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 8 Oct 2021 21:23:13 +0200 Subject: doc: report tweaks --- docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 140827 -> 139904 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 79 ++++++++++++++------------ 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf index 0ec16c7..e820aca 100644 Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index e1f985c..76fd5d3 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -267,19 +267,22 @@ references: after an initial cleaning procedure we currently find 25,405,592 web links\footnote{The cleaning process is necessary because OCR artifacts and other metadata issues exist in the data. Unfortunately, even after cleaning not all links will be in the form as originally intended by the authors.} in the -reference corpus, of which 4,827,688 have been preserved as of August 2021 with an HTTP 200 +reference corpus, of which 4,828,283 (19\%) have been preserved as of August 2021 with an HTTP 200 status code in the Wayback Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of -the Internet Archive. 
- -In a random sample of 8000 links we find only 6138 responding -with an HTTP 200 OK, whereas the rest of the links yield a variety of HTTP status -codes, like 404, 403, 500 and others - resulting in 23\% of the links -in the reference corpus preserved at the Internet Archive being currently inaccessible on -the web\footnote{We used the - \href{https://github.com/miku/clinker}{https://github.com/miku/clinker} command - line link checking tool.} - making targeted web crawling and preservation of -scholarly references a key activity for maintaining citation integrity. +the Internet Archive. As an upper bound - if we include all redirection (HTTP +3XX) and server error status codes (HTTP 5XX) - we find a total of 14,306,019 (56.3\%) links preserved. + +We ran a live URL check over a sample of 364,415 links found in the reference +corpus. Of the 364,415 links we find 305,476 (83.8\%) responding with an HTTP +200 OK, whereas the rest of the links yield a variety of HTTP status codes, +like 404, 403, 500 and others - resulting in about 16\% of the links in the +reference corpus preserved at the Internet Archive being currently inaccessible +on the web\footnote{We used the + \href{https://github.com/miku/clinker}{https://github.com/miku/clinker} + command line link checking tool.} - making targeted web crawling and +preservation of scholarly references a key activity for maintaining +citation integrity. % unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W" @@ -303,33 +306,35 @@ extraction tools. Each combination of fields may require a slightly different processing path. For example, references with an Arxiv identifier can be processed differently -from references with only a title. Over 50\% of the raw reference data comes -from a set of eight field set variants, as listed in -Table~\ref{table:fields}. 
-\begin{table}[] - \begin{center} - \begin{tabular}{ll} - \toprule - \bf{Fields} & \bf{Percentage} \\ - \midrule - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ - \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\ - \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\ - \end{tabular} - \vspace*{2mm} - \caption{Top 8 combinations of available fields in raw reference data - accounting for about 53\% of the total data (CN = container name, CRN = - contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS = - issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.} - \label{table:fields} - \end{center} -\end{table} +% Over 50\% of the raw reference data comes +% from a set of eight field set variants, as listed in +% Table~\ref{table:fields}. 
+% +% \begin{table}[] +% \begin{center} +% \begin{tabular}{ll} +% \toprule +% \bf{Fields} & \bf{Percentage} \\ +% \midrule +% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ +% \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\ +% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\ +% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\ +% \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\ +% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\ +% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\ +% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\ +% \end{tabular} +% \vspace*{2mm} +% \caption{Top 8 combinations of available fields in raw reference data +% accounting for about 53\% of the total data (CN = container name, CRN = +% contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS = +% issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.} +% \label{table:fields} +% \end{center} +% \end{table} \subsection{Data Sources} -- cgit v1.2.3