aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-10-08 21:23:13 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-10-08 21:23:13 +0200
commitd616b41778cad391d6f71cf909def982ac2775b3 (patch)
treeb1a1b81b110fc889a8411e296f628bf0ecbe0c65
parenta593442e8b38f27da039ae20ae5e7b49e5dafdd1 (diff)
downloadrefcat-d616b41778cad391d6f71cf909def982ac2775b3.tar.gz
refcat-d616b41778cad391d6f71cf909def982ac2775b3.zip
doc: report tweaks
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.pdfbin140827 -> 139904 bytes
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.tex79
2 files changed, 42 insertions, 37 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
index 0ec16c7..e820aca 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
Binary files differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index e1f985c..76fd5d3 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -267,19 +267,22 @@ references: after an initial cleaning procedure we currently find 25,405,592
web links\footnote{The cleaning process is necessary because OCR artifacts and
other metadata issues exist in the data. Unfortunately, even after cleaning not
all links will be in the form as originally intended by the authors.} in the
-reference corpus, of which 4,827,688 have been preserved as of August 2021 with an HTTP 200
+reference corpus, of which 4,828,283 (19\%) have been preserved as of August 2021 with an HTTP 200
status code in the Wayback
Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of
-the Internet Archive.
-
-In a random sample of 8000 links we find only 6138 responding
-with an HTTP 200 OK, whereas the rest of the links yield a variety of HTTP status
-codes, like 404, 403, 500 and others - resulting in 23\% of the links
-in the reference corpus preserved at the Internet Archive being currently inaccessible on
-the web\footnote{We used the
- \href{https://github.com/miku/clinker}{https://github.com/miku/clinker} command
- line link checking tool.} - making targeted web crawling and preservation of
-scholarly references a key activity for maintaining citation integrity.
+the Internet Archive. As an upper bound - if we include all redirection (HTTP
+3XX) and server error (HTTP 5XX) status codes - we find a total of 14,306,019 (56.3\%) links preserved.
+
+We ran a live URL check over a sample of 364,415 links found in the reference
+corpus. Of these 364,415 links, we find 305,476 (83.8\%) responding with an HTTP
+200 OK, whereas the rest of the links yield a variety of HTTP status codes,
+like 404, 403, 500 and others - resulting in about 16\% of the links in the
+reference corpus preserved at the Internet Archive being currently inaccessible
+on the web\footnote{We used the
+ \href{https://github.com/miku/clinker}{https://github.com/miku/clinker}
+ command line link checking tool.} - making targeted web crawling and
+preservation of scholarly references a key activity for maintaining
+citation integrity.
% unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W"
@@ -303,33 +306,35 @@ extraction tools.
Each combination of fields may require a slightly different processing path.
For example, references with an Arxiv identifier can be processed differently
-from references with only a title. Over 50\% of the raw reference data comes
-from a set of eight field set variants, as listed in
-Table~\ref{table:fields}.
+from references with only a title.
-\begin{table}[]
- \begin{center}
- \begin{tabular}{ll}
- \toprule
- \bf{Fields} & \bf{Percentage} \\
- \midrule
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
- \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
- \end{tabular}
- \vspace*{2mm}
- \caption{Top 8 combinations of available fields in raw reference data
- accounting for about 53\% of the total data (CN = container name, CRN =
- contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
- issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
- \label{table:fields}
- \end{center}
-\end{table}
+% Over 50\% of the raw reference data comes
+% from a set of eight field set variants, as listed in
+% Table~\ref{table:fields}.
+%
+% \begin{table}[]
+% \begin{center}
+% \begin{tabular}{ll}
+% \toprule
+% \bf{Fields} & \bf{Percentage} \\
+% \midrule
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
+% \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
+% \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
+% \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
+% \end{tabular}
+% \vspace*{2mm}
+% \caption{Top 8 combinations of available fields in raw reference data
+% accounting for about 53\% of the total data (CN = container name, CRN =
+% contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+% issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
+% \label{table:fields}
+% \end{center}
+% \end{table}
\subsection{Data Sources}