aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-10-08 22:18:04 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-10-08 22:18:04 +0200
commitfffb6c9969ad2b99db57b805be4192040250637e (patch)
tree3dd4c8fef75b88b41a628da73f9c031fcc767d22
parent19493e27833475a43c8ec0f6308a447a4362de91 (diff)
downloadrefcat-fffb6c9969ad2b99db57b805be4192040250637e.tar.gz
refcat-fffb6c9969ad2b99db57b805be4192040250637e.zip
docs: report tweaks
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.pdfbin128892 -> 130701 bytes
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.tex30
2 files changed, 17 insertions, 13 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
index 1fb8f0c..4d34e91 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
Binary files differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index 8651031..11ea1fa 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -12,6 +12,7 @@
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{caption}
\usepackage{datetime}
+\usepackage{float}
\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1}
\setlength{\parindent}{0pt}
@@ -203,14 +204,14 @@ and \emph{refcat} overlap to the most part, as can be seen in~Table~\ref{table:c
\begin{center}
\begin{tabular}{lll}
\toprule
- \bf{Set} & & \bf{Count} \\
+ \textbf{Set} & & \textbf{Count} \\
\midrule
- COCIv11 (C) & & 1,186,958,897 \\ % zstdcat -T0 6741422v11.csv.zst | pv -l | wc -l
- \emph{refcat-doi} (R) & & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst # LC_ALL=C wc -l uniq_34_doi_lower_sorted.csv
- C $\cap$ R & overlap & 1,046,438,515 \\
- C $\setminus$ R & COCIv11 only & 140,520,382 \\ % 86,854,309 \\
- R $\setminus$ C & refcat-doi only & 256,985,697 \\ % xxx 295,884,246
+ COCIv11 (C) & & 1,186,958,897 \\ % zstdcat -T0 6741422v11.csv.zst | pv -l | wc -l
+ \emph{refcat-doi} (R) & & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst # LC_ALL=C wc -l uniq_34_doi_lower_sorted.csv
+ C $\cap$ R & overlap & 1,046,438,515 \\
+ C $\setminus$ R & COCIv11 only & 140,520,382 \\ % 86,854,309 \\
+ R $\setminus$ C & \emph{refcat-doi} only & 256,985,697 \\ % xxx 295,884,246
\end{tabular}
\vspace*{2mm}
\caption{Comparison between Open Citations COCI corpus (v11,
@@ -270,11 +271,11 @@ web links\footnote{The cleaning process is necessary because OCR artifacts and
reference corpus, of which 4,828,283 (19\%) have been preserved as of August 2021 with an HTTP 200
status code in the Wayback
Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of
-the Internet Archive. As an upper bound - if we include all redirection (HTTP
-3XX) and server error status code (HTTP 5XX) - we find a total of 14,306,019 (56.3\%) links preserved.
+the Internet Archive. As an upper bound---if we additionally include all redirection (HTTP
+3XX) and server error status codes (HTTP 5XX)---we find a total of 14,306,019 (56.3\%) links preserved.
-We ran a live URL check over a sample of 364415 links found in the reference
-corpus. Of the 364415 links we find 305476 (83.8\%) responding with an HTTP
+We ran a live URL check\footnote{All links accessed on 2021-10-04 and 2021-10-05.} over a sample of 364,415 links which appear in the reference
+corpus \emph{and} have an archived copy with an HTTP 200 status code in the Wayback Machine. Of the 364,415 links we find 305,476 (83.8\%) responding with an HTTP
200 OK, whereas the rest of the links yield a variety of HTTP status codes,
like 404, 403, 500 and others---resulting in about 16\% of the links in the
reference corpus preserved at the Internet Archive being currently inaccessible
@@ -447,8 +448,9 @@ This work is partially supported by grants from the \emph{Andrew W. Mellon
Foundation}, especially ``Ensuring the Persistent Access of Open Access Journal
Literature: Phase II'' (1910-07256, Jefferson Bailey, Principal Investigator).
+\appendix
-\section{Appendix A}
+\section*{Appendix: Reference Relations}
Figure~\ref{fig:types} shows the schematic reference relations.
@@ -461,7 +463,9 @@ Figure~\ref{fig:types} shows the schematic reference relations.
\label{fig:types}
\end{figure}
-\section{Appendix B}
+% \pagebreak{}
+
+\section*{Appendix: Data Quality}
A note on data quality: While we implement various data quality measures,
@@ -470,7 +474,7 @@ issues. Among other measures, we keep track of match reasons,
especially for fuzzy matching to be able to zoom in on systematic errors
more easily (see~Table~\ref{table:matches}).
-\begin{table}[]
+\begin{table}[H]
\footnotesize
\captionsetup{font=normalsize}
\begin{center}