docs: tr tweaks

author: Martin Czygan <martin.czygan@gmail.com> 2021-09-10 14:50:22 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-09-10 14:50:22 +0200
commit: 0e205c80d21c806b2779c2c3bc293e84a38b57b1 (patch)
tree: cdd9c2ead9159ffce383b26398370814fbc88e0d /docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
parent: ef690d9c8b28253d689bb74d6edb31c4611ef2f8 (diff)
download: refcat-0e205c80d21c806b2779c2c3bc293e84a38b57b1.tar.gz
refcat-0e205c80d21c806b2779c2c3bc293e84a38b57b1.zip
1 files changed, 50 insertions, 21 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index 5d84e85..c95a7d6 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -81,7 +81,7 @@ Open alternatives were started such as the Open Citations Corpus (OCC) in 2010
 references~\citep{shotton2013publishing}. Other notable projects
 include CiteSeer~\citep{giles1998citeseer}, CiteSeerX~\citep{wu2019citeseerx} and CitEc\footnote{\url{https://citec.repec.org}}. The last
 decade has seen the emergence of more openly available, large scale
-citation projects like Microsoft Academic~\citep{sinha2015overview} or the
+citation projects like Microsoft Academic~\citep{sinha2015overview} and the
 Initiative for Open Citations\footnote{\url{https://i4oc.org}}~\citep{shotton2018funders}.
 In 2021, over one billion citations are publicly available, marking a ``tipping point''
 for this category of data~\citep{hutchins2021tipping}.
@@ -179,31 +179,32 @@ information about the match status and provanance.
 The dataset currently contains 1,323,423,672 citations across 76,327,662
 entities (55,123,635 unique source and 60,244,206 unique target work
 identifiers; for 1,303,424,212 - or 98.49\% of all citations - we do have a DOI
-for both source and target).  The majority of matches - 1,250,523,321 - are
+for both source and target).  The majority of matches - 1,250,523,321 - is
 established through identifier based matching (DOI, PMID, PMCID, ARXIV, ISBN).
-72,900,351 citations are established through fuzzy matching techniques. The
-majority of citations between COCI and \emph{refcat} overlap, as can be seen
-in~Table~\ref{table:cocicmp}.
+72,900,351 citations are established through fuzzy matching techniques.
+Citations from the Open Citations COCI corpus\footnote{Reference dataset COCI
+v11, released 2021-09-04,
+\href{http://opencitations.net/index/coci}{http://opencitations.net/index/coci}}
+and \emph{refcat} overlap for the most part, as can be seen in~Table~\ref{table:cocicmp}.
 
 \begin{table}[]
 	\begin{center}
 		\begin{tabular}{ll}
 			\toprule
-			\bf{Set}              & \bf{Count}    \\
+			\bf{Set}              & \bf{Count}        \\
 
 			\midrule
-			COCI (C)              & 1,094,394,688 \\
-			\emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
-			C $\cap$ R            & 1,007,539,966 \\
-			C $\setminus$ R       & 86,854,309    \\
-			R $\setminus$ C       & 295,884,246
+			COCIv11 (C)              & 1,186,958,897     \\ % zstdcat -T0 6741422v11.csv.zst | pv -l | wc -l
+			\emph{refcat-doi} (R) &  1,303,424,212    \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst # LC_ALL=C wc -l uniq_34_doi_lower_sorted.csv
+			C $\cap$ R            & xxx 1,007,539,966 \\
+			C $\setminus$ R       & xxx 86,854,309    \\
+			R $\setminus$ C       & xxx 295,884,246
 		\end{tabular}
 		\vspace*{2mm}
-		\caption{Comparison between COCI and \emph{refcat-doi}, a subset of
-			\emph{refcat} where entities have a known DOI. At least 50\% of the
-			295,884,246 references only in \emph{refcat-doi} come from links
-			recorded within a specific dataset provider (GBIF, DOI prefix:
-			10.15468).}
+		\caption{Comparison between the Open Citations COCI corpus (v11, 2021-09-04)
+			and \emph{refcat-doi}, a subset of \emph{refcat} where entities
+			have a known DOI. At least 50\% of the 295,884,246 references only
+			in \emph{refcat-doi} come from links recorded within a specific dataset provider (GBIF, DOI prefix: 10.15468).}
 		\label{table:cocicmp}
 	\end{center}
 \end{table}
@@ -212,6 +213,9 @@ in~Table~\ref{table:cocicmp}.
 % zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
 % find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
 
+% v11
+% time zstdcat -T0 /magna/data/opencitations/6741422v11.csv.zst | cut -d, -f2,3 | tr '[:upper:]' '[:lower:]' | LC_ALL=C sort -S50% -T /sandcrawler-db/tmp-refcat | pv -l > 6741422v11_doi_lower.csv
+
 % TODO: some more numbers on the structure
 
 % * doi-to-doi
@@ -224,12 +228,11 @@ in~Table~\ref{table:cocicmp}.
 	\begin{center}
 		\begin{tabular}{ll}
 			\toprule
-			\bf{Edge type}      & \bf{Count} \\
+			\bf{Edge type}      & \bf{Count}    \\
 			\midrule
-			total               & 1,323,423,672 \\
-			doi-doi             & 1,178,488,264 \\
-			target-open-library & 1,552,931  \\
-			source-wikipedia    & 1,386,941  \\
+			doi-doi             & xxx 1,178,488,264 \\
+			target-open-library & 20,307,064     \\
+			source-wikipedia    & 1,386,941     \\
 		\end{tabular}
 		\vspace*{2mm}
 		\caption{Output structure, e.g. edges between documents that both have a doi (doi-doi).}
@@ -237,6 +240,32 @@ in~Table~\ref{table:cocicmp}.
 	\end{center}
 \end{table}
 
+We started to include non-traditional citations in the graph, such as links
+to books as recorded by the Open Library Project and links from the English
+Wikipedia to scholarly works. For links to Open Library we employ both
+identifier based and fuzzy matching; for Wikipedia references we used an
+existing dataset~\citep{harshdeep_singh_2020_3940692} and we are contributing
+to upstream projects related to Wikipedia citation extraction, such as
+\emph{wikiciteparser}\footnote{\href{https://github.com/dissemin/wikiciteparser}{https://github.com/dissemin/wikiciteparser}}
+to generate updates to the dataset. Table~\ref{table:structure} lists the
+counts for these links. Additionally, we are examining web links appearing in
+references: after an initial cleaning procedure we currently find 25,405,592
+web links\footnote{The cleaning process is necessary because OCR artifacts and
+other metadata issues exist in the data. Unfortunately, even after cleaning not
+all links will be in the form as originally intended by the authors.} in the
+reference corpus, of which 4,827,688 have been preserved with an HTTP 200
+status code in the Wayback
+Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of
+the Internet Archive. From a sample\footnote{In a sample of 8000 links we find
+only 6138 responding with an HTTP 200, whereas the rest of the links yield a
+variety of HTTP status codes, like 404, 403, 500 and others.} we observe that
+about 23\% of the reference corpus links preserved at the Internet
+Archive are currently not accessible on the World Wide Web---making targeted
+web crawling and preservation of scholarly references an essential tool for
+maintaining citation integrity.
+
+% unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W"
+
 \section{System Design}
 
 \subsection{Constraints}
author	Martin Czygan <martin.czygan@gmail.com>	2021-09-10 14:50:22 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-09-10 14:50:22 +0200
commit	0e205c80d21c806b2779c2c3bc293e84a38b57b1 (patch)
tree	cdd9c2ead9159ffce383b26398370814fbc88e0d /docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
parent	ef690d9c8b28253d689bb74d6edb31c4611ef2f8 (diff)
download	refcat-0e205c80d21c806b2779c2c3bc293e84a38b57b1.tar.gz refcat-0e205c80d21c806b2779c2c3bc293e84a38b57b1.zip