aboutsummaryrefslogtreecommitdiffstats
path: root/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
diff options
context:
space:
mode:
Diffstat (limited to 'docs/TR-20210808100000-IA-WDS-REFCAT/main.tex')
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.tex71
1 files changed, 50 insertions, 21 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index 5d84e85..c95a7d6 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -81,7 +81,7 @@ Open alternatives were started such as the Open Citations Corpus (OCC) in 2010
references~\citep{shotton2013publishing}. Other notable projects
include CiteSeer~\citep{giles1998citeseer}, CiteSeerX~\citep{wu2019citeseerx} and CitEc\footnote{\url{https://citec.repec.org}}. The last
decade has seen the emergence of more openly available, large scale
-citation projects like Microsoft Academic~\citep{sinha2015overview} or the
+citation projects like Microsoft Academic~\citep{sinha2015overview} and the
Initiative for Open Citations\footnote{\url{https://i4oc.org}}~\citep{shotton2018funders}.
In 2021, over one billion citations are publicly available, marking a ``tipping point''
for this category of data~\citep{hutchins2021tipping}.
@@ -179,31 +179,32 @@ information about the match status and provanance.
The dataset currently contains 1,323,423,672 citations across 76,327,662
entities (55,123,635 unique source and 60,244,206 unique target work
identifiers; for 1,303,424,212 - or 98.49\% of all citations - we do have a DOI
-for both source and target). The majority of matches - 1,250,523,321 - are
+for both source and target). The majority of matches - 1,250,523,321 - is
established through identifier based matching (DOI, PMID, PMCID, ARXIV, ISBN).
-72,900,351 citations are established through fuzzy matching techniques. The
-majority of citations between COCI and \emph{refcat} overlap, as can be seen
-in~Table~\ref{table:cocicmp}.
+72,900,351 citations are established through fuzzy matching techniques.
+Citations from the Open Citations COCI corpus\footnote{Reference dataset COCI
+v11, released 2021-09-04,
+\href{http://opencitations.net/index/coci}{http://opencitations.net/index/coci}}
+and \emph{refcat} overlap for the most part, as can be seen in~Table~\ref{table:cocicmp}.
\begin{table}[]
\begin{center}
\begin{tabular}{ll}
\toprule
- \bf{Set} & \bf{Count} \\
+ \bf{Set} & \bf{Count} \\
\midrule
- COCI (C) & 1,094,394,688 \\
- \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
- C $\cap$ R & 1,007,539,966 \\
- C $\setminus$ R & 86,854,309 \\
- R $\setminus$ C & 295,884,246
+ COCIv11 (C) & 1,186,958,897 \\ % zstdcat -T0 6741422v11.csv.zst | pv -l | wc -l
+ \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst # LC_ALL=C wc -l uniq_34_doi_lower_sorted.csv
+ C $\cap$ R & xxx 1,007,539,966 \\
+ C $\setminus$ R & xxx 86,854,309 \\
+ R $\setminus$ C & xxx 295,884,246
\end{tabular}
\vspace*{2mm}
- \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
- \emph{refcat} where entities have a known DOI. At least 50\% of the
- 295,884,246 references only in \emph{refcat-doi} come from links
- recorded within a specific dataset provider (GBIF, DOI prefix:
- 10.15468).}
+ \caption{Comparison between Open Citations COCI corpus (v11, 2021-09-04)
+ and \emph{refcat-doi}, a subset of \emph{refcat} where entities
+ have a known DOI. At least 50\% of the 295,884,246 references only
+ in \emph{refcat-doi} come from links recorded within a specific dataset provider (GBIF, DOI prefix: 10.15468).}
\label{table:cocicmp}
\end{center}
\end{table}
@@ -212,6 +213,9 @@ in~Table~\ref{table:cocicmp}.
% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
+% v11
+% time zstdcat -T0 /magna/data/opencitations/6741422v11.csv.zst | cut -d, -f2,3 | tr '[:upper:]' '[:lower:]' | LC_ALL=C sort -S50% -T /sandcrawler-db/tmp-refcat | pv -l > 6741422v11_doi_lower.csv
+
% TODO: some more numbers on the structure
% * doi-to-doi
@@ -224,12 +228,11 @@ in~Table~\ref{table:cocicmp}.
\begin{center}
\begin{tabular}{ll}
\toprule
- \bf{Edge type} & \bf{Count} \\
+ \bf{Edge type} & \bf{Count} \\
\midrule
- total & 1,323,423,672 \\
- doi-doi & 1,178,488,264 \\
- target-open-library & 1,552,931 \\
- source-wikipedia & 1,386,941 \\
+ doi-doi & xxx 1,178,488,264 \\
+ target-open-library & 20,307,064 \\
+ source-wikipedia & 1,386,941 \\
\end{tabular}
\vspace*{2mm}
\caption{Output structure, e.g.\ edges between documents that both have a DOI (doi-doi).}
@@ -237,6 +240,32 @@ in~Table~\ref{table:cocicmp}.
\end{center}
\end{table}
+We started to include non-traditional citations into the graph, such as links
+to books as recorded by the Open Library Project and links from the English
+Wikipedia to scholarly works. For links to Open Library we employ both
+identifier based and fuzzy matching; for Wikipedia references we used an
+existing dataset~\citep{harshdeep_singh_2020_3940692} and we are contributing
+to upstream projects related to Wikipedia citation extraction, such as
+\emph{wikiciteparser}\footnote{\href{https://github.com/dissemin/wikiciteparser}{https://github.com/dissemin/wikiciteparser}}
+to generate updates to the dataset. Table~\ref{table:structure} lists the
+counts for these links. Additionally, we are examining web links appearing in
+references: after an initial cleaning procedure we currently find 25,405,592
+web links\footnote{The cleaning process is necessary because OCR artifacts and
+other metadata issues exist in the data. Unfortunately, even after cleaning not
+all links will be in the form as originally intended by the authors.} in the
+reference corpus, of which 4,827,688 have been preserved with an HTTP 200
+status code in the Wayback
+Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of
+the Internet Archive. From a sample\footnote{In a sample of 8000 links we find
+only 6138 responding with an HTTP 200, whereas the rest of the links yield a
+variety of HTTP status codes, like 404, 403, 500 and others.} we observe that
+about 23\% of the reference corpus links preserved at the Internet
+Archive are not accessible on the world wide web currently - making targeted
+web crawling and preservation of scholarly references an essential tool for
+maintaining citation integrity.
+
+% unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W"
+
\section{System Design}
\subsection{Constraints}