diff options
Diffstat (limited to 'docs/TR-20210808100000-IA-WDS-REFCAT/main.tex')
-rw-r--r-- | docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 71 |
1 files changed, 50 insertions, 21 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index 5d84e85..c95a7d6 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -81,7 +81,7 @@ Open alternatives were started such as the Open Citations Corpus (OCC) in 2010 references~\citep{shotton2013publishing}. Other notable projects include CiteSeer~\citep{giles1998citeseer}, CiteSeerX~\citep{wu2019citeseerx} and CitEc\footnote{\url{https://citec.repec.org}}. The last decade has seen the emergence of more openly available, large scale -citation projects like Microsoft Academic~\citep{sinha2015overview} or the +citation projects like Microsoft Academic~\citep{sinha2015overview} and the Initiative for Open Citations\footnote{\url{https://i4oc.org}}~\citep{shotton2018funders}. In 2021, over one billion citations are publicly available, marking a ``tipping point'' for this category of data~\citep{hutchins2021tipping}. @@ -179,31 +179,32 @@ information about the match status and provenance. The dataset currently contains 1,323,423,672 citations across 76,327,662 entities (55,123,635 unique source and 60,244,206 unique target work identifiers; for 1,303,424,212 - or 98.49\% of all citations - we do have a DOI -for both source and target). The majority of matches - 1,250,523,321 - are +for both source and target). The majority of matches - 1,250,523,321 - is established through identifier based matching (DOI, PMID, PMCID, ARXIV, ISBN). -72,900,351 citations are established through fuzzy matching techniques. The -majority of citations between COCI and \emph{refcat} overlap, as can be seen -in~Table~\ref{table:cocicmp}. +72,900,351 citations are established through fuzzy matching techniques. 
+Citations from the Open Citations COCI corpus\footnote{Reference dataset COCI +v11, released 2021-09-04, +\href{http://opencitations.net/index/coci}{http://opencitations.net/index/coci}} +and \emph{refcat} overlap for the most part, as can be seen in~Table~\ref{table:cocicmp}. \begin{table}[] \begin{center} \begin{tabular}{ll} \toprule - \bf{Set} & \bf{Count} \\ + \bf{Set} & \bf{Count} \\ \midrule - COCI (C) & 1,094,394,688 \\ - \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst - C $\cap$ R & 1,007,539,966 \\ - C $\setminus$ R & 86,854,309 \\ - R $\setminus$ C & 295,884,246 + COCIv11 (C) & 1,186,958,897 \\ % zstdcat -T0 6741422v11.csv.zst | pv -l | wc -l + \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst # LC_ALL=C wc -l uniq_34_doi_lower_sorted.csv + C $\cap$ R & xxx 1,007,539,966 \\ + C $\setminus$ R & xxx 86,854,309 \\ + R $\setminus$ C & xxx 295,884,246 \end{tabular} \vspace*{2mm} - \caption{Comparison between COCI and \emph{refcat-doi}, a subset of - \emph{refcat} where entities have a known DOI. At least 50\% of the - 295,884,246 references only in \emph{refcat-doi} come from links - recorded within a specific dataset provider (GBIF, DOI prefix: - 10.15468).} + \caption{Comparison between Open Citations COCI corpus (v11, 2021-09-04) + and \emph{refcat-doi}, a subset of \emph{refcat} where entities + have a known DOI. At least 50\% of the 295,884,246 references only + in \emph{refcat-doi} come from links recorded within a specific dataset provider (GBIF, DOI prefix: 10.15468).} \label{table:cocicmp} \end{center} \end{table} @@ -212,6 +213,9 @@ in~Table~\ref{table:cocicmp}. 
% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst % find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst +% v11 +% time zstdcat -T0 /magna/data/opencitations/6741422v11.csv.zst | cut -d, -f2,3 | tr '[:upper:]' '[:lower:]' | LC_ALL=C sort -S50% -T /sandcrawler-db/tmp-refcat | pv -l > 6741422v11_doi_lower.csv + % TODO: some more numbers on the structure % * doi-to-doi @@ -224,12 +228,11 @@ in~Table~\ref{table:cocicmp}. \begin{center} \begin{tabular}{ll} \toprule - \bf{Edge type} & \bf{Count} \\ + \bf{Edge type} & \bf{Count} \\ \midrule - total & 1,323,423,672 \\ - doi-doi & 1,178,488,264 \\ - target-open-library & 1,552,931 \\ - source-wikipedia & 1,386,941 \\ + doi-doi & xxx 1,178,488,264 \\ + target-open-library & 20,307,064 \\ + source-wikipedia & 1,386,941 \\ \end{tabular} \vspace*{2mm} \caption{Output structure, e.g. edges between documents that both have a doi (doi-doi).} @@ -237,6 +240,32 @@ in~Table~\ref{table:cocicmp}. \end{center} \end{table} +We started to include non-traditional citations into the graph, such as links +to books as recorded by the Open Library Project and links from the English +Wikipedia to scholarly works. For links to Open Library we employ both +identifier based and fuzzy matching; for Wikipedia references we used an +existing dataset~\citep{harshdeep_singh_2020_3940692} and we are contributing +to upstream projects related to Wikipedia citation extraction, such as +\emph{wikiciteparser}\footnote{\href{https://github.com/dissemin/wikiciteparser}{https://github.com/dissemin/wikiciteparser}} +to generate updates to the dataset. Table~\ref{table:structure} lists the +counts for these links. 
Additionally, we are examining web links appearing in +references: after an initial cleaning procedure we currently find 25,405,592 +web links\footnote{The cleaning process is necessary because OCR artifacts and +other metadata issues exist in the data. Unfortunately, even after cleaning not +all links will be in the form as originally intended by the authors.} in the +reference corpus, of which 4,827,688 have been preserved with an HTTP 200 +status code in the Wayback +Machine\footnote{\href{https://archive.org/web/}{https://archive.org/web/}} of +the Internet Archive. From a sample\footnote{In a sample of 8000 links we find +only 6138 responding with an HTTP 200, whereas the rest of the links yield a +variety of HTTP status codes, like 404, 403, 500 and others.} we observe that +about 23\% of the reference corpus links preserved at the Internet +Archive are not accessible on the world wide web currently - making targeted +web crawling and preservation of scholarly references an essential tool for +maintaining citation integrity. + +% unpigz -c fatcat-refs-urllist-2021-06-17_lookup-20210714045637.tsv.gz| LC_ALL=C grep -F ')/' | grep -c -E "\W200\W" + \section{System Design} \subsection{Constraints} |