aboutsummaryrefslogtreecommitdiffstats
path: root/docs/Simple
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-08-07 02:22:36 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-08-07 02:22:36 +0200
commit6b6c80be450f3f8eeca201a15c4c2b83386e2a4c (patch)
tree1492d834f98e51f9dec15e9ca3defc7cb07004fc /docs/Simple
parent07ff5c91d4fc25be81f584c151bbea46aee96f72 (diff)
downloadrefcat-6b6c80be450f3f8eeca201a15c4c2b83386e2a4c.tar.gz
refcat-6b6c80be450f3f8eeca201a15c4c2b83386e2a4c.zip
wip: change dataset name
Diffstat (limited to 'docs/Simple')
-rw-r--r--docs/Simple/main.pdfbin90674 -> 91045 bytes
-rw-r--r--docs/Simple/main.tex12
2 files changed, 8 insertions, 4 deletions
diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf
index 90bc6bd..c6311c8 100644
--- a/docs/Simple/main.pdf
+++ b/docs/Simple/main.pdf
Binary files differ
diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex
index ea52b54..cbe4bc0 100644
--- a/docs/Simple/main.tex
+++ b/docs/Simple/main.tex
@@ -49,7 +49,7 @@ from the Open Library\footnote{\url{https://openlibrary.org}} project and
Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
graph consists of 1,323,423,672 citations. We release this dataset under a CC0
Public Domain Dedication, accessible through an archive
-collection\footnote{\url{https://archive.org/details/fatcat-asref-todo}}. All
+collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
code used in the derivation process is released under an MIT
license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
\end{abstract}
@@ -111,9 +111,9 @@ As mentioned in \citep{hutchins2021tipping}, the number of openly available
citations is not expected to shrink in the future.
-\section{Citation Dataset}
+\section{Dataset}
-We release the first version of the Archive Scholar Reference (ASREF) dataset
+We release the first version of the Fatcat Reference dataset (refcat)
in a format used internally for storage and to serve queries (and which we
call \emph{biblioref} or \emph{bref} for short). The dataset includes metadata
from fatcat and the Open Library Project, links to archived pages in
@@ -148,11 +148,15 @@ seen in~\ref{table:cocicmp}.
A $\setminus$ C &
\end{tabular}
\vspace*{2mm}
- \caption{Comparison between COCI and ASREF-DOI, a subset of ASREF with DOI.}
+ \caption{Comparison between COCI and REFCAT-DOI, a subset of REFCAT where entities have a known DOI.}
\label{table:cocicmp}
\end{center}
\end{table}
+% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
+% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
+
TODO: how matches are established and a short note on overlap with COCI DOI.