From 53cf246ad220b29a7741853ca4ad4b2ce874b627 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 30 Jul 2021 22:40:04 +0200 Subject: wip: paper --- docs/TR-20210730212057-IA-WDS-CG/main.tex | 233 ++++++++++++++++++++++++++++++ 1 file changed, 233 insertions(+) create mode 100644 docs/TR-20210730212057-IA-WDS-CG/main.tex (limited to 'docs/TR-20210730212057-IA-WDS-CG/main.tex') diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.tex b/docs/TR-20210730212057-IA-WDS-CG/main.tex new file mode 100644 index 0000000..a6b5cd7 --- /dev/null +++ b/docs/TR-20210730212057-IA-WDS-CG/main.tex @@ -0,0 +1,233 @@ +\documentclass{article} + + + +\usepackage{arxiv} + +\usepackage[utf8]{inputenc} % allow utf-8 input +\usepackage[T1]{fontenc} % use 8-bit T1 fonts +\usepackage{hyperref} % hyperlinks +\usepackage{url} % simple URL typesetting +\usepackage{booktabs} % professional-quality tables +\usepackage{amsfonts} % blackboard math symbols +\usepackage{nicefrac} % compact symbols for 1/2, etc. +\usepackage{microtype} % microtypography +\usepackage{lipsum} % Can be removed after putting your text content +\usepackage{graphicx} +\usepackage{natbib} +\usepackage{doi} + + + +\title{Internet Archive Scholar Citation Graph Dataset} + +\date{August 10, 2021} % Here you can change the date presented in the paper title +%\date{} % Or removing it + +\author{ Martin Czygan \\ + Internet Archive\\ + San Francisco, CA 94118 \\ + \texttt{martin@archive.org} \\ + %% examples of more authors + \And + Bryan Newbold \\ + Internet Archive\\ + San Francisco, CA 94118 \\ + \texttt{bnewbold@archive.org} \\ + \And + Helge Holzmann \\ + Internet Archive\\ + San Francisco, CA 94118 \\ + \texttt{helge@archive.org} \\ + \And + Jefferson Bailey \\ + Internet Archive\\ + San Francisco, CA 94118 \\ + \texttt{jefferson@archive.org} \\ + %% \AND + %% Coauthor \\ + %% Affiliation \\ + %% Address \\ + %% \texttt{email} \\ + %% \And + %% Coauthor \\ + %% Affiliation \\ + %% Address \\ + %% \texttt{email} \\ + %% 
\And + %% Coauthor \\ + %% Affiliation \\ + %% Address \\ + %% \texttt{email} \\ +} + +% Uncomment to remove the date +%\date{} + +% Uncomment to override the `A preprint' in the header +\renewcommand{\headeright}{Technical Report} +\renewcommand{\undertitle}{Technical Report} +% \renewcommand{\shorttitle}{\textit{arXiv} Template} + +%%% Add PDF metadata to help others organize their library +%%% Once the PDF is generated, you can check the metadata with +%%% $ pdfinfo template.pdf +\hypersetup{ +pdftitle={Internet Archive Scholar Citation Graph Dataset}, +pdfsubject={cs.DL, cs.IR}, +pdfauthor={Martin Czygan, Bryan Newbold, Helge Holzmann, Jefferson Bailey}, +pdfkeywords={Web Archiving, Citation Graph}, +} + +\begin{document} +\maketitle + +\begin{abstract} +As part of its scholarly data efforts, the Internet Archive releases a citation +graph dataset derived from scholarly publications and additional data sources. It is +composed of data gathered by the \href{https://fatcat.wiki}{fatcat cataloging project} and related +web-scale crawls targeting primary and secondary scholarly outputs. In +addition, relations are worked out between scholarly publications, web pages +and their archived copies, books from the Open Library project as well as +Wikipedia articles. + +As of version ``20210810'', the graph consists of over X nodes +and over Y edges. We release this dataset under a Z open license in the +collection at \href{https://archive.org/details/citation\_graph}{https://archive.org/details/citation\_graph}, as well as all code +used for derivation under an MIT license. +\end{abstract} + + +% keywords can be removed +\keywords{Citation Graph Dataset \and Scholarly Communications \and Web Archiving} + + +\section{Introduction} + +The Internet Archive releases a first version of a citation graph dataset +derived from a raw corpus of about 2.5B references gathered from metadata and +from data obtained by PDF extraction tools such as GROBID~\citep{lopez2009grobid}. 
+The goal of this report is to describe briefly the current contents and the +derivation of the Internet Archive Scholar Citation Graph Dataset (IASCG). We expect +this dataset to be iterated upon, with changes both in content and processing. + +Modern citation indexes can be traced back to the early computing age, when +projects like the Science Citation Index (1955)~\citep{garfield2007evolution} +were first devised, living on in existing commercial knowledge bases today. +Open alternatives were started such as the Open Citations Corpus (OCC) in 2010---the +first version of which contained 6,325,178 individual +references~\citep{shotton2013publishing}. Other notable sources from that time +include CiteSeerX~\citep{wu2019citeseerx} and CitEc~\citep{CitEc}. + + + +%\lipsum[2] +%\lipsum[3] + + +% \section{Headings: first level} +% \label{sec:headings} +% +% \lipsum[4] See Section \ref{sec:headings}. +% +% \subsection{Headings: second level} +% \lipsum[5] +% \begin{equation} +% \xi _{ij}(t)=P(x_{t}=i,x_{t+1}=j|y,v,w;\theta)= {\frac {\alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}{\sum _{i=1}^{N} \sum _{j=1}^{N} \alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}} +% \end{equation} +% +% \subsubsection{Headings: third level} +% \lipsum[6] +% +% \paragraph{Paragraph} +% \lipsum[7] +% +% +% +% \section{Examples of citations, figures, tables, references} +% \label{sec:others} +% +% \subsection{Citations} +% Citations use \verb+natbib+. The documentation may be found at +% \begin{center} +% \url{http://mirrors.ctan.org/macros/latex/contrib/natbib/natnotes.pdf} +% \end{center} +% +% Here is an example usage of the two main commands (\verb+citet+ and \verb+citep+): Some people thought a thing \citep{kour2014real, hadash2018estimate} but other people thought something else \citep{kour2014fast}. 
Many people have speculated that if we knew exactly why \citet{kour2014fast} thought this\dots +% +% \subsection{Figures} +% \lipsum[10] +% See Figure \ref{fig:fig1}. Here is how you add footnotes. \footnote{Sample of the first footnote.} +% \lipsum[11] +% +% \begin{figure} +% \centering +% \fbox{\rule[-.5cm]{4cm}{4cm} \rule[-.5cm]{4cm}{0cm}} +% \caption{Sample figure caption.} +% \label{fig:fig1} +% \end{figure} +% +% \subsection{Tables} +% See awesome Table~\ref{tab:table}. +% +% The documentation for \verb+booktabs+ (`Publication quality tables in LaTeX') is available from: +% \begin{center} +% \url{https://www.ctan.org/pkg/booktabs} +% \end{center} +% +% +% \begin{table} +% \caption{Sample table title} +% \centering +% \begin{tabular}{lll} +% \toprule +% \multicolumn{2}{c}{Part} \\ +% \cmidrule(r){1-2} +% Name & Description & Size ($\mu$m) \\ +% \midrule +% Dendrite & Input terminal & $\sim$100 \\ +% Axon & Output terminal & $\sim$10 \\ +% Soma & Cell body & up to $10^6$ \\ +% \bottomrule +% \end{tabular} +% \label{tab:table} +% \end{table} +% +% \subsection{Lists} +% \begin{itemize} +% \item Lorem ipsum dolor sit amet +% \item consectetur adipiscing elit. +% \item Aliquam dignissim blandit est, in dictum tortor gravida eget. In ac rutrum magna. +% \end{itemize} + + +\bibliographystyle{unsrtnat} +\bibliography{references} %%% Uncomment this line and comment out the ``thebibliography'' section below to use the external .bib file (using bibtex) . + + +%%% Uncomment this section and comment out the \bibliography{references} line above to use inline references. +% \begin{thebibliography}{1} + +% \bibitem{kour2014real} +% George Kour and Raid Saabne. +% \newblock Real-time segmentation of on-line handwritten arabic script. +% \newblock In {\em Frontiers in Handwriting Recognition (ICFHR), 2014 14th +% International Conference on}, pages 417--422. IEEE, 2014. + +% \bibitem{kour2014fast} +% George Kour and Raid Saabne. 
+% \newblock Fast classification of handwritten on-line arabic characters. +% \newblock In {\em Soft Computing and Pattern Recognition (SoCPaR), 2014 6th +% International Conference of}, pages 312--318. IEEE, 2014. + +% \bibitem{hadash2018estimate} +% Guy Hadash, Einat Kermany, Boaz Carmeli, Ofer Lavi, George Kour, and Alon +% Jacovi. +% \newblock Estimate and replace: A novel approach to integrating deep neural +% networks with existing applications. +% \newblock {\em arXiv preprint arXiv:1804.09028}, 2018. + +% \end{thebibliography} + + +\end{document} -- cgit v1.2.3