path: root/docs/TR-20210730212057-IA-WDS-CG/main.tex
diff options
authorMartin Czygan <martin.czygan@gmail.com>2021-07-30 22:40:04 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-07-30 22:40:04 +0200
commit53cf246ad220b29a7741853ca4ad4b2ce874b627 (patch)
treed8a65e8c946c9777c2f100e12c3453e5d7c7b6a8 /docs/TR-20210730212057-IA-WDS-CG/main.tex
parent629e5e8b25a68ca77213b1465affb689680b57ad (diff)
wip: paper
Diffstat (limited to 'docs/TR-20210730212057-IA-WDS-CG/main.tex')
1 files changed, 233 insertions, 0 deletions
diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.tex b/docs/TR-20210730212057-IA-WDS-CG/main.tex
new file mode 100644
index 0000000..a6b5cd7
--- /dev/null
+++ b/docs/TR-20210730212057-IA-WDS-CG/main.tex
@@ -0,0 +1,233 @@
+\usepackage[utf8]{inputenc} % allow utf-8 input
+\usepackage[T1]{fontenc} % use 8-bit T1 fonts
+\usepackage{hyperref} % hyperlinks
+\usepackage{url} % simple URL typesetting
+\usepackage{booktabs} % professional-quality tables
+\usepackage{amsfonts} % blackboard math symbols
+\usepackage{nicefrac} % compact symbols for 1/2, etc.
+\usepackage{microtype} % microtypography
+\usepackage{lipsum} % Can be removed after putting your text content
+\title{Internet Archive Scholar Citation Graph Dataset}
+\date{August 10, 2021} % Here you can change the date presented in the paper title
+%\date{} % Or removing it
+\author{ Martin Czygan \\
+ Internet Archive\\
+ San Francisco, CA 94118 \\
+ \texttt{martin@archive.org} \\
+ %% examples of more authors
+ \And
+ Bryan Newbold \\
+ Internet Archive\\
+ San Francisco, CA 94118 \\
+ \texttt{bnewbold@archive.org} \\
+ \And
+ Helge Holzmann \\
+ Internet Archive\\
+ San Francisco, CA 94118 \\
+ \texttt{helge@archive.org} \\
+ \And
+ Jefferson Bailey \\
+ Internet Archive\\
+ San Francisco, CA 94118 \\
+ \texttt{jefferson@archive.org} \\
+ %% \AND
+ %% Coauthor \\
+ %% Affiliation \\
+ %% Address \\
+ %% \texttt{email} \\
+ %% \And
+ %% Coauthor \\
+ %% Affiliation \\
+ %% Address \\
+ %% \texttt{email} \\
+ %% \And
+ %% Coauthor \\
+ %% Affiliation \\
+ %% Address \\
+ %% \texttt{email} \\
+% Uncomment to remove the date
+% Uncomment to override the `A preprint' in the header
+\renewcommand{\headeright}{Technical Report}
+\renewcommand{\undertitle}{Technical Report}
+% \renewcommand{\shorttitle}{\textit{arXiv} Template}
+%%% Add PDF metadata to help others organize their library
+%%% Once the PDF is generated, you can check the metadata with
+%%% $ pdfinfo template.pdf
+pdftitle={Internet Archive Scholar Citation Graph Dataset},
+pdfsubject={cs.DL, cs.IR},
+pdfauthor={Martin Czygan, Bryan Newbold, Helge Holzmann, Jefferson Bailey},
+pdfkeywords={Web Archiving, Citation Graph},
+As part of its scholarly data efforts, the Internet Archive releases a citation
+graph dataset derived from scholarly publications and additional data sources. It is
+composed of data gathered by the \href{https://fatcat.wiki}{fatcat cataloging project} and related
+web-scale crawls targeting primary and secondary scholarly outputs. In
+addition, relations are worked out between scholarly publications, web pages
+and their archived copies, books from the Open Library project as well as
+Wikipedia articles.
+As of version ``20210810'', the graph consists of over X nodes
+and over Y edges. We release this dataset under a Z open license in the
+collection at \url{https://archive.org/details/citation_graph}, as well as all code
+used for derivation under an MIT license.
+% keywords can be removed
+\keywords{Citation Graph Dataset \and Scholarly Communications \and Web Archiving}
+The Internet Archive releases a first version of a citation graph dataset
+derived from a raw corpus of about 2.5B references gathered from metadata and
+from data obtained by PDF extraction tools such as GROBID~\citep{lopez2009grobid}.
+The goal of this report is to briefly describe the current contents and the
+derivation of the Internet Archive Scholar Citation Graph Dataset (IASCG). We expect
+this dataset to be iterated upon, with changes in both content and processing.
+Modern citation indexes can be traced back to the early computing age, when
+projects like the Science Citation Index (1955)~\citep{garfield2007evolution}
+were first devised, living on in existing commercial knowledge bases today.
+Open alternatives were started, such as the Open Citations Corpus (OCC) in 2010---the
+first version of which contained 6,325,178 individual
+references~\citep{shotton2013publishing}. Other notable sources from that time
+include CiteSeerX~\citep{wu2019citeseerx} and CitEc~\citep{CitEc}.
+% \section{Headings: first level}
+% \label{sec:headings}
+% \lipsum[4] See Section \ref{sec:headings}.
+% \subsection{Headings: second level}
+% \lipsum[5]
+% \begin{equation}
+% \xi _{ij}(t)=P(x_{t}=i,x_{t+1}=j|y,v,w;\theta)= {\frac {\alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}{\sum _{i=1}^{N} \sum _{j=1}^{N} \alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}}
+% \end{equation}
+% \subsubsection{Headings: third level}
+% \lipsum[6]
+% \paragraph{Paragraph}
+% \lipsum[7]
+% \section{Examples of citations, figures, tables, references}
+% \label{sec:others}
+% \subsection{Citations}
+% Citations use \verb+natbib+. The documentation may be found at
+% \begin{center}
+% \url{http://mirrors.ctan.org/macros/latex/contrib/natbib/natnotes.pdf}
+% \end{center}
+% Here is an example usage of the two main commands (\verb+citet+ and \verb+citep+): Some people thought a thing \citep{kour2014real, hadash2018estimate} but other people thought something else \citep{kour2014fast}. Many people have speculated that if we knew exactly why \citet{kour2014fast} thought this\dots
+% \subsection{Figures}
+% \lipsum[10]
+% See Figure \ref{fig:fig1}. Here is how you add footnotes. \footnote{Sample of the first footnote.}
+% \lipsum[11]
+% \begin{figure}
+% \centering
+% \fbox{\rule[-.5cm]{4cm}{4cm} \rule[-.5cm]{4cm}{0cm}}
+% \caption{Sample figure caption.}
+% \label{fig:fig1}
+% \end{figure}
+% \subsection{Tables}
+% See awesome Table~\ref{tab:table}.
+% The documentation for \verb+booktabs+ (`Publication quality tables in LaTeX') is available from:
+% \begin{center}
+% \url{https://www.ctan.org/pkg/booktabs}
+% \end{center}
+% \begin{table}
+% \caption{Sample table title}
+% \centering
+% \begin{tabular}{lll}
+% \toprule
+% \multicolumn{2}{c}{Part} \\
+% \cmidrule(r){1-2}
+% Name & Description & Size ($\mu$m) \\
+% \midrule
+% Dendrite & Input terminal & $\sim$100 \\
+% Axon & Output terminal & $\sim$10 \\
+% Soma & Cell body & up to $10^6$ \\
+% \bottomrule
+% \end{tabular}
+% \label{tab:table}
+% \end{table}
+% \subsection{Lists}
+% \begin{itemize}
+% \item Lorem ipsum dolor sit amet
+% \item consectetur adipiscing elit.
+% \item Aliquam dignissim blandit est, in dictum tortor gravida eget. In ac rutrum magna.
+% \end{itemize}
+\bibliography{references} %%% Uncomment this line and comment out the ``thebibliography'' section below to use the external .bib file (using bibtex) .
+%%% Uncomment this section and comment out the \bibliography{references} line above to use inline references.
+% \begin{thebibliography}{1}
+% \bibitem{kour2014real}
+% George Kour and Raid Saabne.
+% \newblock Real-time segmentation of on-line handwritten arabic script.
+% \newblock In {\em Frontiers in Handwriting Recognition (ICFHR), 2014 14th
+% International Conference on}, pages 417--422. IEEE, 2014.
+% \bibitem{kour2014fast}
+% George Kour and Raid Saabne.
+% \newblock Fast classification of handwritten on-line arabic characters.
+% \newblock In {\em Soft Computing and Pattern Recognition (SoCPaR), 2014 6th
+% International Conference of}, pages 312--318. IEEE, 2014.
+% \bibitem{hadash2018estimate}
+% Guy Hadash, Einat Kermany, Boaz Carmeli, Ofer Lavi, George Kour, and Alon
+% Jacovi.
+% \newblock Estimate and replace: A novel approach to integrating deep neural
+% networks with existing applications.
+% \newblock {\em arXiv preprint arXiv:1804.09028}, 2018.
+% \end{thebibliography}