From 769ee237046a8553583e0414e1f56877b7f1a847 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Thu, 5 Aug 2021 11:00:53 +0200
Subject: wip: paper

---
 docs/Simple/.gitignore |   5 +
 docs/Simple/Makefile   |   1 +
 docs/Simple/main.pdf   | Bin 0 -> 89394 bytes
 docs/Simple/main.tex   | 262 +++++++++++++++++++++++--------------------------
 docs/Simple/refs.bib   | 132 +++++++++++++++++++++++--
 5 files changed, 254 insertions(+), 146 deletions(-)
 create mode 100644 docs/Simple/.gitignore
 create mode 100644 docs/Simple/main.pdf

(limited to 'docs')

diff --git a/docs/Simple/.gitignore b/docs/Simple/.gitignore
new file mode 100644
index 0000000..5040d53
--- /dev/null
+++ b/docs/Simple/.gitignore
@@ -0,0 +1,5 @@
+*.log
+*.aux
+*.bbl
+*.blg
+*.out
diff --git a/docs/Simple/Makefile b/docs/Simple/Makefile
index e55adc9..d66a575 100644
--- a/docs/Simple/Makefile
+++ b/docs/Simple/Makefile
@@ -2,6 +2,7 @@ main.pdf: main.tex
 	pdflatex main.tex
 	bibtex main
 	pdflatex main.tex
+	pdflatex main.tex
 
 
 .PHONY: clean
diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf
new file mode 100644
index 0000000..067d829
Binary files /dev/null and b/docs/Simple/main.pdf differ
diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex
index 7237262..920b3ac 100644
--- a/docs/Simple/main.tex
+++ b/docs/Simple/main.tex
@@ -1,99 +1,153 @@
-\documentclass[10pt,twocolumn]{article} 
+\documentclass[10pt,twocolumn]{article}
 \usepackage{simpleConference}
 \usepackage{times}
 \usepackage{graphicx}
+\usepackage{natbib}
+\usepackage{doi}
 \usepackage{amssymb}
 \usepackage{url,hyperref}
+\usepackage{booktabs}       % professional-quality tables
+\usepackage{amsfonts}       % blackboard math symbols
+\usepackage{nicefrac}       % compact symbols for 1/2, etc.
+
+\usepackage{datetime}
+\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1}
 
 \begin{document}
 
-\title{\LaTeX\ Template for a Simple, Two-Column Paper}
+\title{Archive Scholar Citation Dataset}
 
-\author{Iroro Orife \\
+\author{Martin Czygan \\
 \\
-Technical Report \\
-Seattle, Washington, USA \\
-\today
+Internet Archive \\
+San Francisco, California, USA \\
+martin@archive.org  \\
+\and
+Bryan Newbold \\
 \\
+Internet Archive \\
+San Francisco, California, USA \\
+bnewbold@archive.org  \\
 \\
-iroro@alumni.cmu.edu  \\
 }
 
+
 \maketitle
 \thispagestyle{empty}
 
-\begin{abstract}
-The material in this template is an edited \& \LaTeX\--ified version of the recommendations here: \url{http://cs.stanford.edu/people/widom/paper-writing.html} The objective would be to help us writers, stay on topic and focused for each section of a report.
 
-For the abstract state the problem, your approach and solution, and the main contributions of the paper. Include little if any background and motivation. Be factual but comprehensive. The material in the abstract should not be repeated later word for word in the paper. 
+\begin{abstract}
+As part of its scholarly data efforts, the Internet Archive releases a citation
+graph dataset derived from scholarly publications and additional data sources. It is
+composed of data gathered by the \href{https://fatcat.wiki}{fatcat cataloging project} and related
+web-scale crawls targeting primary and secondary scholarly outputs. In
+addition, relations are worked out between scholarly publications, web pages
+and their archived copies, books from the Open Library project as well as
+Wikipedia articles. This first version of the graph consists of over X nodes
+and over Y edges. We release this dataset under a Z open license under the
+collection at \href{https://archive.org/details/TODO-citation\_graph}{https://archive.org/details/TODO-citation\_graph}, as well as all code
+used for derivation under an MIT license.
 \end{abstract}
 
+\keywords{Citation Graph, Web Archiving}
 
 \section{Introduction}
-The Introduction is crucially important. By the time a referee has finished the Introduction, he's probably made an initial decision about whether to accept or reject the paper. He'll read the rest of the paper looking for evidence to support his decision. A casual reader will continue on if the Introduction captivated him, and will set the paper aside otherwise. 
-Here is the Stanford InfoLab's patented five-point structure for Introductions. Unless there's a good argument against it, the Introduction should consist of five paragraphs answering the following five questions:
-
-\begin{description}
-  \item[$\bullet$]  What is the problem?
-  \item[$\bullet$]  Why is it interesting and important?
-  \item[$\bullet$]  Why is it hard? Why do naive approaches fail?
-  \item[$\bullet$]  Why hasn't it been solved before? What's wrong with previous proposed solutions? How does mine differ?
-  \item[$\bullet$]  What are the key components of my approach and results? Also include any specific limitations.
-\end{description}
-  
-Then have a final paragraph or subsection: ``Summary of Contributions". It should list the major contributions in bullet form, mentioning in which sections they can be found. This material doubles as an outline of the rest of the paper, saving space and eliminating redundancy.
 
-\section{Related Work}
-
-The perennial question: Should related work be covered near the beginning of the paper or near the end?
 
-\begin{description}
-  \item[$\bullet$]  Beginning, if it can be short yet detailed enough, or if it's critical to take a strong defensive stance about previous work right away. In this case Related Work can be either a subsection at the end of the Introduction, or its own Section 2.
-  \item[$\bullet$]  End, if it can be summarized quickly early on (in the Introduction or Preliminaries), or if sufficient comparisons require the technical content of the paper. In this case Related Work should appear just before the Conclusions, possibly in a more general section ``Discussion and Related Work".
-\end{description}
-
-\section{The Body}
-
-\textbf{Guideline 1:} A clear new important technical contribution should have been articulated by the time the reader finishes page 3 i.e., a quarter of the way through the paper.
-
-\textbf{Guideline 2:} Every section of the paper should tell a story. Don't, however, fall into the common trap of telling the entire story of how you arrived at your results. Just tell the story of the results themselves. The story should be linear, keeping the reader engaged at every step and looking forward to the next step. There should be no significant interruptions -- those can go in the Appendix.
-\\
-\\
-Aside from these guidelines, which apply to every paper, the structure of the body varies a lot depending on content. Important components are:
-
-\begin{description}
-  \item[$\bullet$]  Running Example: When possible, use a running example throughout the paper. It can be introduced either as a subsection at the end of the Introduction, or its own Section 2 or 3 (depending on Related Work).
-  \item[$\bullet$]  Preliminaries: This section, which follows the Introduction and possibly Related Work and/or Running Example, sets up notation and terminology that is not part of the technical contribution. One important function of this section is to delineate material that's not original but is needed for the paper. Be concise -- remember Guideline 1.
-    \item[$\bullet$] Content: The meat of the paper includes algorithms, system descriptions, new language constructs, analyses, etc. Whenever possible use a ``top-down" description: readers should be able to see where the material is going, and they should be able to skip ahead and still get the idea.
-\end{description}
-
-
-\section{Performance Experiments}
-
-We could have an entire treatise on this topic alone and I am surely not the expert. Here are some random thoughts:
-
-\begin{description}
-  \item[$\bullet$]  Many conferences expect experiments.
-  \item[$\bullet$]  It's easy to do ``hokey" or meaningless experiments, and many papers do.
-  \item[$\bullet$]  It's easy to craft experiments to show your work in its best light, and most papers do.
-    \item[$\bullet$]  What should performance experiments measure? Possibilities:  
-    \begin{description}
-  	  \item[$\bullet$] Pure running time
-      \item[$\bullet$] Sensitivity to important parameters
-      \item[$\bullet$] Scalability in various aspects: data size, problem complexity, ...
-  \end{description}
-    \item[$\bullet$]  What should performance experiments show? Possibilities:
-        \begin{description}
-  	  \item[$\bullet$] Absolute performance i.e., it's acceptable/usable
-      \item[$\bullet$] Relative performance to naive approaches
-      \item[$\bullet$] Relative performance to previous approaches
-      \item[$\bullet$] Relative performance among different proposed approaches
-  \end{description}
-  \item[$\bullet$] 
-\end{description}
+The Internet Archive releases a first version of a citation graph dataset
+derived from a raw corpus of about 2.5B references gathered from metadata and
+from data obtained by PDF extraction tools such as GROBID~\cite{lopez2009grobid}.
+The goal of this report is to describe briefly the current contents and the
+derivation of the Archive Scholar Citations Dataset (ASC). We expect
+this dataset to be iterated upon, with changes both in content and processing.
+
+Modern citation indexes can be traced back to the early computing age, when
+projects like the Science Citation Index (1955)~\citep{garfield2007evolution}
+were first devised, living on in existing commercial knowledge bases today.
+Open alternatives were started such as the Open Citations Corpus (OCC) in
+2010---the first version of which contained 6,325,178 individual
+references~\citep{shotton2013publishing}. Other notable sources from that time
+include CiteSeerX~\citep{wu2019citeseerx} and CitEc~\citep{CitEc}. The last
+decade has seen an increase in openly available reference datasets and
+citation projects, like Microsoft Academic~\citep{sinha2015overview} and
+Initiative for Open Citations~\citep{i4oc,shotton2018funders}. In 2021,
+according to~\citep{hutchins2021tipping}, over 1B citations are publicly
+available, marking a tipping point for open citations.
 
+\section{Related Work}
 
-\section{The Conclusions}
+\section{Citation Dataset}
+
+
+\section{System Design}
+
+The constraints for the system design are informed by the volume and the
+variety of the data. In total, the raw inputs amount to a few TB of textual
+content, mostly newline delimited JSON. More importantly, while the number of
+data fields is low, certain schemas are very partial with hundreds of different
+combinations of available field values found in the raw reference data. This is
+most likely caused by aggregators passing on reference data coming from
+hundreds of sources, which do not necessarily agree on a common
+granularity for citation data, and by artifacts of machine-learning-based
+structured data extraction tools.
+
+Each combination of fields may require a slightly different processing path.
+For example, references with an arXiv identifier can be processed differently
+from references with only a title. Over 50\% of the raw reference data comes
+from a set of eight field manifestations, as listed in
+Table~\ref{table:fields}.
+
+\begin{table}[]
+    \begin{center}
+    \begin{tabular}{ll}
+\toprule
+        \textbf{Fields}                                & \textbf{Share} \\
+\midrule
+        \multicolumn{1}{l}{CN|CRN|P|T|U|V|Y}    & 14\%                              \\
+        \multicolumn{1}{l}{DOI}                 & 14\%                              \\
+        \multicolumn{1}{l}{CN|CRN|IS|P|T|U|V|Y} & 5\%                               \\
+        \multicolumn{1}{l}{CN|CRN|DOI|U|V|Y}    & 4\%                               \\
+        \multicolumn{1}{l}{PMID|U}              & 4\%                               \\
+        \multicolumn{1}{l}{CN|CRN|DOI|T|V|Y}    & 4\%                               \\
+        \multicolumn{1}{l}{CN|CRN|Y}            & 4\%                               \\
+        \multicolumn{1}{l}{CN|CRN|DOI|V|Y}      & 4\%                               \\
+    \end{tabular}
+    \vspace*{2mm}
+    \caption{Top 8 combinations of available fields in raw reference data
+        accounting for about 53\% of the total data (CN = container name, CRN =
+contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value.}
+    \label{table:fields}
+\end{center}
+\end{table}
+
+Overall, a map-reduce style approach is followed, which allows for some
+uniformity in the overall processing. We extract (key, document) tuples (as
+TSV) from the raw JSON data and sort by key. Then we group documents with the
+same key and apply a function on each group in order to generate
+our target schema (currently named biblioref, or bref for short) or perform
+additional operations (such as deduplication).
+
+The key derivation can be exact (via an identifier such as DOI, PMID, etc.) or
+based on a normalization procedure, like a slugified title string. For
+identifier based matches we can generate the target biblioref schema directly.
+For fuzzy matching candidates, we pass possible match pairs through a
+verification procedure, which is implemented for release entity schema pairs.
+The current verification procedure is a domain dependent rule based
+verification, able to identify different versions of a publication,
+preprint-published pairs or other kinds of similar documents by calculating
+similarity metrics across title and authors. The fuzzy matching approach is
+applied to all reference documents that only have a title, but no identifier.
+
+With a few schema conversions, fuzzy matching can be applied to Wikipedia
+articles and Open Library (edition) records as well. The aspects of precision
+and recall are represented by the two stages: we are generous in the match
+candidate generation phase in order to improve recall, but we are strict during
+verification, in order to control precision.
+
+
+\section{Fuzzy Matching Approach}
+\section{Quality Assurance}
 
 In general a short summarizing paragraph will do, and under no circumstances should the paragraph simply repeat material from the Abstract or Introduction. In some cases it's possible to now make the original claims more concrete, e.g., by referring to quantitative performance results.
 
@@ -105,81 +159,15 @@ This material is important -- part of the value of a paper is showing how the wo
 \item[$\bullet$]  Conversely, be aware that some researchers look to Future Work sections for research topics. My opinion is that there's nothing wrong with that -- consider it a compliment.
 \end{description}
 
-\section{The Acknowledgements}
+\section{Acknowledgements}
 
 Don't forget them or you'll have people with hurt feelings. Acknowledge anyone who contributed in any way: through discussions, feedback on drafts, implementation, etc. If in doubt about whether to include someone, include them.
 
 
 \section{Citations}
 
-Spend the effort to make all citations complete and consistent. Do not just copy random inconsistent BibTex (or other) entries from the web and call it a day. Check over your final bibliography carefully and make sure every entry looks right.
 
 \section{Appendix A}
-This is a simple sample of a document created using \LaTeX
-   (specifically pdflatex) that includes a figure from the Vergil visual editor for Ptolemy II
-   that was created by printing to the Acrobat Distiller to get a PDF file.
-   It also illustrates a simple two-column conference paper style,
-   and use of bibtex to handle bibligraphies.
-
-This is a sample document for use with pdflatex, which is
-a program that is included with the Miktex distribution
-that directly produces PDF files from \LaTeX sources.
-To run \LaTeX on this file, you need the following files:
-\begin{enumerate}
-\item templatePDF.tex (this file)
-\item figure.pdf (the figure file)
-\item simpleConference.sty (style file)
-\item refs.bib (bibiliography file)
-\end{enumerate}
-\noindent
-To create a PDF file, execute the following commands:
-\begin{enumerate}
-\item pdflatex mainTemplatePDF
-\item bibtex mainTemplatePDF
-\item pdflatex mainTemplatePDF
-\item pdflatex mainTemplatePDF
-\end{enumerate}
-\noindent
-Yes (strangely) it is necessary to run pdflatex three times.
-The result will be a PDF file (plus several other files that \LaTeX
-produces).  You will need a mechanism, of course, for executing
-commands on the command line. If you are using Windows, I recommend
-installing Cygwin and using its bash shell.
-
-\section{Appendix B: How to Include Vergil Diagrams as Figures}
-
-\begin{figure}[!b]
-  \begin{center}
-    \includegraphics[width=3.5in]{figure.pdf}
-  \end{center}
-
-  \caption{\small Figure caption. To get a figure to span two
-      columns, use the environment figure* rather than figure.}
-  \label{fig-label}
-\end{figure}
-
-
-Suppose you wish to include a figure, like that in figure \ref{fig-label}.
-The simplest mechanism is to install Adobe Acrobat, which includes
-a ``printer'' called ``Acrobat Distiller.'' Printing to this printer
-creates a PDF file, which can be included in a document as shown
-here.  To include Ptolemy II models \cite{PtolemyVol1:04},
-just print to the distiller from within Vergil and reference
-the PDF file in your \LaTeX document.
-
-There is a bit more work to do, however.
-The file that is produced by the distiller represents
-a complete page, not the individual figure.
-You can open it in using Acrobat (version 5.0 or later),
-and select Document $\rightarrow$ Crop Pages from the menu.
-In the resulting dialog, check ``Remove White Margins.''
-Save the modified PDF file in a file and then reference
-it in the \LaTeX file as shown in this example.
-
-An alternative is to generate EPS (encapsulated postscript),
-but the process is much more complex and fragile.
-I recommend using pdflatex and Adobe Acrobat.
-
 \bibliographystyle{abbrv}
 \bibliography{refs}
 \end{document}
diff --git a/docs/Simple/refs.bib b/docs/Simple/refs.bib
index b2328b0..bcb8a16 100644
--- a/docs/Simple/refs.bib
+++ b/docs/Simple/refs.bib
@@ -1,9 +1,123 @@
-@techreport{PtolemyVol1:04,
-   Author = {Brooks, C. and Lee, E. A. and Liu, X. and Neuendorffer, S. and Zhao, Y. and Zheng, H.},
-   Title = {Heterogeneous Concurrent Modeling and Design in Java},
-   Institution = {University of California},
-   Number = {Technical Memorandum UCB/ERL M04/27},
-   Month= {July 29},
-   URL ={http://ptolemy.eecs.berkeley.edu/publications/papers/04/ptIIDesignIntro/},
-   Year = {2004}
-}
\ No newline at end of file
+@inproceedings{kour2014real,
+  title={Real-time segmentation of on-line handwritten arabic script},
+  author={Kour, George and Saabne, Raid},
+  booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on},
+  pages={417--422},
+  year={2014},
+  organization={IEEE}
+}
+
+@inproceedings{kour2014fast,
+  title={Fast classification of handwritten on-line Arabic characters},
+  author={Kour, George and Saabne, Raid},
+  booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of},
+  pages={312--318},
+  year={2014},
+  organization={IEEE},
+  doi={10.1109/SOCPAR.2014.7008025}
+}
+
+@article{hadash2018estimate,
+  title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications},
+  author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon},
+  journal={arXiv preprint arXiv:1804.09028},
+  year={2018}
+}
+
+@article{garfield1955citation,
+  title={Citation indexes for science},
+  author={Garfield, Eugene},
+  journal={Science},
+  volume={122},
+  number={3159},
+  pages={108--111},
+  year={1955},
+  publisher={JSTOR}
+}
+
+@inproceedings{lopez2009grobid,
+  title={{GROBID}: Combining automatic bibliographic data recognition and term extraction for scholarship publications},
+  author={Lopez, Patrice},
+  booktitle={International conference on theory and practice of digital libraries},
+  pages={473--474},
+  year={2009},
+  organization={Springer}
+}
+
+@article{garfield2007evolution,
+  title={The evolution of the science citation index},
+  author={Garfield, Eugene},
+  journal={International microbiology},
+  volume={10},
+  number={1},
+  pages={65},
+  year={2007}
+}
+
+@article{shotton2013publishing,
+  title={Publishing: open citations},
+  author={Shotton, David},
+  journal={Nature News},
+  volume={502},
+  number={7471},
+  pages={295},
+  year={2013}
+}
+
+@misc{CitEc,
+  title = {Citations in Economics},
+  howpublished = {\url{https://citec.repec.org/}},
+  note = {Accessed: 2021-07-30}
+}
+
+@inproceedings{wu2019citeseerx,
+  title={CiteSeerX: 20 years of service to scholarly big data},
+  author={Wu, Jian and Kim, Kunho and Giles, C Lee},
+  booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse},
+  pages={1--4},
+  year={2019}
+}
+
+@inproceedings{sinha2015overview,
+  title={An overview of {Microsoft} {Academic} {Service} ({MAS}) and applications},
+  author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan},
+  booktitle={Proceedings of the 24th international conference on world wide web},
+  pages={243--246},
+  year={2015}
+}
+
+@misc{i4oc,
+	title = {Initiative for Open Citations},
+howpublished = {\url{https://i4oc.org/}},
+note = {Accessed: 2021-07-30}
+}
+
+@article{shotton2018funders,
+  title={Funders should mandate open citations.},
+  author={Shotton, David},
+  journal={Nature},
+  volume={553},
+  number={7686},
+  pages={129--130},
+  year={2018},
+  publisher={Nature Publishing Group}
+}
+
+@article{hutchins2021tipping,
+  title={A tipping point for open citation data},
+  author={Hutchins, B Ian},
+  journal={Quantitative Science Studies},
+  pages={1--5},
+  year={2021}
+}
+
+@article{silbert1970world,
+  title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)},
+  author={Silbert, Jeffrey M},
+  journal={Criminology},
+  volume={8},
+  pages={107},
+  year={1970},
+  publisher={HeinOnline}
+}
+
-- 
cgit v1.2.3