From bd66b58cded2c2c7e7b7e5d374434d6531dd70de Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 8 Aug 2021 15:18:29 +0200 Subject: docs: cleanup and naming --- docs/Simple/.gitignore | 5 - docs/Simple/LICENSE | 21 - docs/Simple/Makefile | 17 - docs/Simple/README.md | 2 - docs/Simple/figure.pdf | Bin 215353 -> 0 bytes docs/Simple/main.pdf | Bin 95636 -> 0 bytes docs/Simple/main.tex | 362 ----------------- docs/Simple/refs.bib | 228 ----------- docs/Simple/simpleConference.sty | 136 ------- docs/TR-20210730212057-IA-WDS-CG/.gitignore | 5 - docs/TR-20210730212057-IA-WDS-CG/Makefile | 9 - docs/TR-20210730212057-IA-WDS-CG/README.md | 49 --- docs/TR-20210730212057-IA-WDS-CG/arxiv.sty | 262 ------------ docs/TR-20210730212057-IA-WDS-CG/main.pdf | Bin 99346 -> 0 bytes docs/TR-20210730212057-IA-WDS-CG/main.tex | 442 --------------------- docs/TR-20210730212057-IA-WDS-CG/references.bib | 123 ------ docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore | 5 + docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE | 21 + docs/TR-20210808100000-IA-WDS-REFCAT/Makefile | 17 + docs/TR-20210808100000-IA-WDS-REFCAT/README.md | 2 + docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf | Bin 0 -> 215353 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 0 -> 95636 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 362 +++++++++++++++++ docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib | 228 +++++++++++ .../simpleConference.sty | 136 +++++++ 25 files changed, 771 insertions(+), 1661 deletions(-) delete mode 100644 docs/Simple/.gitignore delete mode 100644 docs/Simple/LICENSE delete mode 100644 docs/Simple/Makefile delete mode 100644 docs/Simple/README.md delete mode 100644 docs/Simple/figure.pdf delete mode 100644 docs/Simple/main.pdf delete mode 100644 docs/Simple/main.tex delete mode 100644 docs/Simple/refs.bib delete mode 100644 docs/Simple/simpleConference.sty delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/.gitignore delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/Makefile delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/README.md delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/arxiv.sty delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/main.pdf delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/main.tex delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/references.bib create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/Makefile create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/README.md create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/main.tex create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty diff --git a/docs/Simple/.gitignore b/docs/Simple/.gitignore deleted file mode 100644 index 5040d53..0000000 --- a/docs/Simple/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.log -*.aux -*.bbl -*.blg -*.out diff --git a/docs/Simple/LICENSE b/docs/Simple/LICENSE deleted file mode 100644 index 9f5c70f..0000000 --- a/docs/Simple/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2017 Ruoho Ruotsi - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/docs/Simple/Makefile b/docs/Simple/Makefile deleted file mode 100644 index 11264f8..0000000 --- a/docs/Simple/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -main.pdf: main.tex - latexindent -w main.tex && rm -f main.bak* - pdflatex main.tex - bibtex main - pdflatex main.tex - pdflatex main.tex - - -.PHONY: clean -clean: - rm -f main.pdf - rm -f main.aux - rm -f main.log - rm -f main.bbl - rm -f main.blg - rm -f main.out - diff --git a/docs/Simple/README.md b/docs/Simple/README.md deleted file mode 100644 index 3a56517..0000000 --- a/docs/Simple/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# latex-template-arxiv-preprint -A simple LaTeX template for Technical Reports, arXiv preprints & 2-column Conference papers diff --git a/docs/Simple/figure.pdf b/docs/Simple/figure.pdf deleted file mode 100644 index b21876a..0000000 Binary files a/docs/Simple/figure.pdf and /dev/null differ diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf deleted file mode 100644 index 3b431cc..0000000 Binary files a/docs/Simple/main.pdf and /dev/null differ diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex deleted file mode 100644 index e4febd9..0000000 --- a/docs/Simple/main.tex +++ /dev/null @@ -1,362 +0,0 @@ -\documentclass[hidelinks,10pt,twocolumn]{article} -\usepackage{simpleConference} -\usepackage[utf8]{inputenc} -\usepackage{times} -\usepackage{graphicx} -\usepackage{natbib} -\usepackage{doi} -\usepackage{amssymb} -\usepackage{url,hyperref} -\usepackage{booktabs} % professional-quality tables -\usepackage{amsfonts} % blackboard math symbols -\usepackage{nicefrac} % compact symbols for 1/2, etc. -\usepackage{caption} - -\usepackage{datetime} -\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1} -\setlength{\parindent}{0pt} - -\begin{document} - -\title{Fatcat Reference Dataset} - -\author{Martin Czygan \\ - \\ - Internet Archive \\ - San Francisco, California, USA \\ - martin@archive.org \\ - \and - Bryan Newbold \\ - \\ - Internet Archive \\ - San Francisco, California, USA \\ - bnewbold@archive.org \\ - \\ -} - - -\maketitle -\thispagestyle{empty} - - -\begin{abstract} - As part of its scholarly data efforts, the Internet Archive releases a first version of a citation - graph dataset, named \emph{refcat}, derived from scholarly publications and - additional data sources. It is composed of data gathered by the fatcat - cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale - crawls targeting primary and secondary scholarly outputs, as well as metadata - from the Open Library\footnote{\url{https://openlibrary.org}} project and - Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the - graph consists of 1,323,423,672 citations. 
We release this dataset under a CC0
-	Public Domain Dedication, accessible through an archive
-	item\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
-	code used in the derivation process is released under an MIT
-	license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
-\end{abstract}
-
-\keywords{Citation Graph, Web Archiving}
-
-\section{Introduction}
-
-
-The Internet Archive releases a first version of a citation graph dataset
-derived from a raw corpus of about 2.5B references gathered from metadata and
-data obtained by PDF extraction tools such as
-GROBID\cite{lopez2009grobid}. Additionally, we consider integration with
-metadata from Open Library and Wikipedia.
-The goal of this report is to briefly describe the current contents and the
-derivation of the dataset. We expect
-this dataset to be iterated upon, with changes both in content and processing.
-
-Modern citation indexes can be traced back to the early computing age, when
-projects like the Science Citation Index (1955)\citep{garfield2007evolution}
-were first devised, living on in existing commercial knowledge bases today.
-Open alternatives such as the Open Citations Corpus (OCC) were started in 2010
-- the first version of which contained 6,325,178 individual
-references\citep{shotton2013publishing}. Other notable early projects
-include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last
-decade has seen the emergence of more openly available, large scale
-citation projects like Microsoft Academic\citep{sinha2015overview} or the
-Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021,
-according to \citep{hutchins2021tipping}, over 1B citations are publicly
-available, marking a tipping point for this category of data.
-
-\section{Related Work}
-
-There are a few large scale citation datasets available today. COCI, the
-``OpenCitations Index of Crossref open DOI-to-DOI citations'', was first
-released on 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
-2021-07-29, it contains
-1,094,394,688 citations across 65,835,422 bibliographic
-resources\citep{peroni2020opencitations}.
-
-The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
-``a Wikimedia initiative to develop open citations and linked bibliographic
-data to serve free knowledge'', continuously adds citations to its database and
-as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
-publications\footnote{\url{http://wikicite.org/statistics.html}}.
-
-Microsoft Academic Graph\citep{sinha2015overview} comprises a number of
-entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
-with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
-	\url{https://archive.org/details/mag-2021-06-07}} the
-\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
-bibliographic entities.
-
-Numerous other projects have been or are concerned with various aspects of
-citation discovery and curation as part of their feature set, among them Semantic
-Scholar\citep{fricke2018semantic}, CiteSeerX\citep{li2006citeseerx} and Aminer\citep{tang2016aminer}.
-
-As mentioned in \citep{hutchins2021tipping}, the number of openly available
-citations is not expected to shrink in the future.
-
-
-\section{Dataset}
-
-We release the first version of the \emph{refcat} dataset in a format used
-internally for storage and to serve queries (and which we call \emph{biblioref}
-or \emph{bref} for short). The dataset includes metadata from fatcat, the
-Open Library Project and inbound links from the English Wikipedia. The fatcat
-project itself aggregates data from a variety of open data sources, such as
-Crossref\citep{crossref}, PubMed\citep{canese2013pubmed},
-DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others,
-as well as metadata generated from analysis of data preserved at the Internet
-Archive and active crawls of publication sites on the web.
-
-The dataset is
-integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users
-to explore inbound and outbound references\cite{fatcatguidereferencegraph}.
-
-The format records source and target (fatcat release and work) identifiers, a
-few attributes from the metadata (such as year or release stage) as well as
-information about the match status and provenance.
-
-The dataset currently contains 1,323,423,672 citations across 76,327,662
-entities (55,123,635 unique source and 60,244,206 unique target work
-identifiers; for 1,303,424,212 - or 98.49\% of all citations - we have a DOI
-for both source and target).
-The majority of matches - 1,250,523,321 - are established through identifier
-based matching (DOI, PMID, PMCID, ARXIV, ISBN). 72,900,351 citations are
-established through fuzzy matching techniques.
-
-The majority of citations between \emph{refcat} and COCI overlap, as can be
-seen in~Table~\ref{table:cocicmp}.
-
-\begin{table}[]
-	\begin{center}
-		\begin{tabular}{ll}
-			\toprule
-			\bf{Set}              & \bf{Count}    \\
-
-			\midrule
-			COCI (C)              & 1,094,394,688 \\
-			\emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
-			C $\cap$ R            & 1,007,539,966 \\
-			C $\setminus$ R       & 86,854,309    \\
-			R $\setminus$ C       & 295,884,246
-		\end{tabular}
-		\vspace*{2mm}
-		\caption{Comparison between COCI and \emph{refcat-doi}, a subset of
-			\emph{refcat} where entities have a known DOI. At least 50\% of the
-			295,884,246 references only in \emph{refcat-doi} come from links
-			recorded within a specific dataset provider (GBIF, DOI prefix:
-			10.15468).}
-		\label{table:cocicmp}
-	\end{center}
-\end{table}
-
-% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
-% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
-% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
-
-
-\section{System Design}
-
-The constraints for the systems design are informed by the volume and the
-variety of the data. The capability to run the whole graph derivation on a
-single machine was a minor goal as well. In total, the raw inputs amount to a
-few terabytes of textual content, mostly newline delimited JSON. More
-importantly, while the number of data fields is low, certain schemas are very
-partial, with hundreds of different combinations of available field values found
-in the raw reference data.
-This is most likely caused by aggregators passing on reference data coming
-from hundreds of sources, which do not necessarily agree on a common
-granularity for citation data, and by artifacts of machine learning based
-structured data extraction tools.
-
-Each combination of fields may require a slightly different processing path.
-For example, references with an Arxiv identifier can be processed differently
-from references with only a title. Over 50\% of the raw reference data comes
-from a set of eight field combinations, as listed in
-Table~\ref{table:fields}.
-
-\begin{table}[]
-	\begin{center}
-		\begin{tabular}{ll}
-			\toprule
-			\bf{Fields} & \bf{Percentage} \\
-			\midrule
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
-			\multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
-			\multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
-		\end{tabular}
-		\vspace*{2mm}
-		\caption{Top 8 combinations of available fields in raw reference data
-			accounting for about 53\% of the total data (CN = container name, CRN =
-			contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
-			issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain
-			any value. Identifiers emphasized.}
-		\label{table:fields}
-	\end{center}
-\end{table}
-
-Overall, a map-reduce style\citep{dean2010mapreduce} approach is
-followed\footnote{While the operations are similar, the processing is not
-	distributed but runs on a single machine. For space efficiency,
-	zstd\citep{collet2018zstandard} is used to compress raw data and
-	derivations.}, which allows for some uniformity in the overall processing.
-We extract (key, document) tuples (as TSV) from the raw JSON data and sort by
-key. We then group documents with the same key and apply a function on each
-group in order to generate our target schema or perform additional operations
-such as deduplication or fusion of matched and unmatched references.
-
-The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
-based on a value normalization, like slugifying a title string. For identifier
-based matches we can generate the target schema directly. For fuzzy matching
-candidates, we pass possible match pairs through a verification procedure,
-which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a
-domain dependent, rule based verification, able to identify different versions
-of a publication, preprint-published pairs and documents which are similar by
-various metrics calculated over title and author fields. The fuzzy matching
-approach is applied to all reference documents without an identifier (a title
-is currently required).
-
-With a few schema conversions, fuzzy matching can be applied to Wikipedia
-articles and Open Library (edition) records as well.
-The aspects of precision and recall are represented by the two stages: we are
-generous in the match candidate generation phase in order to improve recall,
-but we are strict during verification, in order to control precision. Quality
-assurance for verification is implemented through a growing list of test cases
-of real examples from the catalog and their expected or desired match
-status\footnote{The list can be found under:
-	\url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
-	It is helpful to keep this test suite independent of any specific programming language.}.
-
-
-\section{Limitations and Future Work}
-
-As with other datasets in this field, we expect this dataset to be iterated upon.
-
-\begin{itemize}
-	\item The fatcat catalog updates its metadata
-	      continuously\footnote{A changelog can currently be followed here:
-		      \url{https://fatcat.wiki/changelog}} and web crawls are conducted
-	      regularly. Current processing pipelines cover raw reference snapshot
-	      creation and derivation of the graph structure, which allows processing
-	      to be rerun based on updated data as it becomes available.
-
-	\item Metadata extraction from PDFs depends on supervised machine learning
-	      models, which in turn depend on available training datasets. With
-	      additional crawls and metadata available we hope to improve models used
-	      for metadata extraction, improving yield and reducing data extraction
-	      artifacts in the process.
-
-	\item As of this version, a number of raw reference docs remain unmatched,
-	      which means that neither exact nor fuzzy matching has detected a link to
-	      a known entity. On the one hand, this can hint at missing metadata. On
-	      the other hand, parts of the data will contain a reference to a
-	      catalogued entity, but in a specific, dense and harder to recover form.
-	      Addressing these cases also includes improvements to the fuzzy matching
-	      approach.
-	\item The reference dataset contains millions of URLs and their integration
-	      into the graph has been implemented as a prototype. A full implementation
-	      requires a few data cleanup and normalization steps.
-\end{itemize}
-
-\section{Acknowledgements}
-
-This work is partially supported by a grant from the \emph{Andrew W. Mellon
-	Foundation}.
-
-
-\section{Appendix A}
-
-
-A note on data quality: while we implement various data quality measures,
-real-world data, especially data coming from many different sources, will
-contain issues. Among other measures, we keep track of match reasons,
-especially for fuzzy matching, to be able to zoom in on systematic errors
-more easily (see~Table~\ref{table:matches}).
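-
-For illustration, a breakdown similar to Table~\ref{table:matches} can be
-derived with a short aggregation over the released records. The following is
-a minimal sketch only: it assumes newline delimited JSON on standard input and
-the field names \texttt{match\_provenance}, \texttt{match\_status} and
-\texttt{match\_reason}, which may differ from the actual \emph{bref} schema.
-
-\begin{verbatim}
-#!/usr/bin/env python3
-# Sketch: count (provenance, status, reason) combinations in
-# newline delimited JSON records; field names are assumptions.
-import collections
-import fileinput
-import json
-
-counts = collections.Counter()
-
-for line in fileinput.input():
-    doc = json.loads(line)
-    key = (doc.get("match_provenance"),
-           doc.get("match_status"),
-           doc.get("match_reason"))
-    counts[key] += 1
-
-for (prov, status, reason), n in counts.most_common(25):
-    print(f"{n}\t{prov}\t{status}\t{reason}")
-\end{verbatim}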
- -\begin{table}[] - \footnotesize - \captionsetup{font=normalsize} - \begin{center} - \begin{tabular}{@{}rlll@{}} - \toprule - \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule - 934932865 & crossref & exact & doi \\ - 151366108 & fatcat-datacite & exact & doi \\ - 65345275 & fatcat-pubmed & exact & pmid \\ - 48778607 & fuzzy & strong & jaccardauthors \\ - 42465250 & grobid & exact & doi \\ - 29197902 & fatcat-pubmed & exact & doi \\ - 19996327 & fatcat-crossref & exact & doi \\ - 11996694 & fuzzy & strong & slugtitleauthormatch \\ - 9157498 & fuzzy & strong & tokenizedauthors \\ - 3547594 & grobid & exact & arxiv \\ - 2310025 & fuzzy & exact & titleauthormatch \\ - 1496515 & grobid & exact & pmid \\ - 680722 & crossref & strong & jaccardauthors \\ - 476331 & fuzzy & strong & versioneddoi \\ - 449271 & grobid & exact & isbn \\ - 230645 & fatcat-crossref & strong & jaccardauthors \\ - 190578 & grobid & strong & jaccardauthors \\ - 156657 & crossref & exact & isbn \\ - 123681 & fatcat-pubmed & strong & jaccardauthors \\ - 79328 & crossref & exact & arxiv \\ - 57414 & crossref & strong & tokenizedauthors \\ - 53480 & fuzzy & strong & pmiddoipair \\ - 52453 & fuzzy & strong & dataciterelatedid \\ - 47119 & grobid & strong & slugtitleauthormatch \\ - 36774 & fuzzy & strong & arxivversion \\ - % 35311 & fuzzy & strong & customieeearxiv \\ - % 33863 & grobid & exact & pmcid \\ - % 23504 & crossref & strong & slugtitleauthormatch \\ - % 22753 & fatcat-crossref & strong & tokenizedauthors \\ - % 17720 & grobid & exact & titleauthormatch \\ - % 14656 & crossref & exact & titleauthormatch \\ - % 14438 & grobid & strong & tokenizedauthors \\ - % 7682 & fatcat-crossref & exact & arxiv \\ - % 5972 & fatcat-crossref & exact & isbn \\ - % 5525 & fatcat-pubmed & exact & arxiv \\ - % 4290 & fatcat-pubmed & strong & tokenizedauthors \\ - % 2745 & fatcat-pubmed & exact & isbn \\ - % 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ - % 2273 & fatcat-crossref & strong & slugtitleauthormatch \\ - % 1960 & fuzzy & exact & workid \\ - % 1150 & fatcat-crossref & exact & titleauthormatch \\ - % 1041 & fatcat-pubmed & exact & titleauthormatch \\ - % 895 & fuzzy & strong & figshareversion \\ - % 317 & fuzzy & strong & titleartifact \\ - % 82 & grobid & strong & titleartifact \\ - % 33 & crossref & strong & titleartifact \\ - % 5 & fuzzy & strong & custombsiundated \\ - % 1 & fuzzy & strong & custombsisubdoc \\ - % 1 & fatcat & exact & doi \\ \bottomrule - \end{tabular} - \vspace*{2mm} - \caption{Table of match counts (top 25), reference provenance, match status and - match reason. 
The match reason identifier encode a specific rule in the domain - dependent verification process and are included for completeness - we do not - include the details of each rule in this report.} - \label{table:matches} - \end{center} -\end{table} - -\bibliographystyle{abbrv} -% \bibliographystyle{plainnat} -\bibliography{refs} -\end{document} diff --git a/docs/Simple/refs.bib b/docs/Simple/refs.bib deleted file mode 100644 index c61021e..0000000 --- a/docs/Simple/refs.bib +++ /dev/null @@ -1,228 +0,0 @@ -@inproceedings{kour2014real, - title={Real-time segmentation of on-line handwritten arabic script}, - author={Kour, George and Saabne, Raid}, - booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on}, - pages={417--422}, - year={2014}, - organization={IEEE} -} - -@inproceedings{kour2014fast, - title={Fast classification of handwritten on-line Arabic characters}, - author={Kour, George and Saabne, Raid}, - booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of}, - pages={312--318}, - year={2014}, - organization={IEEE}, - doi={10.1109/SOCPAR.2014.7008025} -} - -@article{hadash2018estimate, - title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications}, - author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon}, - journal={arXiv preprint arXiv:1804.09028}, - year={2018} -} - -@article{garfield1955citation, - title={Citation indexes for science}, - author={Garfield, Eugene}, - journal={Science}, - volume={122}, - number={3159}, - pages={108--111}, - year={1955}, - publisher={JSTOR} -} - -@inproceedings{lopez2009grobid, - title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications}, - author={Lopez, Patrice}, - booktitle={International conference on theory and practice of digital libraries}, - pages={473--474}, - year={2009}, - organization={Springer} -} - -@article{garfield2007evolution, - title={The evolution of the science citation index}, - author={Garfield, Eugene}, - journal={International microbiology}, - volume={10}, - number={1}, - pages={65}, - year={2007} -} - -@article{shotton2013publishing, - title={Publishing: open citations}, - author={Shotton, David}, - journal={Nature News}, - volume={502}, - number={7471}, - pages={295}, - year={2013} -} - -@misc{CitEc, - title = {Citations in Economics}, - howpublished = {\url{https://citec.repec.org/}}, - note = {Accessed: 2021-07-30} -} - -@inproceedings{wu2019citeseerx, - title={CiteSeerX: 20 years of service to scholarly big data}, - author={Wu, Jian and Kim, Kunho and Giles, C Lee}, - booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse}, - pages={1--4}, - year={2019} -} - -@inproceedings{li2006citeseerx, - title={CiteSeerx: an architecture and web service design for an academic document search engine}, - author={Li, Huajing and Councill, Isaac and Lee, Wang-Chien and Giles, C Lee}, - booktitle={Proceedings of the 15th international conference on World Wide Web}, - pages={883--884}, - year={2006} -} - - -@inproceedings{sinha2015overview, - title={An overview of microsoft academic service (mas) and applications}, - author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan}, - booktitle={Proceedings of the 24th international conference on world wide web}, - pages={243--246}, - year={2015} -} - 
-@misc{i4oc, - title = {Initiative for Open Citations}, -howpublished = {\url{https://i4oc.org/}}, -note = {Accessed: 2021-07-30} -} - -@misc{fatcatguidereferencegraph, -title = {The Fatcat Guide: Reference Graph (refcat)}, -howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}}, -note = {Accessed: 2021-08-08} -} - -@misc{crossref, -title = {Crossref}, -howpublished = {\url{https://crossref.org}}, -note = {Accessed: 2021-08-08} -} - -@misc{doaj, -title = {Directory of Open Access Journals}, -howpublished = {\url{https://doaj.org}}, -note = {Accessed: 2021-08-08} -} - -@inproceedings{ley2002dblp, - title={The DBLP computer science bibliography: Evolution, research issues, perspectives}, - author={Ley, Michael}, - booktitle={International symposium on string processing and information retrieval}, - pages={1--10}, - year={2002}, - organization={Springer} -} - - -@inproceedings{brase2009datacite, - title={DataCite-A global registration agency for research data}, - author={Brase, Jan}, - booktitle={2009 fourth international conference on cooperation and promotion of information resources in science and technology}, - pages={257--261}, - year={2009}, - organization={IEEE} -} - -@article{canese2013pubmed, - title={PubMed: the bibliographic database}, - author={Canese, Kathi and Weis, Sarah}, - journal={The NCBI Handbook}, - volume={2}, - pages={1}, - year={2013}, - publisher={National Center for Biotechnology Information (US)} -} - - -@article{shotton2018funders, - title={Funders should mandate open citations.}, - author={Shotton, David}, - journal={Nature}, - volume={553}, - number={7686}, - pages={129--130}, - year={2018}, - publisher={Nature Publishing Group} -} - -@article{hutchins2021tipping, - title={A tipping point for open citation data}, - author={Hutchins, B Ian}, - journal={Quantitative Science Studies}, - pages={1--5}, - year={2021} -} - -@article{silbert1970world, - title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)}, - author={Silbert, Jeffrey M}, - journal={Criminology}, - volume={8}, - pages={107}, - year={1970}, - publisher={HeinOnline} -} - -@article{peroni2020opencitations, - title={OpenCitations, an infrastructure organization for open scholarship}, - author={Peroni, Silvio and Shotton, David}, - journal={Quantitative Science Studies}, - volume={1}, - number={1}, - pages={428--444}, - year={2020}, - publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} -} - -@article{fricke2018semantic, - title={Semantic scholar}, - author={Fricke, Suzanne}, - journal={Journal of the Medical Library Association: JMLA}, - volume={106}, - number={1}, - pages={145}, - year={2018}, - publisher={Medical Library Association} -} - -@inproceedings{tang2016aminer, - title={AMiner: Toward understanding big scholar data}, - author={Tang, Jie}, - booktitle={Proceedings of the ninth ACM international conference on web search and data mining}, - pages={467--467}, - year={2016} -} - -@article{dean2010mapreduce, - title={MapReduce: a flexible data processing tool}, - author={Dean, Jeffrey and Ghemawat, Sanjay}, - journal={Communications of the ACM}, - volume={53}, - number={1}, - pages={72--77}, - year={2010}, - publisher={ACM New York, NY, USA} -} - -@article{collet2018zstandard, - title={Zstandard Compression and the application/zstd Media Type}, - author={Collet, Yann and Kucherawy, Murray}, - journal={RFC 8478}, - year={2018} -} - diff --git 
a/docs/Simple/simpleConference.sty b/docs/Simple/simpleConference.sty deleted file mode 100644 index d4d4764..0000000 --- a/docs/Simple/simpleConference.sty +++ /dev/null @@ -1,136 +0,0 @@ -% --------------------------------------------------------------- -% Style file for simple, two column conference papers. -% Based on latex8.sty by Paolo.Ienne@di.epfl.ch -% --------------------------------------------------------------- -% Use with LaTeX2e as: -% \documentclass[times,10pt,twocolumn]{article} -% \usepackage{simpleConference} -% \usepackage{times} -% --------------------------------------------------------------- -% specify references as -% \bibliographystyle{simpleConference} -% \bibliography{...your files...} -% -% use Section{} and SubSection{} instead of standard section{} -% and subsection{} to obtain headings in the form -% "1.3. My heading" -% --------------------------------------------------------------- -% ten point helvetica bold required for captions -% in some sites the name of the helvetica bold font may differ, -% change the name here: -\font\tenhv = phvb at 10pt - -% eleven point times bold required for second-order headings -\font\elvbf = ptmb scaled 1100 - -% set dimensions of columns, gap between columns, and paragraph indent -\setlength{\textheight}{8.875in} -\setlength{\textwidth}{6.875in} -\setlength{\columnsep}{0.3125in} -\setlength{\topmargin}{0in} -\setlength{\headheight}{0in} -\setlength{\headsep}{0in} -\setlength{\parindent}{1pc} -\setlength{\oddsidemargin}{-.304in} -\setlength{\evensidemargin}{-.304in} - -% memento from size10.clo -% \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} -% \small{\@setfontsize\small\@ixpt{11}} -% \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} -% \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} -% \tiny{\@setfontsize\tiny\@vpt\@vipt} -% \large{\@setfontsize\large\@xiipt{14}} -% \Large{\@setfontsize\Large\@xivpt{18}} -% \LARGE{\@setfontsize\LARGE\@xviipt{22}} -% \huge{\@setfontsize\huge\@xxpt{25}} -% \Huge{\@setfontsize\Huge\@xxvpt{30}} - -\def\@maketitle - { - \newpage - \null - \vskip .375in - \begin{center} - {\Large \bf \@title \par} - % additional two empty lines at the end of the title - \vspace*{24pt} - { - \large - \lineskip .5em - \begin{tabular}[t]{c} - \@author - \end{tabular} - \par - } - % additional small space at the end of the author name - \vskip .5em - { - \large - \begin{tabular}[t]{c} - \@affiliation - \end{tabular} - \par - \ifx \@empty \@email - \else - \begin{tabular}{r@{~}l} - E-mail: & {\tt \@email} - \end{tabular} - \par - \fi - } - % additional empty line at the end of the title block - \vspace*{12pt} - \end{center} - } - -\def\abstract - {% - \centerline{\large\bf Abstract}% - \vspace*{12pt}% -% \it% %%%% iroro - commenting out italicized abstract - } - -\def\endabstract - { - % additional empty line at the end of the abstract - \vspace*{12pt} - } - -\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} - -\def\email#1{\gdef\@email{#1}} -\gdef\@email{} - -\newlength{\@ctmp} -\newlength{\@figindent} -\setlength{\@figindent}{1pc} - -\long\def\@makecaption#1#2{ - \vskip 10pt - \setbox\@tempboxa\hbox{\tenhv\noindent #1.~#2} - \setlength{\@ctmp}{\hsize} - \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} - % IF longer than one indented paragraph line - \ifdim \wd\@tempboxa >\@ctmp - % THEN set as an indented paragraph - \begin{list}{}{\leftmargin\@figindent \rightmargin\leftmargin} - \item[]\tenhv #1.~#2\par - \end{list} - \else - % ELSE center - \hbox 
to\hsize{\hfil\box\@tempboxa\hfil} - \fi} - -% correct heading spacing and type -\def\section{\@startsection {section}{1}{\z@} - {14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}} -\def\subsection{\@startsection {subsection}{2}{\z@} - {13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}} - -% add the period after section numbers -\newcommand{\Section}[1]{\section{\hskip -1em.~#1}} -\newcommand{\SubSection}[1]{\subsection{\hskip -1em.~#1}} - -% end of file latex8.sty -% --------------------------------------------------------------- diff --git a/docs/TR-20210730212057-IA-WDS-CG/.gitignore b/docs/TR-20210730212057-IA-WDS-CG/.gitignore deleted file mode 100644 index 5040d53..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.log -*.aux -*.bbl -*.blg -*.out diff --git a/docs/TR-20210730212057-IA-WDS-CG/Makefile b/docs/TR-20210730212057-IA-WDS-CG/Makefile deleted file mode 100644 index 9996575..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -main.pdf: main.tex - pdflatex main.tex - bibtex main - pdflatex main.tex - - -.PHONY: clean -clean: - rm -f main.pdf diff --git a/docs/TR-20210730212057-IA-WDS-CG/README.md b/docs/TR-20210730212057-IA-WDS-CG/README.md deleted file mode 100644 index 54de590..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/README.md +++ /dev/null @@ -1,49 +0,0 @@ - -## Description: - -The project hosts an aesthetic and simple LaTeX style suitable for "preprint" publications such as arXiv and bio-arXiv, etc. -It is based on the [**nips_2018.sty**](https://media.nips.cc/Conferences/NIPS2018/Styles/nips_2018.sty) style. - -This styling maintains the esthetic of NIPS but adding and changing features to make it (IMO) even better and more suitable for preprints. -The result looks fairly different from NIPS style so that readers won't get confused to think that the preprint was published in NIPS. - -### Why NIPS? -Because the NIPS styling is a comfortable single column format that is very esthetic and convenient for reading. - -## Usage: -1. Use Document class **article**. -2. Copy **arxiv.sty** to the folder containing your tex file. -3. add `\usepackage{arxiv}` after `\documentclass{article}`. -4. The only packages used in the style file are **geometry** and **fancyheader**. Do not reimport them. - -See **template.tex** - -## Project files: -1. **arxiv.sty** - the style file. -2. **template.tex** - a sample template that uses the **arxiv style**. -3. **references.bib** - the bibliography source file for template.tex. -4. **template.pdf** - a sample output of the template file that demonstrated the design provided by the arxiv style. - - -## Handling References when submitting to arXiv.org -The most convenient way to manage references is using an external BibTeX file and pointing to it from the main file. -However, this requires running the [bibtex](http://www.bibtex.org/) tool to "compile" the `.bib` file and create `.bbl` file containing "bibitems" that can be directly inserted in the main tex file. -However, unfortunately the arXiv Tex environment ([Tex Live](https://www.tug.org/texlive/)) do not do that. -So easiest way when submitting to arXiv is to create a single self-contained .tex file that contains the references. -This can be done by running the BibTeX command on your machine and insert the content of the generated `.bbl` file into the `.tex` file and commenting out the `\bibliography{references}` that point to the external references file. 
- -Below are the commands that should be run in the project folder: -1. Run `$ latex template` -2. Run `$ bibtex template` -3. A `template.bbl` file will be generated (make sure it is there) -4. Copy the `template.bbl` file content to `template.tex` into the `\begin{thebibliography}` command. -5. Comment out the `\bibliography{references}` command in `template.tex`. -6. You ready to submit to arXiv.org. - - -## General Notes: -1. For help, comments, praises, bug reporting or change requests, you can contact the author at: kourgeorge/at/gmail.com. -2. You can use, redistribute and do whatever with this project, however, the author takes no responsibility on whatever usage of this project. -3. If you start another project based on this project, it would be nice to mention/link to this project. -4. You are very welcome to contribute to this project. -5. A good looking 2 column template can be found in https://github.com/brenhinkeller/preprint-template.tex. diff --git a/docs/TR-20210730212057-IA-WDS-CG/arxiv.sty b/docs/TR-20210730212057-IA-WDS-CG/arxiv.sty deleted file mode 100644 index ccb7feb..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/arxiv.sty +++ /dev/null @@ -1,262 +0,0 @@ -\NeedsTeXFormat{LaTeX2e} - -\ProcessOptions\relax - -% fonts -\renewcommand{\rmdefault}{ptm} -\renewcommand{\sfdefault}{phv} - -% set page geometry -\usepackage[verbose=true,letterpaper]{geometry} -\AtBeginDocument{ - \newgeometry{ - textheight=9in, - textwidth=6.5in, - top=1in, - headheight=14pt, - headsep=25pt, - footskip=30pt - } -} - -\widowpenalty=10000 -\clubpenalty=10000 -\flushbottom -\sloppy - - - -\newcommand{\headeright}{A Preprint} -\newcommand{\undertitle}{A Preprint} -\newcommand{\shorttitle}{\@title} - -\usepackage{fancyhdr} -\fancyhf{} -\pagestyle{fancy} -\renewcommand{\headrulewidth}{0.4pt} -\fancyheadoffset{0pt} -\rhead{\scshape \footnotesize \headeright} -\chead{\shorttitle} -\cfoot{\thepage} - - -%Handling Keywords -\def\keywordname{{\bfseries \emph{Keywords}}}% -\def\keywords#1{\par\addvspace\medskipamount{\rightskip=0pt plus1cm -\def\and{\ifhmode\unskip\nobreak\fi\ $\cdot$ -}\noindent\keywordname\enspace\ignorespaces#1\par}} - -% font sizes with reduced leading -\renewcommand{\normalsize}{% - \@setfontsize\normalsize\@xpt\@xipt - \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ - \abovedisplayshortskip \z@ \@plus 3\p@ - \belowdisplayskip \abovedisplayskip - \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ -} -\normalsize -\renewcommand{\small}{% - \@setfontsize\small\@ixpt\@xpt - \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ - \abovedisplayshortskip \z@ \@plus 2\p@ - \belowdisplayskip \abovedisplayskip - \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ -} -\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} -\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} -\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} -\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} -\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} -\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} -\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} -\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} - -% sections with less space -\providecommand{\section}{} -\renewcommand{\section}{% - \@startsection{section}{1}{\z@}% - {-2.0ex \@plus -0.5ex \@minus -0.2ex}% - { 1.5ex \@plus 0.3ex \@minus 0.2ex}% - {\large\bf\raggedright}% -} -\providecommand{\subsection}{} -\renewcommand{\subsection}{% - \@startsection{subsection}{2}{\z@}% - {-1.8ex \@plus -0.5ex \@minus -0.2ex}% - { 
0.8ex \@plus 0.2ex}% - {\normalsize\bf\raggedright}% -} -\providecommand{\subsubsection}{} -\renewcommand{\subsubsection}{% - \@startsection{subsubsection}{3}{\z@}% - {-1.5ex \@plus -0.5ex \@minus -0.2ex}% - { 0.5ex \@plus 0.2ex}% - {\normalsize\bf\raggedright}% -} -\providecommand{\paragraph}{} -\renewcommand{\paragraph}{% - \@startsection{paragraph}{4}{\z@}% - {1.5ex \@plus 0.5ex \@minus 0.2ex}% - {-1em}% - {\normalsize\bf}% -} -\providecommand{\subparagraph}{} -\renewcommand{\subparagraph}{% - \@startsection{subparagraph}{5}{\z@}% - {1.5ex \@plus 0.5ex \@minus 0.2ex}% - {-1em}% - {\normalsize\bf}% -} -\providecommand{\subsubsubsection}{} -\renewcommand{\subsubsubsection}{% - \vskip5pt{\noindent\normalsize\rm\raggedright}% -} - -% float placement -\renewcommand{\topfraction }{0.85} -\renewcommand{\bottomfraction }{0.4} -\renewcommand{\textfraction }{0.1} -\renewcommand{\floatpagefraction}{0.7} - -\newlength{\@abovecaptionskip}\setlength{\@abovecaptionskip}{7\p@} -\newlength{\@belowcaptionskip}\setlength{\@belowcaptionskip}{\z@} - -\setlength{\abovecaptionskip}{\@abovecaptionskip} -\setlength{\belowcaptionskip}{\@belowcaptionskip} - -% swap above/belowcaptionskip lengths for tables -\renewenvironment{table} - {\setlength{\abovecaptionskip}{\@belowcaptionskip}% - \setlength{\belowcaptionskip}{\@abovecaptionskip}% - \@float{table}} - {\end@float} - -% footnote formatting -\setlength{\footnotesep }{6.65\p@} -\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} -\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} -\setcounter{footnote}{0} - -% paragraph formatting -\setlength{\parindent}{\z@} -\setlength{\parskip }{5.5\p@} - -% list formatting -\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} -\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} -\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} -\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} -\setlength{\leftmargin }{3pc} -\setlength{\leftmargini }{\leftmargin} -\setlength{\leftmarginii }{2em} -\setlength{\leftmarginiii}{1.5em} -\setlength{\leftmarginiv }{1.0em} -\setlength{\leftmarginv }{0.5em} -\def\@listi {\leftmargin\leftmargini} -\def\@listii {\leftmargin\leftmarginii - \labelwidth\leftmarginii - \advance\labelwidth-\labelsep - \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ - \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ - \itemsep \parsep} -\def\@listiii{\leftmargin\leftmarginiii - \labelwidth\leftmarginiii - \advance\labelwidth-\labelsep - \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ - \parsep \z@ - \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ - \itemsep \topsep} -\def\@listiv {\leftmargin\leftmarginiv - \labelwidth\leftmarginiv - \advance\labelwidth-\labelsep} -\def\@listv {\leftmargin\leftmarginv - \labelwidth\leftmarginv - \advance\labelwidth-\labelsep} -\def\@listvi {\leftmargin\leftmarginvi - \labelwidth\leftmarginvi - \advance\labelwidth-\labelsep} - -% create title -\providecommand{\maketitle}{} -\renewcommand{\maketitle}{% - \par - \begingroup - \renewcommand{\thefootnote}{\fnsymbol{footnote}} - % for perfect author name centering - \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} - % The footnote-mark was overlapping the footnote-text, - % added the following to fix this problem (MK) - \long\def\@makefntext##1{% - \parindent 1em\noindent - \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 - } - \thispagestyle{empty} - \@maketitle - \@thanks - %\@notice - \endgroup - \let\maketitle\relax - \let\thanks\relax -} - -% rules for title box at top of first page -\newcommand{\@toptitlebar}{ 
- \hrule height 2\p@ - \vskip 0.25in - \vskip -\parskip% -} -\newcommand{\@bottomtitlebar}{ - \vskip 0.29in - \vskip -\parskip - \hrule height 2\p@ - \vskip 0.09in% -} - -% create title (includes both anonymized and non-anonymized versions) -\providecommand{\@maketitle}{} -\renewcommand{\@maketitle}{% - \vbox{% - \hsize\textwidth - \linewidth\hsize - \vskip 0.1in - \@toptitlebar - \centering - {\LARGE\sc \@title\par} - \@bottomtitlebar - \textsc{\undertitle}\\ - \vskip 0.1in - \def\And{% - \end{tabular}\hfil\linebreak[0]\hfil% - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% - } - \def\AND{% - \end{tabular}\hfil\linebreak[4]\hfil% - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% - } - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% - \vskip 0.4in \@minus 0.1in \center{\@date} \vskip 0.2in - } -} - -% add conference notice to bottom of first page -\newcommand{\ftype@noticebox}{8} -\newcommand{\@notice}{% - % give a bit of extra room back to authors on first page - \enlargethispage{2\baselineskip}% - \@float{noticebox}[b]% - \footnotesize\@noticestring% - \end@float% -} - -% abstract styling -\renewenvironment{abstract} -{ - \centerline - {\large \bfseries \scshape Abstract} - \begin{quote} -} -{ - \end{quote} -} - -\endinput diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.pdf b/docs/TR-20210730212057-IA-WDS-CG/main.pdf deleted file mode 100644 index c8bb5a3..0000000 Binary files a/docs/TR-20210730212057-IA-WDS-CG/main.pdf and /dev/null differ diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.tex b/docs/TR-20210730212057-IA-WDS-CG/main.tex deleted file mode 100644 index a7edac3..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/main.tex +++ /dev/null @@ -1,442 +0,0 @@ -\documentclass{article} - - - -\usepackage{arxiv} - -\usepackage[utf8]{inputenc} % allow utf-8 input -\usepackage[T1]{fontenc} % use 8-bit T1 fonts -\usepackage{hyperref} % hyperlinks -\usepackage{url} % simple URL typesetting -\usepackage{booktabs} % professional-quality tables -\usepackage{amsfonts} % blackboard math symbols -\usepackage{nicefrac} % compact symbols for 1/2, etc. 
-\usepackage{microtype} % microtypography -\usepackage{lipsum} % Can be removed after putting your text content -\usepackage{graphicx} -\usepackage{natbib} -\usepackage{doi} - -\title{Internet Archive Scholar Citation Graph Dataset} - -\date{August 10, 2021} % Here you can change the date presented in the paper title -%\date{} % Or removing it - -\author{ Martin Czygan \\ - Internet Archive\\ - San Francisco, CA 94118 \\ - \texttt{martin@archive.org} \\ - %% examples of more authors - \And - Bryan Newbold \\ - Internet Archive\\ - San Francisco, CA 94118 \\ - \texttt{bnewbold@archive.org} \\ - % \And - % Helge Holzmann \\ - % Internet Archive\\ - % San Francisco, CA 94118 \\ - % \texttt{helge@archive.org} \\ - % \And - % Jefferson Bailey \\ - % Internet Archive\\ - % San Francisco, CA 94118 \\ - % \texttt{jefferson@archive.org} \\ - %% \AND - %% Coauthor \\ - %% Affiliation \\ - %% Address \\ - %% \texttt{email} \\ - %% \And - %% Coauthor \\ - %% Affiliation \\ - %% Address \\ - %% \texttt{email} \\ - %% \And - %% Coauthor \\ - %% Affiliation \\ - %% Address \\ - %% \texttt{email} \\ -} - -% Uncomment to remove the date -%\date{} - -% Uncomment to override the `A preprint' in the header -\renewcommand{\headeright}{Technical Report} -\renewcommand{\undertitle}{Technical Report} -% \renewcommand{\shorttitle}{\textit{arXiv} Template} - -%%% Add PDF metadata to help others organize their library -%%% Once the PDF is generated, you can check the metadata with -%%% $ pdfinfo template.pdf -\hypersetup{ -pdftitle={Internet Archive Scholar Citation Graph Dataset}, -pdfsubject={cs.DL, cs.IR}, -pdfauthor={Martin Czygan, Bryan Newbold, Helge Holzmann, Jefferson Bailey}, -pdfkeywords={Web Archiving, Citation Graph}, -} - -\begin{document} -\maketitle - -\begin{abstract} -As part of its scholarly data efforts, the Internet Archive releases a citation -graph dataset derived from scholarly publications and additional data sources. It is -composed of data gathered by the \href{https://fatcat.wiki}{fatcat cataloging project} and related -web-scale crawls targeting primary and secondary scholarly outputs. In -addition, relations are worked out between scholarly publications, web pages -and their archived copies, books from the Open Library project as well as -Wikipedia articles. This first version of the graph consists of over X nodes -and over Y edges. We release this dataset under a Z open license under the -collection at \href{https://archive.org/details/TODO-citation\_graph}{https://archive.org/details/TODO-citation\_graph}, as well as all code -used for derivation under an MIT license. -\end{abstract} - - -% keywords can be removed -\keywords{Citation Graph \and Scholarly Communications \and Web Archiving} - - -\section{Introduction} - -The Internet Archive releases a first version of a citation graph dataset -derived from a raw corpus of about 2.5B references gathered from metadata and -from data obtained by PDF extraction tools such as GROBID\citep{lopez2009grobid}. -The goal of this report is to describe briefly the current contents and the -derivation of the Archive Scholar Citations Dataset (ASC). We expect -this dataset to be iterated upon, with changes both in content and processing. - -Modern citation indexes can be traced back to the early computing age, when -projects like the Science Citation Index (1955)\citep{garfield2007evolution} -were first devised, living on in existing commercial knowledge bases today. 
-Open alternatives were started such as the Open Citations Corpus (OCC) in 2010 -- the first version of which contained 6,325,178 individual -references\citep{shotton2013publishing}. Other notable sources from that time -include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last -decade has seen an increase of more openly available reference dataset and -citation projects, like Microsoft Academic\citep{sinha2015overview} and -Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021, -according to \citep{hutchins2021tipping} over 1B citations are publicly -available, marking a tipping point for open citations. - - - -\section{Citation Graph Contents} - - - -% * edges -% * edges exact -% * edges fuzzy -% * edges fuzzy reason (table) -% * number of source docs -% * number of target docs -% * refs to papers -% * refs to books -% * refs to web pages -% * refs to web pages that have been archived -% * refs to web pages that have been archived but not on liveweb any more -% -% Overlaps -% -% * how many edges can be found in COCI as well -% * how many edges can be found in MAG as well -% * how many unique to us edges -% -% Additional numbers -% -% * number of unparsed refs -% * "biblio" field distribution of unparted refs -% -% Potential routes -% -% * journal abbreviation parsing with suffix arrays -% * lookup by name, year and journal - - -\section{System Design} - -The constraints for the systems design are informed by the volume and the -variety of the data. In total, the raw inputs amount to a few TB of textual -content, mostly newline delimited JSON. More importantly, while the number of -data fields is low, certain schemas are very partial with hundreds of different -combinations of available field values found in the raw reference data. This is -most likely caused by aggregators passing on reference data coming from -hundreds of sources, each of which not necessarily agreeing on a common -granularity for citation data and from artifacts of machine learning based -structured data extraction tools. - -Each combination of fields may require a slightly different processing path. -For example, references with an Arxiv identifier can be processed differently -from references with only a title. Over 50\% of the raw reference data comes -from a set of eight field manifestations, as listed in -Table~\ref{table:fields}. - -\begin{table}[] - \begin{center} - \begin{tabular}{ll} -\toprule - \bf{Fields} & \bf{Share} \\ -\midrule - \multicolumn{1}{l}{CN|CRN|P|T|U|V|Y} & 14\% \\ - \multicolumn{1}{l}{DOI} & 14\% \\ - \multicolumn{1}{l}{CN|CRN|IS|P|T|U|V|Y} & 5\% \\ - \multicolumn{1}{l}{CN|CRN|DOI|U|V|Y} & 4\% \\ - \multicolumn{1}{l}{PMID|U} & 4\% \\ - \multicolumn{1}{l}{CN|CRN|DOI|T|V|Y} & 4\% \\ - \multicolumn{1}{l}{CN|CRN|Y} & 4\% \\ - \multicolumn{1}{l}{CN|CRN|DOI|V|Y} & 4\% \\ - \end{tabular} - \vspace*{2mm} - \caption{Top 8 combinations of available fields in raw reference data - accounting for about 53\% of the total data (CN = container name, CRN = -contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS = -issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value.} - \label{table:fields} -\end{center} -\end{table} - -Overall, a map-reduce style approach is followed, which allows for some -uniformity in the overall processing. We extract (key, document) tuples (as -TSV) from the raw JSON data and sort by key. 
We then group documents with the same key and apply a function on each group
-in order to generate our target schema (currently named biblioref, or bref for
-short) or perform additional operations (such as deduplication).
-
-The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
-based on a normalization procedure, like a slugified title string. For
-identifier based matches we can generate the target biblioref schema directly.
-For fuzzy matching candidates, we pass possible match pairs through a
-verification procedure, which is implemented for release entity schema pairs.
-The current verification procedure is a domain dependent, rule based
-verification, able to identify different versions of a publication,
-preprint-published pairs or other kinds of similar documents by calculating
-similarity metrics across title and authors. The fuzzy matching approach is
-applied to all reference documents that have a title but no identifier.
-
-With a few schema conversions, fuzzy matching can be applied to Wikipedia
-articles and Open Library (edition) records as well. The aspects of precision
-and recall are represented by the two stages: we are generous in the match
-candidate generation phase in order to improve recall, but we are strict during
-verification, in order to control precision.
-
-\section{Fuzzy Matching Approach}
-
-% Take sample of 100 docs, report some precision, recall, F1 on a hand curated
-% small subset.
-
-The fuzzy matching approach currently implemented works in two phases: match
-candidate generation and verification. For candidate generation, we map each
-document to a key. We implemented a number of algorithms to form these
-clusters, e.g. title normalizations (including lowercasing, whitespace removal,
-unicode normalization and other measures) or transformations like
-NYSIIS\citep{silbert1970world}.
-
-The verification approach is based on a set of rules, which are tested
-sequentially, yielding a match signal from weak to exact. We use a suite of
-over 300 manually curated match examples\footnote{The table can be found here:
-\href{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}}
-as part of a unit test suite to allow for a controlled, continuous adjustment
-of the verification procedure. If the verification yields either an exact or
-strong signal, we consider it a match.
-
-We try to keep the processing steps performant to keep the overall derivation
-time limited. Map and reduce operations are parallelized and certain processing
-steps can process 100K documents per second or even more on commodity hardware
-with spinning disks.
-
-\section{Quality Assurance}
-
-Understanding data quality plays a role, as the data comes from a myriad of
-sources, each with possible idiosyncratic features or missing values. We employ
-a few QA measures during the process. First, we try to pass each data item
-through only one processing pipeline (e.g. items matched by any identifier
-should not even be considered for fuzzy matching). If duplicate links appear in
-the final dataset nonetheless, we remove them, preferring exact over fuzzy
-matches.
-
-We employ a couple of data cleaning techniques, e.g. to find and verify
-identifiers like ISBN or to sanitize URLs found in the data. Many of these
-artifacts stem from the fact that large chunks of the raw data come from
-heuristic data extraction from PDF documents.
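-
-As an example of such a cleaning step, the following is a minimal sketch of an
-ISBN-13 check digit verification for identifier candidates found in raw
-reference strings; it is illustrative only and not necessarily the exact
-procedure used in the pipeline.
-
-\begin{verbatim}
-#!/usr/bin/env python3
-# Sketch: verify an ISBN-13 candidate via its check digit; the
-# actual cleanup code in the pipeline may differ.
-import re
-
-def is_valid_isbn13(raw: str) -> bool:
-    digits = re.sub(r"[^0-9]", "", raw)
-    if len(digits) != 13 or not digits.startswith(("978", "979")):
-        return False
-    total = sum((1 if i % 2 == 0 else 3) * int(d)
-                for i, d in enumerate(digits))
-    return total % 10 == 0
-
-assert is_valid_isbn13("978-3-16-148410-0") is True
-assert is_valid_isbn13("978-3-16-148410-9") is False
-\end{verbatim}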
- - -\section{Discussion} - -% need to iterate - -%\lipsum[2] %\lipsum[3] - - -% \section{Headings: first level} % \label{sec:headings} -% -% \lipsum[4] See Section \ref{sec:headings}. -% -% \subsection{Headings: second level} -% \lipsum[5] -% \begin{equation} -% \xi _{ij}(t)=P(x_{t}=i,x_{t+1}=j|y,v,w;\theta)= {\frac {\alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}{\sum _{i=1}^{N} \sum _{j=1}^{N} \alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}} -% \end{equation} -% -% \subsubsection{Headings: third level} -% \lipsum[6] -% -% \paragraph{Paragraph} -% \lipsum[7] -% -% -% -% \section{Examples of citations, figures, tables, references} -% \label{sec:others} -% -% \subsection{Citations} -% Citations use \verb+natbib+. The documentation may be found at -% \begin{center} -% \url{http://mirrors.ctan.org/macros/latex/contrib/natbib/natnotes.pdf} -% \end{center} -% -% Here is an example usage of the two main commands (\verb+citet+ and \verb+citep+): Some people thought a thing \citep{kour2014real, hadash2018estimate} but other people thought something else \citep{kour2014fast}. Many people have speculated that if we knew exactly why \citet{kour2014fast} thought this\dots -% -% \subsection{Figures} -% \lipsum[10] -% See Figure \ref{fig:fig1}. Here is how you add footnotes. \footnote{Sample of the first footnote.} -% \lipsum[11] -% -% \begin{figure} -% \centering -% \fbox{\rule[-.5cm]{4cm}{4cm} \rule[-.5cm]{4cm}{0cm}} -% \caption{Sample figure caption.} -% \label{fig:fig1} -% \end{figure} -% -% \subsection{Tables} -% See awesome Table~\ref{tab:table}. -% -% The documentation for \verb+booktabs+ (`Publication quality tables in LaTeX') is available from: -% \begin{center} -% \url{https://www.ctan.org/pkg/booktabs} -% \end{center} -% -% -% \begin{table} -% \caption{Sample table title} -% \centering -% \begin{tabular}{lll} -% \toprule -% \multicolumn{2}{c}{Part} \\ -% \cmidrule(r){1-2} -% Name & Description & Size ($\mu$m) \\ -% \midrule -% Dendrite & Input terminal & $\sim$100 \\ -% Axon & Output terminal & $\sim$10 \\ -% Soma & Cell body & up to $10^6$ \\ -% \bottomrule -% \end{tabular} -% \label{tab:table} -% \end{table} -% -% \subsection{Lists} -% \begin{itemize} -% \item Lorem ipsum dolor sit amet -% \item consectetur adipiscing elit. -% \item Aliquam dignissim blandit est, in dictum tortor gravida eget. In ac rutrum magna. -% \end{itemize} - - -\bibliographystyle{unsrtnat} -\bibliography{references} %%% Uncomment this line and comment out the ``thebibliography'' section below to use the external .bib file (using bibtex) . - - -%%% Uncomment this section and comment out the \bibliography{references} line above to use inline references. -% \begin{thebibliography}{1} - -% \bibitem{kour2014real} -% George Kour and Raid Saabne. -% \newblock Real-time segmentation of on-line handwritten arabic script. -% \newblock In {\em Frontiers in Handwriting Recognition (ICFHR), 2014 14th -% International Conference on}, pages 417--422. IEEE, 2014. - -% \bibitem{kour2014fast} -% George Kour and Raid Saabne. -% \newblock Fast classification of handwritten on-line arabic characters. -% \newblock In {\em Soft Computing and Pattern Recognition (SoCPaR), 2014 6th -% International Conference of}, pages 312--318. IEEE, 2014. - -% \bibitem{hadash2018estimate} -% Guy Hadash, Einat Kermany, Boaz Carmeli, Ofer Lavi, George Kour, and Alon -% Jacovi. -% \newblock Estimate and replace: A novel approach to integrating deep neural -% networks with existing applications. 
-% \newblock {\em arXiv preprint arXiv:1804.09028}, 2018. - -% \end{thebibliography} - -\section{Appendix} - -% Please add the following required packages to your document preamble: -\begin{table}[] - \begin{center} -\begin{tabular}{@{}rlll@{}} -\toprule -\textbf{Number of matches} & \textbf{Citation Provenance} & \textbf{Match Status} & \textbf{Match Reason} \\ \midrule -934932865 & crossref & exact & doi \\ -151366108 & fatcat-datacite & exact & doi \\ -65345275 & fatcat-pubmed & exact & pmid \\ -48778607 & fuzzy & strong & jaccardauthors \\ -42465250 & grobid & exact & doi \\ -29197902 & fatcat-pubmed & exact & doi \\ -19996327 & fatcat-crossref & exact & doi \\ -11996694 & fuzzy & strong & slugtitleauthormatch \\ -9157498 & fuzzy & strong & tokenizedauthors \\ -3547594 & grobid & exact & arxiv \\ -2310025 & fuzzy & exact & titleauthormatch \\ -1496515 & grobid & exact & pmid \\ -680722 & crossref & strong & jaccardauthors \\ -476331 & fuzzy & strong & versioneddoi \\ -449271 & grobid & exact & isbn \\ -230645 & fatcat-crossref & strong & jaccardauthors \\ -190578 & grobid & strong & jaccardauthors \\ -156657 & crossref & exact & isbn \\ -123681 & fatcat-pubmed & strong & jaccardauthors \\ -79328 & crossref & exact & arxiv \\ -57414 & crossref & strong & tokenizedauthors \\ -53480 & fuzzy & strong & pmiddoipair \\ -52453 & fuzzy & strong & dataciterelatedid \\ -47119 & grobid & strong & slugtitleauthormatch \\ -36774 & fuzzy & strong & arxivversion \\ -35311 & fuzzy & strong & customieeearxiv \\ -33863 & grobid & exact & pmcid \\ -23504 & crossref & strong & slugtitleauthormatch \\ -22753 & fatcat-crossref & strong & tokenizedauthors \\ -17720 & grobid & exact & titleauthormatch \\ -14656 & crossref & exact & titleauthormatch \\ -14438 & grobid & strong & tokenizedauthors \\ -7682 & fatcat-crossref & exact & arxiv \\ -5972 & fatcat-crossref & exact & isbn \\ -5525 & fatcat-pubmed & exact & arxiv \\ -4290 & fatcat-pubmed & strong & tokenizedauthors \\ -2745 & fatcat-pubmed & exact & isbn \\ -2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ -2273 & fatcat-crossref & strong & slugtitleauthormatch \\ -1960 & fuzzy & exact & workid \\ -1150 & fatcat-crossref & exact & titleauthormatch \\ -1041 & fatcat-pubmed & exact & titleauthormatch \\ -895 & fuzzy & strong & figshareversion \\ -317 & fuzzy & strong & titleartifact \\ -82 & grobid & strong & titleartifact \\ -33 & crossref & strong & titleartifact \\ -5 & fuzzy & strong & custombsiundated \\ -1 & fuzzy & strong & custombsisubdoc \\ -1 & fatcat & exact & doi \\ \bottomrule -\end{tabular} - \vspace*{2mm} - \caption{Table of match counts, reference provenance, match status and -match reason. 
The match reason identifier encode a specific rule in the domain -dependent verification process and are included for completeness - we do not -include the details of each rule in this report.} - \label{table:fields} -\end{center} -\end{table} - - -\end{document} diff --git a/docs/TR-20210730212057-IA-WDS-CG/references.bib b/docs/TR-20210730212057-IA-WDS-CG/references.bib deleted file mode 100644 index bcb8a16..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/references.bib +++ /dev/null @@ -1,123 +0,0 @@ -@inproceedings{kour2014real, - title={Real-time segmentation of on-line handwritten arabic script}, - author={Kour, George and Saabne, Raid}, - booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on}, - pages={417--422}, - year={2014}, - organization={IEEE} -} - -@inproceedings{kour2014fast, - title={Fast classification of handwritten on-line Arabic characters}, - author={Kour, George and Saabne, Raid}, - booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of}, - pages={312--318}, - year={2014}, - organization={IEEE}, - doi={10.1109/SOCPAR.2014.7008025} -} - -@article{hadash2018estimate, - title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications}, - author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon}, - journal={arXiv preprint arXiv:1804.09028}, - year={2018} -} - -@article{garfield1955citation, - title={Citation indexes for science}, - author={Garfield, Eugene}, - journal={Science}, - volume={122}, - number={3159}, - pages={108--111}, - year={1955}, - publisher={JSTOR} -} - -@inproceedings{lopez2009grobid, - title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications}, - author={Lopez, Patrice}, - booktitle={International conference on theory and practice of digital libraries}, - pages={473--474}, - year={2009}, - organization={Springer} -} - -@article{garfield2007evolution, - title={The evolution of the science citation index}, - author={Garfield, Eugene}, - journal={International microbiology}, - volume={10}, - number={1}, - pages={65}, - year={2007} -} - -@article{shotton2013publishing, - title={Publishing: open citations}, - author={Shotton, David}, - journal={Nature News}, - volume={502}, - number={7471}, - pages={295}, - year={2013} -} - -@misc{CitEc, - title = {Citations in Economics}, - howpublished = {\url{https://citec.repec.org/}}, - note = {Accessed: 2021-07-30} -} - -@inproceedings{wu2019citeseerx, - title={CiteSeerX: 20 years of service to scholarly big data}, - author={Wu, Jian and Kim, Kunho and Giles, C Lee}, - booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse}, - pages={1--4}, - year={2019} -} - -@inproceedings{sinha2015overview, - title={An overview of microsoft academic service (mas) and applications}, - author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan}, - booktitle={Proceedings of the 24th international conference on world wide web}, - pages={243--246}, - year={2015} -} - -@misc{i4oc, - title = {Initiative for Open Citations}, -howpublished = {\url{https://i4oc.org/}}, -note = {Accessed: 2021-07-30} -} - -@article{shotton2018funders, - title={Funders should mandate open citations.}, - author={Shotton, David}, - journal={Nature}, - volume={553}, - number={7686}, - pages={129--130}, - year={2018}, - publisher={Nature 
Publishing Group} -} - -@article{hutchins2021tipping, - title={A tipping point for open citation data}, - author={Hutchins, B Ian}, - journal={Quantitative Science Studies}, - pages={1--5}, - year={2021} -} - -@article{silbert1970world, - title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)}, - author={Silbert, Jeffrey M}, - journal={Criminology}, - volume={8}, - pages={107}, - year={1970}, - publisher={HeinOnline} -} - diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore b/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore new file mode 100644 index 0000000..5040d53 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore @@ -0,0 +1,5 @@ +*.log +*.aux +*.bbl +*.blg +*.out diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE b/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE new file mode 100644 index 0000000..9f5c70f --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Ruoho Ruotsi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile b/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile new file mode 100644 index 0000000..11264f8 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile @@ -0,0 +1,17 @@ +main.pdf: main.tex + latexindent -w main.tex && rm -f main.bak* + pdflatex main.tex + bibtex main + pdflatex main.tex + pdflatex main.tex + + +.PHONY: clean +clean: + rm -f main.pdf + rm -f main.aux + rm -f main.log + rm -f main.bbl + rm -f main.blg + rm -f main.out + diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/README.md b/docs/TR-20210808100000-IA-WDS-REFCAT/README.md new file mode 100644 index 0000000..3a56517 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/README.md @@ -0,0 +1,2 @@ +# latex-template-arxiv-preprint +A simple LaTeX template for Technical Reports, arXiv preprints & 2-column Conference papers diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf new file mode 100644 index 0000000..b21876a Binary files /dev/null and b/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf new file mode 100644 index 0000000..3b431cc Binary files /dev/null and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex new file mode 100644 index 0000000..e4febd9 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -0,0 +1,362 @@ +\documentclass[hidelinks,10pt,twocolumn]{article} +\usepackage{simpleConference} +\usepackage[utf8]{inputenc} +\usepackage{times} +\usepackage{graphicx} +\usepackage{natbib} +\usepackage{doi} +\usepackage{amssymb} +\usepackage{url,hyperref} +\usepackage{booktabs} % professional-quality tables +\usepackage{amsfonts} % blackboard math symbols +\usepackage{nicefrac} % compact symbols for 1/2, etc. +\usepackage{caption} + +\usepackage{datetime} +\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1} +\setlength{\parindent}{0pt} + +\begin{document} + +\title{Fatcat Reference Dataset} + +\author{Martin Czygan \\ + \\ + Internet Archive \\ + San Francisco, California, USA \\ + martin@archive.org \\ + \and + Bryan Newbold \\ + \\ + Internet Archive \\ + San Francisco, California, USA \\ + bnewbold@archive.org \\ + \\ +} + + +\maketitle +\thispagestyle{empty} + + +\begin{abstract} + As part of its scholarly data efforts, the Internet Archive releases a first version of a citation + graph dataset, named \emph{refcat}, derived from scholarly publications and + additional data sources. It is composed of data gathered by the fatcat + cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale + crawls targeting primary and secondary scholarly outputs, as well as metadata + from the Open Library\footnote{\url{https://openlibrary.org}} project and + Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the + graph consists of 1,323,423,672 citations. We release this dataset under a CC0 + Public Domain Dedication, accessible through an archive + item\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All + code used in the derivation process is released under an MIT + license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}. 
+\end{abstract}
+
+\keywords{Citation Graph, Web Archiving}
+
+\section{Introduction}
+
+
+The Internet Archive releases a first version of a citation graph dataset
+derived from a raw corpus of about 2.5B references gathered from metadata and
+from data obtained by PDF extraction tools such as
+GROBID\citep{lopez2009grobid}. Additionally, we consider integration with
+metadata from Open Library and Wikipedia.
+The goal of this report is to briefly describe the current contents of the
+dataset and its derivation. We expect
+this dataset to be iterated upon, with changes both in content and processing.
+
+Modern citation indexes can be traced back to the early computing age, when
+projects like the Science Citation Index (1955)\citep{garfield2007evolution}
+were first devised, living on in existing commercial knowledge bases today.
+Open alternatives, such as the Open Citations Corpus (OCC) started in 2010,
+followed; the first version of the OCC contained 6,325,178 individual
+references\citep{shotton2013publishing}. Other notable early projects
+include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last
+decade has seen the emergence of more openly available, large scale
+citation projects like Microsoft Academic\citep{sinha2015overview} or the
+Initiative for Open Citations\citep{i4oc, shotton2018funders}. In 2021,
+according to \citep{hutchins2021tipping}, over 1B citations are publicly
+available, marking a tipping point for this category of data.
+
+\section{Related Work}
+
+There are a few large scale citation datasets available today. COCI, the
+``OpenCitations Index of Crossref open DOI-to-DOI citations'', was first
+released on 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
+2021-07-29, it contains
+1,094,394,688 citations across 65,835,422 bibliographic
+resources\citep{peroni2020opencitations}.
+
+The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
+``a Wikimedia initiative to develop open citations and linked bibliographic
+data to serve free knowledge'', continuously adds citations to its database and
+as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
+publications\footnote{\url{http://wikicite.org/statistics.html}}.
+
+Microsoft Academic Graph\citep{sinha2015overview} comprises a number of
+entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
+with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
+  \url{https://archive.org/details/mag-2021-06-07}} the
+\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
+bibliographic entities.
+
+Numerous other projects have been or are concerned with various aspects of
+citation discovery and curation as part of their feature set, among them Semantic
+Scholar\citep{fricke2018semantic}, CiteSeerX\citep{li2006citeseerx} and AMiner\citep{tang2016aminer}.
+
+As mentioned in \citep{hutchins2021tipping}, the number of openly available
+citations is not expected to shrink in the future.
+
+
+\section{Dataset}
+
+We release the first version of the \emph{refcat} dataset in a format used
+internally for storage and to serve queries (and which we call \emph{biblioref}
+or \emph{bref} for short). The dataset includes metadata from fatcat, the
+Open Library Project and inbound links from the English Wikipedia.
+The fatcat
+project itself aggregates data from a variety of open data sources, such as
+Crossref\citep{crossref}, PubMed\citep{canese2013pubmed},
+DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others,
+as well as metadata generated from analysis of data preserved at the Internet
+Archive and active crawls of publication sites on the web.
+
+The dataset is
+integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users
+to explore inbound and outbound references\citep{fatcatguidereferencegraph}.
+
+The format records source and target (fatcat release and work) identifiers, a
+few attributes from the metadata (such as year or release stage) as well as
+information about the match status and provenance.
+
+The dataset currently contains 1,323,423,672 citations across 76,327,662
+entities (55,123,635 unique source and 60,244,206 unique target work
+identifiers; for 1,303,424,212 - or 98.49\% of all citations - we have a DOI
+for both source and target).
+The majority of matches - 1,250,523,321 - are established through identifier
+based matching (DOI, PMID, PMCID, arXiv, ISBN). 72,900,351 citations are
+established through fuzzy matching techniques.
+
+The majority of citations between \emph{refcat} and COCI overlap, as can be
+seen in~Table~\ref{table:cocicmp}.
+
+\begin{table}[]
+  \begin{center}
+    \begin{tabular}{ll}
+      \toprule
+      \bf{Set} & \bf{Count} \\
+      \midrule
+      COCI (C) & 1,094,394,688 \\
+      \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+      C $\cap$ R & 1,007,539,966 \\
+      C $\setminus$ R & 86,854,309 \\
+      R $\setminus$ C & 295,884,246
+    \end{tabular}
+    \vspace*{2mm}
+    \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
+      \emph{refcat} where entities have a known DOI. At least 50\% of the
+      295,884,246 references only in \emph{refcat-doi} come from links
+      recorded within a specific dataset provider (GBIF, DOI prefix:
+      10.15468).}
+    \label{table:cocicmp}
+  \end{center}
+\end{table}
+
+% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
+% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
+
+
+\section{System Design}
+
+The constraints for the system design are informed by the volume and the
+variety of the data. The capability to run the whole graph derivation on a
+single machine was a minor goal as well. In total, the raw inputs amount to a
+few terabytes of textual content, mostly newline delimited JSON. More
+importantly, while the number of data fields is low, certain schemas are very
+partial, with hundreds of different combinations of available field values
+found in the raw reference data. This is most likely caused by aggregators
+passing on reference data coming from hundreds of sources, which do not
+necessarily agree on a common granularity for citation data, and by artifacts
+of machine learning based structured data extraction tools.
+
+Each combination of fields may require a slightly different processing path.
+For example, references with an arXiv identifier can be processed differently
+from references with only a title. Over 50\% of the raw reference data comes
+from a set of eight field combinations, as listed in
+Table~\ref{table:fields}.
+
+\begin{table}[]
+  \begin{center}
+    \begin{tabular}{ll}
+      \toprule
+      \bf{Fields} & \bf{Percentage} \\
+      \midrule
+      \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
+      \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
+      \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
+    \end{tabular}
+    \vspace*{2mm}
+    \caption{Top 8 combinations of available fields in raw reference data,
+      accounting for about 53\% of the total data (CN = container name, CRN =
+      contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+      issue, Y = year, DOI = doi, PMID = pmid). The unstructured field may
+      contain any value. Identifiers are emphasized.}
+    \label{table:fields}
+  \end{center}
+\end{table}
+
+Overall, a map-reduce style\citep{dean2010mapreduce} approach is
+followed\footnote{While the operations are similar, the processing is not
+  distributed but runs on a single machine. For space efficiency,
+  zstd\citep{collet2018zstandard} is used to compress raw data and
+  derivations.}, which allows for some uniformity in the overall processing.
+We extract (key, document) tuples (as
+TSV) from the raw JSON data and sort by key. We then group documents with the
+same key and apply a function on each group in order to generate
+our target schema or to perform
+additional operations such as deduplication or fusion of matched and unmatched references.
+
+The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
+based on a value normalization, like slugifying a title string. For identifier
+based matches we can generate the target schema directly. For fuzzy matching
+candidates, we pass possible match pairs through a verification procedure,
+which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a
+domain dependent, rule based verification, able to identify different versions
+of a publication, preprint-published pairs and other documents that are
+similar according to various metrics calculated over title and author fields.
+The fuzzy matching approach is applied to all reference documents without an
+identifier (a title is currently required).
+
+With a few schema conversions, fuzzy matching can be applied to Wikipedia
+articles and Open Library (edition) records as well. The aspects of precision
+and recall are addressed by the two stages: we are generous in the match
+candidate generation phase in order to improve recall, but we are strict during
+verification, in order to control precision. Quality assurance for verification is
+implemented through a growing list of test cases, consisting of real examples from the catalog and
+their expected or desired match status\footnote{The list can be found under:
+  \url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
+  It is helpful to keep this test suite independent of any specific programming language.}.
+
+
+\section{Limitations and Future Work}
+
+As with other datasets in this field, we expect this dataset to be iterated upon.
+
+\begin{itemize}
+  \item The fatcat catalog updates its metadata
+        continuously\footnote{A changelog can currently be followed here:
+          \url{https://fatcat.wiki/changelog}} and web crawls are conducted
+        regularly. Current processing pipelines cover raw reference snapshot
+        creation and derivation of the graph structure, which allows us to
+        rerun processing as updated data becomes available.
+
+  \item Metadata extraction from PDFs depends on supervised machine learning
+        models, which in turn depend on available training datasets. With additional crawls and
+        metadata available we hope to improve the models used for metadata
+        extraction, improving yield and reducing data extraction artifacts in
+        the process.
+
+  \item As of this version, a number of raw reference
+        docs remain unmatched, which means that neither exact nor fuzzy matching
+        has detected a link to a known entity. In part, this can hint at
+        missing metadata. However, parts of the data
+        will contain a reference to a catalogued entity, but in a specific,
+        dense and harder to recover form. Addressing these cases will also
+        require improvements to the fuzzy matching approach.
+  \item The reference dataset contains millions of URLs and their integration
+        into the graph has been implemented as a prototype. A full implementation
+        requires a few data cleanup and normalization steps.
+\end{itemize}
+
+\section{Acknowledgements}
+
+This work is partially supported by a grant from the \emph{Andrew W. Mellon
+  Foundation}.
+
+
+\section{Appendix A}
+
+
+A note on data quality: while we implement various data quality measures,
+real-world data, especially data coming from many different sources, will
+contain issues. Among other measures, we keep track of match reasons,
+especially for fuzzy matching, to be able to zoom in on systematic errors
+more easily (see~Table~\ref{table:matches}).
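+
+To make the notion of match status and match reason more concrete, the
+following sketch shows the general shape of such a rule cascade. It is a
+minimal, illustrative example for this report only; the function names, the
+threshold and the reason labels are simplified and do not mirror the actual
+implementation.
+
+\begin{verbatim}
+import re
+
+def slug(title):
+    # Lowercase and strip everything except letters and digits.
+    return re.sub(r"[^a-z0-9]", "", (title or "").lower())
+
+def tokens(name):
+    # Lowercased word tokens of an author string, punctuation dropped.
+    return set(re.findall(r"[a-z]+", (name or "").lower()))
+
+def jaccard(a, b):
+    return len(a & b) / len(a | b) if (a or b) else 0.0
+
+def verify(ref, release):
+    # Rules are tested in order; the first rule that fires
+    # determines the (status, reason) pair.
+    if ref.get("doi") and ref.get("doi") == release.get("doi"):
+        return "exact", "doi"
+    t_ref, t_rel = slug(ref.get("title")), slug(release.get("title"))
+    if t_ref and t_ref == t_rel:
+        overlap = jaccard(tokens(ref.get("authors")),
+                          tokens(release.get("authors")))
+        if overlap > 0.5:
+            return "strong", "jaccardauthors"
+        return "weak", "slugtitle"
+    return "none", "mismatch"
+
+# Same slugified title and overlapping author tokens
+# yields ("strong", "jaccardauthors").
+print(verify({"title": "A tipping point", "authors": "B Ian Hutchins"},
+             {"title": "A Tipping Point!", "authors": "Hutchins, B. Ian"}))
+\end{verbatim}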
+ +\begin{table}[] + \footnotesize + \captionsetup{font=normalsize} + \begin{center} + \begin{tabular}{@{}rlll@{}} + \toprule + \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule + 934932865 & crossref & exact & doi \\ + 151366108 & fatcat-datacite & exact & doi \\ + 65345275 & fatcat-pubmed & exact & pmid \\ + 48778607 & fuzzy & strong & jaccardauthors \\ + 42465250 & grobid & exact & doi \\ + 29197902 & fatcat-pubmed & exact & doi \\ + 19996327 & fatcat-crossref & exact & doi \\ + 11996694 & fuzzy & strong & slugtitleauthormatch \\ + 9157498 & fuzzy & strong & tokenizedauthors \\ + 3547594 & grobid & exact & arxiv \\ + 2310025 & fuzzy & exact & titleauthormatch \\ + 1496515 & grobid & exact & pmid \\ + 680722 & crossref & strong & jaccardauthors \\ + 476331 & fuzzy & strong & versioneddoi \\ + 449271 & grobid & exact & isbn \\ + 230645 & fatcat-crossref & strong & jaccardauthors \\ + 190578 & grobid & strong & jaccardauthors \\ + 156657 & crossref & exact & isbn \\ + 123681 & fatcat-pubmed & strong & jaccardauthors \\ + 79328 & crossref & exact & arxiv \\ + 57414 & crossref & strong & tokenizedauthors \\ + 53480 & fuzzy & strong & pmiddoipair \\ + 52453 & fuzzy & strong & dataciterelatedid \\ + 47119 & grobid & strong & slugtitleauthormatch \\ + 36774 & fuzzy & strong & arxivversion \\ + % 35311 & fuzzy & strong & customieeearxiv \\ + % 33863 & grobid & exact & pmcid \\ + % 23504 & crossref & strong & slugtitleauthormatch \\ + % 22753 & fatcat-crossref & strong & tokenizedauthors \\ + % 17720 & grobid & exact & titleauthormatch \\ + % 14656 & crossref & exact & titleauthormatch \\ + % 14438 & grobid & strong & tokenizedauthors \\ + % 7682 & fatcat-crossref & exact & arxiv \\ + % 5972 & fatcat-crossref & exact & isbn \\ + % 5525 & fatcat-pubmed & exact & arxiv \\ + % 4290 & fatcat-pubmed & strong & tokenizedauthors \\ + % 2745 & fatcat-pubmed & exact & isbn \\ + % 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ + % 2273 & fatcat-crossref & strong & slugtitleauthormatch \\ + % 1960 & fuzzy & exact & workid \\ + % 1150 & fatcat-crossref & exact & titleauthormatch \\ + % 1041 & fatcat-pubmed & exact & titleauthormatch \\ + % 895 & fuzzy & strong & figshareversion \\ + % 317 & fuzzy & strong & titleartifact \\ + % 82 & grobid & strong & titleartifact \\ + % 33 & crossref & strong & titleartifact \\ + % 5 & fuzzy & strong & custombsiundated \\ + % 1 & fuzzy & strong & custombsisubdoc \\ + % 1 & fatcat & exact & doi \\ \bottomrule + \end{tabular} + \vspace*{2mm} + \caption{Table of match counts (top 25), reference provenance, match status and + match reason. 
The match reason identifier encode a specific rule in the domain + dependent verification process and are included for completeness - we do not + include the details of each rule in this report.} + \label{table:matches} + \end{center} +\end{table} + +\bibliographystyle{abbrv} +% \bibliographystyle{plainnat} +\bibliography{refs} +\end{document} diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib new file mode 100644 index 0000000..c61021e --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib @@ -0,0 +1,228 @@ +@inproceedings{kour2014real, + title={Real-time segmentation of on-line handwritten arabic script}, + author={Kour, George and Saabne, Raid}, + booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on}, + pages={417--422}, + year={2014}, + organization={IEEE} +} + +@inproceedings{kour2014fast, + title={Fast classification of handwritten on-line Arabic characters}, + author={Kour, George and Saabne, Raid}, + booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of}, + pages={312--318}, + year={2014}, + organization={IEEE}, + doi={10.1109/SOCPAR.2014.7008025} +} + +@article{hadash2018estimate, + title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications}, + author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon}, + journal={arXiv preprint arXiv:1804.09028}, + year={2018} +} + +@article{garfield1955citation, + title={Citation indexes for science}, + author={Garfield, Eugene}, + journal={Science}, + volume={122}, + number={3159}, + pages={108--111}, + year={1955}, + publisher={JSTOR} +} + +@inproceedings{lopez2009grobid, + title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications}, + author={Lopez, Patrice}, + booktitle={International conference on theory and practice of digital libraries}, + pages={473--474}, + year={2009}, + organization={Springer} +} + +@article{garfield2007evolution, + title={The evolution of the science citation index}, + author={Garfield, Eugene}, + journal={International microbiology}, + volume={10}, + number={1}, + pages={65}, + year={2007} +} + +@article{shotton2013publishing, + title={Publishing: open citations}, + author={Shotton, David}, + journal={Nature News}, + volume={502}, + number={7471}, + pages={295}, + year={2013} +} + +@misc{CitEc, + title = {Citations in Economics}, + howpublished = {\url{https://citec.repec.org/}}, + note = {Accessed: 2021-07-30} +} + +@inproceedings{wu2019citeseerx, + title={CiteSeerX: 20 years of service to scholarly big data}, + author={Wu, Jian and Kim, Kunho and Giles, C Lee}, + booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse}, + pages={1--4}, + year={2019} +} + +@inproceedings{li2006citeseerx, + title={CiteSeerx: an architecture and web service design for an academic document search engine}, + author={Li, Huajing and Councill, Isaac and Lee, Wang-Chien and Giles, C Lee}, + booktitle={Proceedings of the 15th international conference on World Wide Web}, + pages={883--884}, + year={2006} +} + + +@inproceedings{sinha2015overview, + title={An overview of microsoft academic service (mas) and applications}, + author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan}, + booktitle={Proceedings of the 24th international conference 
on world wide web}, + pages={243--246}, + year={2015} +} + +@misc{i4oc, + title = {Initiative for Open Citations}, +howpublished = {\url{https://i4oc.org/}}, +note = {Accessed: 2021-07-30} +} + +@misc{fatcatguidereferencegraph, +title = {The Fatcat Guide: Reference Graph (refcat)}, +howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}}, +note = {Accessed: 2021-08-08} +} + +@misc{crossref, +title = {Crossref}, +howpublished = {\url{https://crossref.org}}, +note = {Accessed: 2021-08-08} +} + +@misc{doaj, +title = {Directory of Open Access Journals}, +howpublished = {\url{https://doaj.org}}, +note = {Accessed: 2021-08-08} +} + +@inproceedings{ley2002dblp, + title={The DBLP computer science bibliography: Evolution, research issues, perspectives}, + author={Ley, Michael}, + booktitle={International symposium on string processing and information retrieval}, + pages={1--10}, + year={2002}, + organization={Springer} +} + + +@inproceedings{brase2009datacite, + title={DataCite-A global registration agency for research data}, + author={Brase, Jan}, + booktitle={2009 fourth international conference on cooperation and promotion of information resources in science and technology}, + pages={257--261}, + year={2009}, + organization={IEEE} +} + +@article{canese2013pubmed, + title={PubMed: the bibliographic database}, + author={Canese, Kathi and Weis, Sarah}, + journal={The NCBI Handbook}, + volume={2}, + pages={1}, + year={2013}, + publisher={National Center for Biotechnology Information (US)} +} + + +@article{shotton2018funders, + title={Funders should mandate open citations.}, + author={Shotton, David}, + journal={Nature}, + volume={553}, + number={7686}, + pages={129--130}, + year={2018}, + publisher={Nature Publishing Group} +} + +@article{hutchins2021tipping, + title={A tipping point for open citation data}, + author={Hutchins, B Ian}, + journal={Quantitative Science Studies}, + pages={1--5}, + year={2021} +} + +@article{silbert1970world, + title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)}, + author={Silbert, Jeffrey M}, + journal={Criminology}, + volume={8}, + pages={107}, + year={1970}, + publisher={HeinOnline} +} + +@article{peroni2020opencitations, + title={OpenCitations, an infrastructure organization for open scholarship}, + author={Peroni, Silvio and Shotton, David}, + journal={Quantitative Science Studies}, + volume={1}, + number={1}, + pages={428--444}, + year={2020}, + publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} +} + +@article{fricke2018semantic, + title={Semantic scholar}, + author={Fricke, Suzanne}, + journal={Journal of the Medical Library Association: JMLA}, + volume={106}, + number={1}, + pages={145}, + year={2018}, + publisher={Medical Library Association} +} + +@inproceedings{tang2016aminer, + title={AMiner: Toward understanding big scholar data}, + author={Tang, Jie}, + booktitle={Proceedings of the ninth ACM international conference on web search and data mining}, + pages={467--467}, + year={2016} +} + +@article{dean2010mapreduce, + title={MapReduce: a flexible data processing tool}, + author={Dean, Jeffrey and Ghemawat, Sanjay}, + journal={Communications of the ACM}, + volume={53}, + number={1}, + pages={72--77}, + year={2010}, + publisher={ACM New York, NY, USA} +} + +@article{collet2018zstandard, + title={Zstandard Compression and the application/zstd Media Type}, + author={Collet, Yann and Kucherawy, Murray}, + 
journal={RFC 8478}, + year={2018} +} + diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty b/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty new file mode 100644 index 0000000..d4d4764 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty @@ -0,0 +1,136 @@ +% --------------------------------------------------------------- +% Style file for simple, two column conference papers. +% Based on latex8.sty by Paolo.Ienne@di.epfl.ch +% --------------------------------------------------------------- +% Use with LaTeX2e as: +% \documentclass[times,10pt,twocolumn]{article} +% \usepackage{simpleConference} +% \usepackage{times} +% --------------------------------------------------------------- +% specify references as +% \bibliographystyle{simpleConference} +% \bibliography{...your files...} +% +% use Section{} and SubSection{} instead of standard section{} +% and subsection{} to obtain headings in the form +% "1.3. My heading" +% --------------------------------------------------------------- +% ten point helvetica bold required for captions +% in some sites the name of the helvetica bold font may differ, +% change the name here: +\font\tenhv = phvb at 10pt + +% eleven point times bold required for second-order headings +\font\elvbf = ptmb scaled 1100 + +% set dimensions of columns, gap between columns, and paragraph indent +\setlength{\textheight}{8.875in} +\setlength{\textwidth}{6.875in} +\setlength{\columnsep}{0.3125in} +\setlength{\topmargin}{0in} +\setlength{\headheight}{0in} +\setlength{\headsep}{0in} +\setlength{\parindent}{1pc} +\setlength{\oddsidemargin}{-.304in} +\setlength{\evensidemargin}{-.304in} + +% memento from size10.clo +% \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} +% \small{\@setfontsize\small\@ixpt{11}} +% \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} +% \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} +% \tiny{\@setfontsize\tiny\@vpt\@vipt} +% \large{\@setfontsize\large\@xiipt{14}} +% \Large{\@setfontsize\Large\@xivpt{18}} +% \LARGE{\@setfontsize\LARGE\@xviipt{22}} +% \huge{\@setfontsize\huge\@xxpt{25}} +% \Huge{\@setfontsize\Huge\@xxvpt{30}} + +\def\@maketitle + { + \newpage + \null + \vskip .375in + \begin{center} + {\Large \bf \@title \par} + % additional two empty lines at the end of the title + \vspace*{24pt} + { + \large + \lineskip .5em + \begin{tabular}[t]{c} + \@author + \end{tabular} + \par + } + % additional small space at the end of the author name + \vskip .5em + { + \large + \begin{tabular}[t]{c} + \@affiliation + \end{tabular} + \par + \ifx \@empty \@email + \else + \begin{tabular}{r@{~}l} + E-mail: & {\tt \@email} + \end{tabular} + \par + \fi + } + % additional empty line at the end of the title block + \vspace*{12pt} + \end{center} + } + +\def\abstract + {% + \centerline{\large\bf Abstract}% + \vspace*{12pt}% +% \it% %%%% iroro - commenting out italicized abstract + } + +\def\endabstract + { + % additional empty line at the end of the abstract + \vspace*{12pt} + } + +\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} + +\def\email#1{\gdef\@email{#1}} +\gdef\@email{} + +\newlength{\@ctmp} +\newlength{\@figindent} +\setlength{\@figindent}{1pc} + +\long\def\@makecaption#1#2{ + \vskip 10pt + \setbox\@tempboxa\hbox{\tenhv\noindent #1.~#2} + \setlength{\@ctmp}{\hsize} + \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} + % IF longer than one indented paragraph line + \ifdim \wd\@tempboxa >\@ctmp + % THEN set as an indented paragraph + 
\begin{list}{}{\leftmargin\@figindent \rightmargin\leftmargin} + \item[]\tenhv #1.~#2\par + \end{list} + \else + % ELSE center + \hbox to\hsize{\hfil\box\@tempboxa\hfil} + \fi} + +% correct heading spacing and type +\def\section{\@startsection {section}{1}{\z@} + {14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}} +\def\subsection{\@startsection {subsection}{2}{\z@} + {13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}} + +% add the period after section numbers +\newcommand{\Section}[1]{\section{\hskip -1em.~#1}} +\newcommand{\SubSection}[1]{\subsection{\hskip -1em.~#1}} + +% end of file latex8.sty +% --------------------------------------------------------------- -- cgit v1.2.3