aboutsummaryrefslogtreecommitdiffstats
path: root/docs/TR-20210808100000-IA-WDS-REFCAT
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-08-08 15:18:29 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-08-08 15:18:29 +0200
commitbd66b58cded2c2c7e7b7e5d374434d6531dd70de (patch)
tree00417812b9787ab4492e2c590fcf1bf6f4b576e7 /docs/TR-20210808100000-IA-WDS-REFCAT
parentbb64b3aa62267676302e75f0ca44157b514beec4 (diff)
downloadrefcat-bd66b58cded2c2c7e7b7e5d374434d6531dd70de.tar.gz
refcat-bd66b58cded2c2c7e7b7e5d374434d6531dd70de.zip
docs: cleanup and naming
Diffstat (limited to 'docs/TR-20210808100000-IA-WDS-REFCAT')
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore5
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE21
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/Makefile17
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/README.md2
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdfbin0 -> 215353 bytes
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.pdfbin0 -> 95636 bytes
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.tex362
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib228
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty136
9 files changed, 771 insertions, 0 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore b/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore
new file mode 100644
index 0000000..5040d53
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore
@@ -0,0 +1,5 @@
+*.log
+*.aux
+*.bbl
+*.blg
+*.out
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE b/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE
new file mode 100644
index 0000000..9f5c70f
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Ruoho Ruotsi
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile b/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile
new file mode 100644
index 0000000..11264f8
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile
@@ -0,0 +1,17 @@
+main.pdf: main.tex
+ latexindent -w main.tex && rm -f main.bak*
+ pdflatex main.tex
+ bibtex main
+ pdflatex main.tex
+ pdflatex main.tex
+
+
+.PHONY: clean
+clean:
+ rm -f main.pdf
+ rm -f main.aux
+ rm -f main.log
+ rm -f main.bbl
+ rm -f main.blg
+ rm -f main.out
+
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/README.md b/docs/TR-20210808100000-IA-WDS-REFCAT/README.md
new file mode 100644
index 0000000..3a56517
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/README.md
@@ -0,0 +1,2 @@
+# latex-template-arxiv-preprint
+A simple LaTeX template for Technical Reports, arXiv preprints & 2-column Conference papers
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf
new file mode 100644
index 0000000..b21876a
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf
Binary files differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
new file mode 100644
index 0000000..3b431cc
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
Binary files differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
new file mode 100644
index 0000000..e4febd9
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -0,0 +1,362 @@
+\documentclass[hidelinks,10pt,twocolumn]{article}
+\usepackage{simpleConference}
+\usepackage[utf8]{inputenc}
+\usepackage{times}
+\usepackage{graphicx}
+\usepackage{natbib}
+\usepackage{doi}
+\usepackage{amssymb}
+\usepackage{url,hyperref}
+\usepackage{booktabs} % professional-quality tables
+\usepackage{amsfonts} % blackboard math symbols
+\usepackage{nicefrac} % compact symbols for 1/2, etc.
+\usepackage{caption}
+
+\usepackage{datetime}
+\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1}
+\setlength{\parindent}{0pt}
+
+\begin{document}
+
+\title{Fatcat Reference Dataset}
+
+\author{Martin Czygan \\
+ \\
+ Internet Archive \\
+ San Francisco, California, USA \\
+ martin@archive.org \\
+ \and
+ Bryan Newbold \\
+ \\
+ Internet Archive \\
+ San Francisco, California, USA \\
+ bnewbold@archive.org \\
+ \\
+}
+
+
+\maketitle
+\thispagestyle{empty}
+
+
+\begin{abstract}
+ As part of its scholarly data efforts, the Internet Archive releases a first version of a citation
+ graph dataset, named \emph{refcat}, derived from scholarly publications and
+ additional data sources. It is composed of data gathered by the fatcat
+ cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale
+ crawls targeting primary and secondary scholarly outputs, as well as metadata
+ from the Open Library\footnote{\url{https://openlibrary.org}} project and
+ Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
+ graph consists of 1,323,423,672 citations. We release this dataset under a CC0
+ Public Domain Dedication, accessible through an archive
+ item\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
+ code used in the derivation process is released under an MIT
+ license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
+\end{abstract}
+
+\keywords{Citation Graph, Web Archiving}
+
+\section{Introduction}
+
+
+The Internet Archive releases a first version of a citation graph dataset
+derived from a raw corpus of about 2.5B references gathered from metadata and
+data obtained by PDF extraction tools such as
+GROBID\cite{lopez2009grobid}. Additionally, we consider integration with
+metadata from Open Library and Wikipedia.
+The goal of this report is to describe briefly the current contents and the
+derivation of the dataset. We expect
+this dataset to be iterated upon, with changes both in content and processing.
+
+Modern citation indexes can be traced back to the early computing age, when
+projects like the Science Citation Index (1955)\citep{garfield2007evolution}
+were first devised, living on in existing commercial knowledge bases today.
+Open alternatives were started such as the Open Citations Corpus (OCC) in 2010
+- the first version of which contained 6,325,178 individual
+references\citep{shotton2013publishing}. Other notable early projects
+include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last
+decade has seen the emergence of more openly available, large scale
+citation projects like Microsoft Academic\citep{sinha2015overview} or the
+Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021,
+according to \citep{hutchins2021tipping} over 1B citations are publicly
+available, marking a tipping point for this category of data.
+
+\section{Related Work}
+
+There are a few large scale citation datasets available today. COCI, the
+``OpenCitations Index of Crossref open DOI-to-DOI citations'' was first
+released 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
+2021-07-29, it contains
+1,094,394,688 citations across 65,835,422 bibliographic
+resources\citep{peroni2020opencitations}.
+
+The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
+``a Wikimedia initiative to develop open citations and linked bibliographic
+data to serve free knowledge'' continuously adds citations to its database and
+as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
+publications\footnote{\url{http://wikicite.org/statistics.html}}.
+
+Microsoft Academic Graph\citep{sinha2015overview} is comprised of a number of
+entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
+with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
+ \url{https://archive.org/details/mag-2021-06-07}} the
+\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
+bibliographic entities.
+
+Numerous other projects have been or are concerned with various aspects of
+citation discovery and curation as part of their feature set, among them Semantic
+Scholar\citep{fricke2018semantic}, CiteSeerX\citep{li2006citeseerx} or Aminer\citep{tang2016aminer}.
+
+As mentioned in \citep{hutchins2021tipping}, the number of openly available
+citations is not expected to shrink in the future.
+
+
+\section{Dataset}
+
+We release the first version of the \emph{refcat} dataset in a format used
+internally for storage and to serve queries (and which we call \emph{biblioref}
+or \emph{bref} for short). The dataset includes metadata from fatcat, the
+Open Library Project and inbound links from the English Wikipedia. The fatcat
+project itself aggregates data from a variety of open data sources, such as
+Crossref\citep{crossref}, PubMed\citep{canese2013pubmed},
+DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others,
+as well as metadata generated from analysis of data preserved at the Internet
+Archive and active crawls of publication sites on the web.
+
+The dataset is
+integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users
+to explore inbound and outbound references\cite{fatcatguidereferencegraph}.
+
+The format records source and target (fatcat release and work) identifiers, a
+few attributes from the metadata (such as year or release stage) as well as
+information about the match status and provenance.
+
+The dataset currently contains 1,323,423,672 citations across 76,327,662
+entities (55,123,635 unique source and 60,244,206 unique target work
+identifiers; for 1,303,424,212 - or 98.49\% of all citations - we do have a DOI
+for both source and target).
+The majority of matches - 1,250,523,321 - are established through identifier
+based matching (DOI, PMID, PMCID, ARXIV, ISBN). 72,900,351 citations are
+established through fuzzy matching techniques.
+
+The majority of citations between \emph{refcat} and COCI overlap, as can be
+seen in~Table~\ref{table:cocicmp}.
+
+\begin{table}[]
+ \begin{center}
+ \begin{tabular}{ll}
+ \toprule
+ \bf{Set} & \bf{Count} \\
+
+ \midrule
+ COCI (C) & 1,094,394,688 \\
+ \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+ C $\cap$ R & 1,007,539,966 \\
+ C $\setminus$ R & 86,854,309 \\
+ R $\setminus$ C & 295,884,246
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
+ \emph{refcat} where entities have a known DOI. At least 50\% of the
+ 295,884,246 references only in \emph{refcat-doi} come from links
+ recorded within a specific dataset provider (GBIF, DOI prefix:
+ 10.15468).}
+ \label{table:cocicmp}
+ \end{center}
+\end{table}
+
+% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
+% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
+
+
+\section{System Design}
+
+The constraints for the systems design are informed by the volume and the
+variety of the data. The capability to run the whole graph derivation on a
+single machine was a minor goal as well. In total, the raw inputs amount to a
+few terabytes of textual content, mostly newline delimited JSON. More
+importantly, while the number of data fields is low, certain schemas are very
+partial with hundreds of different combinations of available field values found
+in the raw reference data. This is most likely caused by aggregators passing on
+reference data coming from hundreds of sources, each of which not necessarily
+agreeing on a common granularity for citation data and from artifacts of
+machine learning based structured data extraction tools.
+
+Each combination of fields may require a slightly different processing path.
+For example, references with an Arxiv identifier can be processed differently
+from references with only a title. Over 50\% of the raw reference data comes
+from a set of eight field set manifestations, as listed in
+Table~\ref{table:fields}.
+
+\begin{table}[]
+ \begin{center}
+ \begin{tabular}{ll}
+ \toprule
+ \bf{Fields} & \bf{Percentage} \\
+ \midrule
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
+ \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
+ \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Top 8 combinations of available fields in raw reference data
+ accounting for about 53\% of the total data (CN = container name, CRN =
+ contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+ issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
+ \label{table:fields}
+ \end{center}
+\end{table}
+
+Overall, a map-reduce style\citep{dean2010mapreduce} approach is
+followed\footnote{While the operations are similar, the processing is not
+ distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows
+for some
+uniformity in the overall processing. We extract (key, document) tuples (as
+TSV) from the raw JSON data and sort by key. We then group documents with the
+same key and apply a function on each group in order to generate
+our target schema or perform
+additional operations such as deduplication or fusion of matched and unmatched references.
+
+The key derivation can be exact (via an identifier like DOI, PMID, etc) or
+based on a value normalization, like slugifying a title string. For identifier
+based matches we can generate the target schema directly. For fuzzy matching
+candidates, we pass possible match pairs through a verification procedure,
+which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a
+domain dependent rule based verification, able to identify different versions
+of a publication, preprint-published pairs and documents which
+are similar by various metrics calculated over title and author fields. The fuzzy matching
+approach is applied on all reference documents without identifier (a title is
+currently required).
+
+With a few schema conversions, fuzzy matching can be applied to Wikipedia
+articles and Open Library (edition) records as well. The aspects of precision
+and recall are represented by the two stages: we are generous in the match
+candidate generation phase in order to improve recall, but we are strict during
+verification, in order to control precision. Quality assurance for verification is
+implemented through a growing list of test cases of real examples from the catalog and
+their expected or desired match status\footnote{The list can be found under:
+ \url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
+ It is helpful to keep this test suite independent of any specific programming language.}.
+
+
+\section{Limitations and Future Work}
+
+As with other datasets in this field, we expect this dataset to be iterated upon.
+
+\begin{itemize}
+ \item The fatcat catalog updates its metadata
+ continuously\footnote{A changelog can currently be followed here:
+ \url{https://fatcat.wiki/changelog}} and web crawls are conducted
+ regularly. Current processing pipelines cover raw reference snapshot
+ creation and derivation of the graph structure, which allows us to rerun
+ processing based on updated data as it becomes available.
+
+ \item Metadata extraction from PDFs depends on supervised machine learning
+ models, which in turn depend on available training datasets. With additional crawls and
+ metadata available we hope to improve models used for metadata
+ extraction, improving yield and reducing data extraction artifacts in
+ the process.
+
+ \item As of this version, a number of raw reference
+ docs remain unmatched, which means that neither exact nor fuzzy matching
+ has detected a link to a known entity. On the one
+ hand, this can hint at missing metadata. However, parts of the data
+ will contain a reference to a catalogued entity, but in a specific,
+ dense and harder to recover form.
+ This also includes improvements to the fuzzy matching approach.
+ \item The reference dataset contains millions of URLs and their integration
+ into the graph has been implemented as prototype. A full implementation
+ requires a few data cleanup and normalization steps.
+\end{itemize}
+
+\section{Acknowledgements}
+
+This work is partially supported by a grant from the \emph{Andrew W. Mellon
+ Foundation}.
+
+
+\section{Appendix A}
+
+
+A note on data quality: While we implement various data quality measures,
+real-world data, especially when coming from many different sources, will contain
+issues. Among other measures, we keep track of match reasons,
+especially for fuzzy matching, to be able to zoom in on systematic errors
+more easily (see~Table~\ref{table:matches}).
+
+\begin{table}[]
+ \footnotesize
+ \captionsetup{font=normalsize}
+ \begin{center}
+ \begin{tabular}{@{}rlll@{}}
+ \toprule
+ \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule
+ 934932865 & crossref & exact & doi \\
+ 151366108 & fatcat-datacite & exact & doi \\
+ 65345275 & fatcat-pubmed & exact & pmid \\
+ 48778607 & fuzzy & strong & jaccardauthors \\
+ 42465250 & grobid & exact & doi \\
+ 29197902 & fatcat-pubmed & exact & doi \\
+ 19996327 & fatcat-crossref & exact & doi \\
+ 11996694 & fuzzy & strong & slugtitleauthormatch \\
+ 9157498 & fuzzy & strong & tokenizedauthors \\
+ 3547594 & grobid & exact & arxiv \\
+ 2310025 & fuzzy & exact & titleauthormatch \\
+ 1496515 & grobid & exact & pmid \\
+ 680722 & crossref & strong & jaccardauthors \\
+ 476331 & fuzzy & strong & versioneddoi \\
+ 449271 & grobid & exact & isbn \\
+ 230645 & fatcat-crossref & strong & jaccardauthors \\
+ 190578 & grobid & strong & jaccardauthors \\
+ 156657 & crossref & exact & isbn \\
+ 123681 & fatcat-pubmed & strong & jaccardauthors \\
+ 79328 & crossref & exact & arxiv \\
+ 57414 & crossref & strong & tokenizedauthors \\
+ 53480 & fuzzy & strong & pmiddoipair \\
+ 52453 & fuzzy & strong & dataciterelatedid \\
+ 47119 & grobid & strong & slugtitleauthormatch \\
+ 36774 & fuzzy & strong & arxivversion \\
+ % 35311 & fuzzy & strong & customieeearxiv \\
+ % 33863 & grobid & exact & pmcid \\
+ % 23504 & crossref & strong & slugtitleauthormatch \\
+ % 22753 & fatcat-crossref & strong & tokenizedauthors \\
+ % 17720 & grobid & exact & titleauthormatch \\
+ % 14656 & crossref & exact & titleauthormatch \\
+ % 14438 & grobid & strong & tokenizedauthors \\
+ % 7682 & fatcat-crossref & exact & arxiv \\
+ % 5972 & fatcat-crossref & exact & isbn \\
+ % 5525 & fatcat-pubmed & exact & arxiv \\
+ % 4290 & fatcat-pubmed & strong & tokenizedauthors \\
+ % 2745 & fatcat-pubmed & exact & isbn \\
+ % 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\
+ % 2273 & fatcat-crossref & strong & slugtitleauthormatch \\
+ % 1960 & fuzzy & exact & workid \\
+ % 1150 & fatcat-crossref & exact & titleauthormatch \\
+ % 1041 & fatcat-pubmed & exact & titleauthormatch \\
+ % 895 & fuzzy & strong & figshareversion \\
+ % 317 & fuzzy & strong & titleartifact \\
+ % 82 & grobid & strong & titleartifact \\
+ % 33 & crossref & strong & titleartifact \\
+ % 5 & fuzzy & strong & custombsiundated \\
+ % 1 & fuzzy & strong & custombsisubdoc \\
+ % 1 & fatcat & exact & doi \\ \bottomrule
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Table of match counts (top 25), reference provenance, match status and
+ match reason. The match reason identifiers encode a specific rule in the domain
+ dependent verification process and are included for completeness - we do not
+ include the details of each rule in this report.}
+ \label{table:matches}
+ \end{center}
+\end{table}
+
+\bibliographystyle{abbrv}
+% \bibliographystyle{plainnat}
+\bibliography{refs}
+\end{document}
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib
new file mode 100644
index 0000000..c61021e
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib
@@ -0,0 +1,228 @@
+@inproceedings{kour2014real,
+ title={Real-time segmentation of on-line handwritten arabic script},
+ author={Kour, George and Saabne, Raid},
+ booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on},
+ pages={417--422},
+ year={2014},
+ organization={IEEE}
+}
+
+@inproceedings{kour2014fast,
+ title={Fast classification of handwritten on-line Arabic characters},
+ author={Kour, George and Saabne, Raid},
+ booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of},
+ pages={312--318},
+ year={2014},
+ organization={IEEE},
+ doi={10.1109/SOCPAR.2014.7008025}
+}
+
+@article{hadash2018estimate,
+ title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications},
+ author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon},
+ journal={arXiv preprint arXiv:1804.09028},
+ year={2018}
+}
+
+@article{garfield1955citation,
+ title={Citation indexes for science},
+ author={Garfield, Eugene},
+ journal={Science},
+ volume={122},
+ number={3159},
+ pages={108--111},
+ year={1955},
+ publisher={JSTOR}
+}
+
+@inproceedings{lopez2009grobid,
+ title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications},
+ author={Lopez, Patrice},
+ booktitle={International conference on theory and practice of digital libraries},
+ pages={473--474},
+ year={2009},
+ organization={Springer}
+}
+
+@article{garfield2007evolution,
+ title={The evolution of the science citation index},
+ author={Garfield, Eugene},
+ journal={International microbiology},
+ volume={10},
+ number={1},
+ pages={65},
+ year={2007}
+}
+
+@article{shotton2013publishing,
+ title={Publishing: open citations},
+ author={Shotton, David},
+ journal={Nature News},
+ volume={502},
+ number={7471},
+ pages={295},
+ year={2013}
+}
+
+@misc{CitEc,
+ title = {Citations in Economics},
+ howpublished = {\url{https://citec.repec.org/}},
+ note = {Accessed: 2021-07-30}
+}
+
+@inproceedings{wu2019citeseerx,
+ title={CiteSeerX: 20 years of service to scholarly big data},
+ author={Wu, Jian and Kim, Kunho and Giles, C Lee},
+ booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse},
+ pages={1--4},
+ year={2019}
+}
+
+@inproceedings{li2006citeseerx,
+ title={CiteSeerx: an architecture and web service design for an academic document search engine},
+ author={Li, Huajing and Councill, Isaac and Lee, Wang-Chien and Giles, C Lee},
+ booktitle={Proceedings of the 15th international conference on World Wide Web},
+ pages={883--884},
+ year={2006}
+}
+
+
+@inproceedings{sinha2015overview,
+ title={An overview of microsoft academic service (mas) and applications},
+ author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan},
+ booktitle={Proceedings of the 24th international conference on world wide web},
+ pages={243--246},
+ year={2015}
+}
+
+@misc{i4oc,
+ title = {Initiative for Open Citations},
+howpublished = {\url{https://i4oc.org/}},
+note = {Accessed: 2021-07-30}
+}
+
+@misc{fatcatguidereferencegraph,
+title = {The Fatcat Guide: Reference Graph (refcat)},
+howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}},
+note = {Accessed: 2021-08-08}
+}
+
+@misc{crossref,
+title = {Crossref},
+howpublished = {\url{https://crossref.org}},
+note = {Accessed: 2021-08-08}
+}
+
+@misc{doaj,
+title = {Directory of Open Access Journals},
+howpublished = {\url{https://doaj.org}},
+note = {Accessed: 2021-08-08}
+}
+
+@inproceedings{ley2002dblp,
+ title={The DBLP computer science bibliography: Evolution, research issues, perspectives},
+ author={Ley, Michael},
+ booktitle={International symposium on string processing and information retrieval},
+ pages={1--10},
+ year={2002},
+ organization={Springer}
+}
+
+
+@inproceedings{brase2009datacite,
+ title={DataCite-A global registration agency for research data},
+ author={Brase, Jan},
+ booktitle={2009 fourth international conference on cooperation and promotion of information resources in science and technology},
+ pages={257--261},
+ year={2009},
+ organization={IEEE}
+}
+
+@article{canese2013pubmed,
+ title={PubMed: the bibliographic database},
+ author={Canese, Kathi and Weis, Sarah},
+ journal={The NCBI Handbook},
+ volume={2},
+ pages={1},
+ year={2013},
+ publisher={National Center for Biotechnology Information (US)}
+}
+
+
+@article{shotton2018funders,
+ title={Funders should mandate open citations.},
+ author={Shotton, David},
+ journal={Nature},
+ volume={553},
+ number={7686},
+ pages={129--130},
+ year={2018},
+ publisher={Nature Publishing Group}
+}
+
+@article{hutchins2021tipping,
+ title={A tipping point for open citation data},
+ author={Hutchins, B Ian},
+ journal={Quantitative Science Studies},
+ pages={1--5},
+ year={2021}
+}
+
+@article{silbert1970world,
+ title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)},
+ author={Silbert, Jeffrey M},
+ journal={Criminology},
+ volume={8},
+ pages={107},
+ year={1970},
+ publisher={HeinOnline}
+}
+
+@article{peroni2020opencitations,
+ title={OpenCitations, an infrastructure organization for open scholarship},
+ author={Peroni, Silvio and Shotton, David},
+ journal={Quantitative Science Studies},
+ volume={1},
+ number={1},
+ pages={428--444},
+ year={2020},
+ publisher={MIT Press}
+}
+
+@article{fricke2018semantic,
+ title={Semantic scholar},
+ author={Fricke, Suzanne},
+ journal={Journal of the Medical Library Association: JMLA},
+ volume={106},
+ number={1},
+ pages={145},
+ year={2018},
+ publisher={Medical Library Association}
+}
+
+@inproceedings{tang2016aminer,
+ title={AMiner: Toward understanding big scholar data},
+ author={Tang, Jie},
+ booktitle={Proceedings of the ninth ACM international conference on web search and data mining},
+ pages={467--467},
+ year={2016}
+}
+
+@article{dean2010mapreduce,
+ title={MapReduce: a flexible data processing tool},
+ author={Dean, Jeffrey and Ghemawat, Sanjay},
+ journal={Communications of the ACM},
+ volume={53},
+ number={1},
+ pages={72--77},
+ year={2010},
+ publisher={ACM New York, NY, USA}
+}
+
+@article{collet2018zstandard,
+ title={Zstandard Compression and the application/zstd Media Type},
+ author={Collet, Yann and Kucherawy, Murray},
+ journal={RFC 8478},
+ year={2018}
+}
+
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty b/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty
new file mode 100644
index 0000000..d4d4764
--- /dev/null
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty
@@ -0,0 +1,136 @@
+% ---------------------------------------------------------------
+% Style file for simple, two column conference papers.
+% Based on latex8.sty by Paolo.Ienne@di.epfl.ch
+% ---------------------------------------------------------------
+% Use with LaTeX2e as:
+% \documentclass[times,10pt,twocolumn]{article}
+% \usepackage{simpleConference}
+% \usepackage{times}
+% ---------------------------------------------------------------
+% specify references as
+% \bibliographystyle{simpleConference}
+% \bibliography{...your files...}
+%
+% use Section{} and SubSection{} instead of standard section{}
+% and subsection{} to obtain headings in the form
+% "1.3. My heading"
+% ---------------------------------------------------------------
+% ten point helvetica bold required for captions
+% in some sites the name of the helvetica bold font may differ,
+% change the name here:
+\font\tenhv = phvb at 10pt
+
+% eleven point times bold required for second-order headings
+\font\elvbf = ptmb scaled 1100
+
+% set dimensions of columns, gap between columns, and paragraph indent
+\setlength{\textheight}{8.875in}
+\setlength{\textwidth}{6.875in}
+\setlength{\columnsep}{0.3125in}
+\setlength{\topmargin}{0in}
+\setlength{\headheight}{0in}
+\setlength{\headsep}{0in}
+\setlength{\parindent}{1pc}
+\setlength{\oddsidemargin}{-.304in}
+\setlength{\evensidemargin}{-.304in}
+
+% memento from size10.clo
+% \normalsize{\@setfontsize\normalsize\@xpt\@xiipt}
+% \small{\@setfontsize\small\@ixpt{11}}
+% \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}}
+% \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt}
+% \tiny{\@setfontsize\tiny\@vpt\@vipt}
+% \large{\@setfontsize\large\@xiipt{14}}
+% \Large{\@setfontsize\Large\@xivpt{18}}
+% \LARGE{\@setfontsize\LARGE\@xviipt{22}}
+% \huge{\@setfontsize\huge\@xxpt{25}}
+% \Huge{\@setfontsize\Huge\@xxvpt{30}}
+
+\def\@maketitle
+ {
+ \newpage
+ \null
+ \vskip .375in
+ \begin{center}
+ {\Large \bf \@title \par}
+ % additional two empty lines at the end of the title
+ \vspace*{24pt}
+ {
+ \large
+ \lineskip .5em
+ \begin{tabular}[t]{c}
+ \@author
+ \end{tabular}
+ \par
+ }
+ % additional small space at the end of the author name
+ \vskip .5em
+ {
+ \large
+ \begin{tabular}[t]{c}
+ \@affiliation
+ \end{tabular}
+ \par
+ \ifx \@empty \@email
+ \else
+ \begin{tabular}{r@{~}l}
+ E-mail: & {\tt \@email}
+ \end{tabular}
+ \par
+ \fi
+ }
+ % additional empty line at the end of the title block
+ \vspace*{12pt}
+ \end{center}
+ }
+
+\def\abstract
+ {%
+ \centerline{\large\bf Abstract}%
+ \vspace*{12pt}%
+% \it% %%%% iroro - commenting out italicized abstract
+ }
+
+\def\endabstract
+ {
+ % additional empty line at the end of the abstract
+ \vspace*{12pt}
+ }
+
+\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{}
+
+\def\email#1{\gdef\@email{#1}}
+\gdef\@email{}
+
+\newlength{\@ctmp}
+\newlength{\@figindent}
+\setlength{\@figindent}{1pc}
+
+\long\def\@makecaption#1#2{
+ \vskip 10pt
+ \setbox\@tempboxa\hbox{\tenhv\noindent #1.~#2}
+ \setlength{\@ctmp}{\hsize}
+ \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent}
+ % IF longer than one indented paragraph line
+ \ifdim \wd\@tempboxa >\@ctmp
+ % THEN set as an indented paragraph
+ \begin{list}{}{\leftmargin\@figindent \rightmargin\leftmargin}
+ \item[]\tenhv #1.~#2\par
+ \end{list}
+ \else
+ % ELSE center
+ \hbox to\hsize{\hfil\box\@tempboxa\hfil}
+ \fi}
+
+% correct heading spacing and type
+\def\section{\@startsection {section}{1}{\z@}
+ {14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}}
+\def\subsection{\@startsection {subsection}{2}{\z@}
+ {13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}}
+
+% add the period after section numbers
+\newcommand{\Section}[1]{\section{\hskip -1em.~#1}}
+\newcommand{\SubSection}[1]{\subsection{\hskip -1em.~#1}}
+
+% end of file latex8.sty
+% ---------------------------------------------------------------