aboutsummaryrefslogtreecommitdiffstats
path: root/docs/TR-20210730212057-IA-WDS-CG/main.tex
blob: a7edac33b8764dc7049f11c6d1b88da2e617d313 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
\documentclass{article}



\usepackage{arxiv}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{lipsum}		% Can be removed after putting your text content
\usepackage{graphicx}
\usepackage{natbib}
\usepackage{doi}

\title{Internet Archive Scholar Citation Graph Dataset}

\date{August 10, 2021}	% Here you can change the date presented in the paper title
%\date{} 					% Or removing it

\author{ Martin Czygan \\
	Internet Archive\\
	San Francisco, CA 94118 \\
	\texttt{martin@archive.org} \\
	%% examples of more authors
	\And
	Bryan Newbold \\
	Internet Archive\\
	San Francisco, CA 94118 \\
	\texttt{bnewbold@archive.org} \\
	% \And
	% Helge Holzmann \\
	% Internet Archive\\
	% San Francisco, CA 94118 \\
	% \texttt{helge@archive.org} \\
	% \And
	% Jefferson Bailey \\
	% Internet Archive\\
	% San Francisco, CA 94118 \\
	% \texttt{jefferson@archive.org} \\
	%% \AND
	%% Coauthor \\
	%% Affiliation \\
	%% Address \\
	%% \texttt{email} \\
	%% \And
	%% Coauthor \\
	%% Affiliation \\
	%% Address \\
	%% \texttt{email} \\
	%% \And
	%% Coauthor \\
	%% Affiliation \\
	%% Address \\
	%% \texttt{email} \\
}

% Uncomment to remove the date
%\date{}

% Uncomment to override  the `A preprint' in the header
\renewcommand{\headeright}{Technical Report}
\renewcommand{\undertitle}{Technical Report}
% \renewcommand{\shorttitle}{\textit{arXiv} Template}

%%% Add PDF metadata to help others organize their library
%%% Once the PDF is generated, you can check the metadata with
%%% $ pdfinfo template.pdf
\hypersetup{
pdftitle={Internet Archive Scholar Citation Graph Dataset},
pdfsubject={cs.DL, cs.IR},
pdfauthor={Martin Czygan, Bryan Newbold, Helge Holzmann, Jefferson Bailey},
pdfkeywords={Web Archiving, Citation Graph},
}

\begin{document}
\maketitle

\begin{abstract}
As part of its scholarly data efforts, the Internet Archive releases a citation
graph dataset derived from scholarly publications and additional data sources. It is
composed of data gathered by the \href{https://fatcat.wiki}{fatcat cataloging project} and related
web-scale crawls targeting primary and secondary scholarly outputs. In
addition, relations are worked out between scholarly publications, web pages
and their archived copies, books from the Open Library project as well as
Wikipedia articles. This first version of the graph consists of over X nodes
and over Y edges. We release this dataset under a Z open license under the
collection at \href{https://archive.org/details/TODO-citation\_graph}{https://archive.org/details/TODO-citation\_graph}, as well as all code
used for derivation under an MIT license.
\end{abstract}


% keywords can be removed
\keywords{Citation Graph \and Scholarly Communications \and Web Archiving}


\section{Introduction}

The Internet Archive releases a first version of a citation graph dataset
derived from a raw corpus of about 2.5B references gathered from metadata and
from data obtained by PDF extraction tools such as GROBID~\citep{lopez2009grobid}.
The goal of this report is to describe briefly the current contents and the
derivation of the Archive Scholar Citations Dataset (ASC). We expect
this dataset to be iterated upon, with changes both in content and processing.

Modern citation indexes can be traced back to the early computing age, when
projects like the Science Citation Index (1955)~\citep{garfield2007evolution}
were first devised, living on in existing commercial knowledge bases today.
Open alternatives, such as the Open Citations Corpus (OCC), were started in
2010---the first version of which contained 6,325,178 individual
references~\citep{shotton2013publishing}. Other notable sources from that time
include CiteSeerX~\citep{wu2019citeseerx} and CitEc~\citep{CitEc}. The last
decade has seen an increase in openly available reference datasets and
citation projects, like Microsoft Academic~\citep{sinha2015overview} and the
Initiative for Open Citations~\citep{i4oc, shotton2018funders}. In 2021,
according to \citet{hutchins2021tipping}, over 1B citations are publicly
available, marking a tipping point for open citations.



\section{Citation Graph Contents}



% * edges
% * edges exact
% * edges fuzzy
% * edges fuzzy reason (table)
% * number of source docs
% * number of target docs
% * refs to papers
% * refs to books
% * refs to web pages
% * refs to web pages that have been archived
% * refs to web pages that have been archived but not on liveweb any more
%
% Overlaps
%
% * how many edges can be found in COCI as well
% * how many edges can be found in MAG as well
% * how many unique to us edges
%
% Additional numbers
%
% * number of unparsed refs
% * "biblio" field distribution of unparted refs
%
% Potential routes
%
% * journal abbreviation parsing with suffix arrays
% * lookup by name, year and journal


\section{System Design}

The constraints for the systems design are informed by the volume and the
variety of the data. In total, the raw inputs amount to a few TB of textual
content, mostly newline delimited JSON. More importantly, while the number of
data fields is low, certain schemas are very partial with hundreds of different
combinations of available field values found in the raw reference data. This is
most likely caused by aggregators passing on reference data coming from
hundreds of sources, each of which not necessarily agreeing on a common
granularity for citation data and from artifacts of machine learning based
structured data extraction tools.

Each combination of fields may require a slightly different processing path.
For example, references with an arXiv identifier can be processed differently
from references with only a title. Over 50\% of the raw reference data comes
from a set of eight field manifestations, as listed in
Table~\ref{table:fields}.

\begin{table}[]
    \begin{center}
    \begin{tabular}{ll}
\toprule
        \bf{Fields}                                    & \bf{Share} \\
\midrule
        \multicolumn{1}{l}{CN|CRN|P|T|U|V|Y}    & 14\%                              \\
        \multicolumn{1}{l}{DOI}                 & 14\%                              \\
        \multicolumn{1}{l}{CN|CRN|IS|P|T|U|V|Y} & 5\%                               \\
        \multicolumn{1}{l}{CN|CRN|DOI|U|V|Y}    & 4\%                               \\
        \multicolumn{1}{l}{PMID|U}              & 4\%                               \\
        \multicolumn{1}{l}{CN|CRN|DOI|T|V|Y}    & 4\%                               \\
        \multicolumn{1}{l}{CN|CRN|Y}            & 4\%                               \\
        \multicolumn{1}{l}{CN|CRN|DOI|V|Y}      & 4\%                               \\
    \end{tabular}
    \vspace*{2mm}
    \caption{Top 8 combinations of available fields in raw reference data
        accounting for about 53\% of the total data (CN = container name, CRN =
contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value.}
    \label{table:fields}
\end{center}
\end{table}

Overall, a map-reduce style approach is followed, which allows for some
uniformity in the overall processing. We extract (key, document) tuples (as
TSV) from the raw JSON data and sort by key. Then we group documents with the
same key into groups and apply a function on each group in order to generate
our target schema (currently named biblioref, or bref for short) or perform
additional operations (such as deduplication).

The key derivation can be exact (via an identifier such as DOI or PMID) or
based on a normalization procedure, like a slugified title string. For
identifier based matches we can generate the target biblioref schema directly.
For fuzzy matching candidates, we pass possible match pairs through a
verification procedure, which is implemented for release entity schema pairs.
The current verification procedure is a domain dependent rule based
verification, able to identify different versions of a publication,
preprint-published pairs, or other kinds of similar documents by calculating
similarity metrics across title and authors. The fuzzy matching approach is
applied on all reference documents, which only have a title, but no identifier.

With a few schema conversions, fuzzy matching can be applied to Wikipedia
articles and Open Library (edition) records as well. The aspects of precision
and recall are represented by the two stages: we are generous in the match
candidate generation phase in order to improve recall, but we are strict during
verification, in order to control precision.

\section{Fuzzy Matching Approach}

% Take sample of 100 docs, report some precision, recall, F1 on a hand curated
% small subset.

The fuzzy matching approach currently implemented works in two phases: match
candidate generation and verification. For candidate generation, we map each
document to a key. We implemented a number of algorithms to form these
clusters, e.g. title normalizations (including lowercasing, whitespace removal,
unicode normalization and other measures) or transformations like
NYSIIS~\citep{silbert1970world}.

The verification approach is based on a set of rules, which are tested
sequentially, yielding a match signal from weak to exact. We use a suite of
over 300 manually curated match examples\footnote{The table can be found here:
\href{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}}
as part of a unit test suite to allow for a controlled, continuous adjustment
of the verification procedure. If the verification yields either an exact or a
strong signal, we consider it a match.

We try to keep the processing steps performant to keep the overall derivation
time limited. Map and reduce operations are parallelized and certain processing
steps can process 100K documents per second or even more on commodity hardware
with spinning disks.

\section{Quality Assurance}

Understanding data quality is important, as the data comes from a myriad of
sources, each with possible idiosyncratic features or missing values. We employ
a few QA measures during the process. First, we try to pass each data item
through only one processing pipeline (e.g. items matched by any identifier
should not even be considered for fuzzy matching). If duplicate links appear in
the final dataset nonetheless, we remove them, preferring exact over fuzzy matches.

We employ a couple of data cleaning techniques, e.g. to find and verify
identifiers like ISBN or to sanitize URLs found in the data. Many of these
artifacts stem from the fact that large chunks of the raw data come from
heuristic data extraction from PDF documents.


\section{Discussion}

% need to iterate

%\lipsum[2] %\lipsum[3]


% \section{Headings: first level} % \label{sec:headings}
%
% \lipsum[4] See Section \ref{sec:headings}.
%
% \subsection{Headings: second level}
% \lipsum[5]
% \begin{equation}
% 	\xi _{ij}(t)=P(x_{t}=i,x_{t+1}=j|y,v,w;\theta)= {\frac {\alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}{\sum _{i=1}^{N} \sum _{j=1}^{N} \alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}}
% \end{equation}
%
% \subsubsection{Headings: third level}
% \lipsum[6]
%
% \paragraph{Paragraph}
% \lipsum[7]
%
%
%
% \section{Examples of citations, figures, tables, references}
% \label{sec:others}
%
% \subsection{Citations}
% Citations use \verb+natbib+. The documentation may be found at
% \begin{center}
% 	\url{http://mirrors.ctan.org/macros/latex/contrib/natbib/natnotes.pdf}
% \end{center}
%
% Here is an example usage of the two main commands (\verb+citet+ and \verb+citep+): Some people thought a thing \citep{kour2014real, hadash2018estimate} but other people thought something else \citep{kour2014fast}. Many people have speculated that if we knew exactly why \citet{kour2014fast} thought this\dots
%
% \subsection{Figures}
% \lipsum[10]
% See Figure \ref{fig:fig1}. Here is how you add footnotes. \footnote{Sample of the first footnote.}
% \lipsum[11]
%
% \begin{figure}
% 	\centering
% 	\fbox{\rule[-.5cm]{4cm}{4cm} \rule[-.5cm]{4cm}{0cm}}
% 	\caption{Sample figure caption.}
% 	\label{fig:fig1}
% \end{figure}
%
% \subsection{Tables}
% See awesome Table~\ref{tab:table}.
%
% The documentation for \verb+booktabs+ (`Publication quality tables in LaTeX') is available from:
% \begin{center}
% 	\url{https://www.ctan.org/pkg/booktabs}
% \end{center}
%
%
% \begin{table}
% 	\caption{Sample table title}
% 	\centering
% 	\begin{tabular}{lll}
% 		\toprule
% 		\multicolumn{2}{c}{Part}                   \\
% 		\cmidrule(r){1-2}
% 		Name     & Description     & Size ($\mu$m) \\
% 		\midrule
% 		Dendrite & Input terminal  & $\sim$100     \\
% 		Axon     & Output terminal & $\sim$10      \\
% 		Soma     & Cell body       & up to $10^6$  \\
% 		\bottomrule
% 	\end{tabular}
% 	\label{tab:table}
% \end{table}
%
% \subsection{Lists}
% \begin{itemize}
% 	\item Lorem ipsum dolor sit amet
% 	\item consectetur adipiscing elit.
% 	\item Aliquam dignissim blandit est, in dictum tortor gravida eget. In ac rutrum magna.
% \end{itemize}


\bibliographystyle{unsrtnat}
\bibliography{references}  %%% Uncomment this line and comment out the ``thebibliography'' section below to use the external .bib file (using bibtex) .


%%% Uncomment this section and comment out the \bibliography{references} line above to use inline references.
% \begin{thebibliography}{1}

% 	\bibitem{kour2014real}
% 	George Kour and Raid Saabne.
% 	\newblock Real-time segmentation of on-line handwritten arabic script.
% 	\newblock In {\em Frontiers in Handwriting Recognition (ICFHR), 2014 14th
% 			International Conference on}, pages 417--422. IEEE, 2014.

% 	\bibitem{kour2014fast}
% 	George Kour and Raid Saabne.
% 	\newblock Fast classification of handwritten on-line arabic characters.
% 	\newblock In {\em Soft Computing and Pattern Recognition (SoCPaR), 2014 6th
% 			International Conference of}, pages 312--318. IEEE, 2014.

% 	\bibitem{hadash2018estimate}
% 	Guy Hadash, Einat Kermany, Boaz Carmeli, Ofer Lavi, George Kour, and Alon
% 	Jacovi.
% 	\newblock Estimate and replace: A novel approach to integrating deep neural
% 	networks with existing applications.
% 	\newblock {\em arXiv preprint arXiv:1804.09028}, 2018.

% \end{thebibliography}

\section{Appendix}

% Please add the following required packages to your document preamble:
\begin{table}[]
    \begin{center}
\begin{tabular}{@{}rlll@{}}
\toprule
\textbf{Number of matches} & \textbf{Citation Provenance} & \textbf{Match Status} & \textbf{Match Reason} \\ \midrule
934932865                  & crossref                  & exact                 & doi                   \\
151366108                  & fatcat-datacite           & exact                 & doi                   \\
65345275                   & fatcat-pubmed             & exact                 & pmid                  \\
48778607                   & fuzzy                     & strong                & jaccardauthors        \\
42465250                   & grobid                    & exact                 & doi                   \\
29197902                   & fatcat-pubmed             & exact                 & doi                   \\
19996327                   & fatcat-crossref           & exact                 & doi                   \\
11996694                   & fuzzy                     & strong                & slugtitleauthormatch  \\
9157498                    & fuzzy                     & strong                & tokenizedauthors      \\
3547594                    & grobid                    & exact                 & arxiv                 \\
2310025                    & fuzzy                     & exact                 & titleauthormatch      \\
1496515                    & grobid                    & exact                 & pmid                  \\
680722                     & crossref                  & strong                & jaccardauthors        \\
476331                     & fuzzy                     & strong                & versioneddoi          \\
449271                     & grobid                    & exact                 & isbn                  \\
230645                     & fatcat-crossref           & strong                & jaccardauthors        \\
190578                     & grobid                    & strong                & jaccardauthors        \\
156657                     & crossref                  & exact                 & isbn                  \\
123681                     & fatcat-pubmed             & strong                & jaccardauthors        \\
79328                      & crossref                  & exact                 & arxiv                 \\
57414                      & crossref                  & strong                & tokenizedauthors      \\
53480                      & fuzzy                     & strong                & pmiddoipair           \\
52453                      & fuzzy                     & strong                & dataciterelatedid     \\
47119                      & grobid                    & strong                & slugtitleauthormatch  \\
36774                      & fuzzy                     & strong                & arxivversion          \\
35311                      & fuzzy                     & strong                & customieeearxiv       \\
33863                      & grobid                    & exact                 & pmcid                 \\
23504                      & crossref                  & strong                & slugtitleauthormatch  \\
22753                      & fatcat-crossref           & strong                & tokenizedauthors      \\
17720                      & grobid                    & exact                 & titleauthormatch      \\
14656                      & crossref                  & exact                 & titleauthormatch      \\
14438                      & grobid                    & strong                & tokenizedauthors      \\
7682                       & fatcat-crossref           & exact                 & arxiv                 \\
5972                       & fatcat-crossref           & exact                 & isbn                  \\
5525                       & fatcat-pubmed             & exact                 & arxiv                 \\
4290                       & fatcat-pubmed             & strong                & tokenizedauthors      \\
2745                       & fatcat-pubmed             & exact                 & isbn                  \\
2342                       & fatcat-pubmed             & strong                & slugtitleauthormatch  \\
2273                       & fatcat-crossref           & strong                & slugtitleauthormatch  \\
1960                       & fuzzy                     & exact                 & workid                \\
1150                       & fatcat-crossref           & exact                 & titleauthormatch      \\
1041                       & fatcat-pubmed             & exact                 & titleauthormatch      \\
895                        & fuzzy                     & strong                & figshareversion       \\
317                        & fuzzy                     & strong                & titleartifact         \\
82                         & grobid                    & strong                & titleartifact         \\
33                         & crossref                  & strong                & titleartifact         \\
5                          & fuzzy                     & strong                & custombsiundated      \\
1                          & fuzzy                     & strong                & custombsisubdoc       \\
1                          & fatcat                    & exact                 & doi                   \\ \bottomrule
\end{tabular}
    \vspace*{2mm}
	\caption{Table of match counts, reference provenance, match status and
match reason. The match reason identifiers encode a specific rule in the
domain-dependent verification process and are included for completeness---we
do not include the details of each rule in this report.}
    \label{table:matchcounts}
\end{center}
\end{table}


\end{document}