From 171678d962a49d5ae05e586702d09ccac3f08525 Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Mon, 6 Sep 2021 18:40:54 +0200
Subject: docs: related work

---
 docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 96426 -> 97899 bytes
 docs/TR-20210808100000-IA-WDS-REFCAT/main.tex |  86 +++++++++++++++++---------
 docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib |  76 +++++++++++++++++++++++
 3 files changed, 133 insertions(+), 29 deletions(-)

(limited to 'docs')

diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
index 9fafcc0..6ff65d0 100644
Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index ab72699..2a60a77 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -87,9 +87,9 @@ Initiative for Open Citations\footnote{\url{https://i4oc.org}}~\citep{shotton201
 In 2021, over one billion citations are publicly available, marking a ``tipping point''
 for this category of data~\citep{hutchins2021tipping}.
 
-While a paper will mainly cite other papers, more citable entities exist such
-as books and web links and within links a variety of targets, such as web
-sites, reference entries, protocols or datasets. References can be extracted
+While a paper will often cite other papers, more citable entities exist such
+as books or web links and within links a variety of targets, such as web
+pages, reference entries, protocols or datasets. References can be extracted
 manually or through more automated methods, such as metadata access and
 structured data extraction from full text documents; the latter offering the
 benefits of scalability. The completeness of bibliographic metadata ranges from
@@ -98,32 +98,60 @@ strings partially describing a publication.
 
 \section{Related Work}
 
-There are a few large scale citation dataset available today. COCI, the
-``OpenCitations Index of Crossref open DOI-to-DOI citations'' was first
-released 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
-2021-07-29, it contains
-1,094,394,688 citations across 65,835,422 bibliographic
-resources~\citep{peroni2020opencitations}.
-
-The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
-``a Wikimedia initiative to develop open citations and linked bibliographic
-data to serve free knowledge'' continously adds citations to its database and
-as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
-publications\footnote{\url{http://wikicite.org/statistics.html}}.
-
-Microsoft Academic Graph~\citep{sinha2015overview} is comprised of a number of
-entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
-with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
-	\url{https://archive.org/details/mag-2021-06-07}}  the
-\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
-bibliographic entities.
-
-Numerous other projects have been or are concerned with various aspects of
-citation discovery and curation as part their feature set, among them Semantic
-Scholar~\citep{fricke2018semantic}, CiteSeerX~\citep{li2006citeseerx} or Aminer~\citep{tang2016aminer}.
-
-As mentioned in~\citep{hutchins2021tipping}, the number of openly available
-citations is not expected to shrink in the future.
+Typical problems arising in the process of compiling a citation graph dataset
+are data acquisition and citation matching. Data acquisition itself can take
+different forms: bibliographic metadata can contain explicit reference data as
+provided by publishers and aggregators; this data can be relatively consistent
+when looked at per source, but may vary in style and comprehensiveness when
+looked at as a whole. Another way of acquiring bibliographic metadata is to
+analyze a source document, such as a PDF (or its text), directly. Tools in this
+category are often based on conditional random
+fields~\citep{lafferty2001conditional} and have been implemented in projects
+such as ParsCit~\citep{councill2008parscit},
+Cermine~\citep{tkaczyk2014cermine}, EXCITE~\citep{hosseini2019excite}
+or GROBID~\citep{lopez2009grobid}.
+
+The problem of citation matching is relatively simple when common, persistent
+identifiers are present in the data. Complications mount when there is
+\emph{Identity Uncertainty}, that is, when ``objects are not labeled with unique
+identifiers or when those identifiers may not be perceived
+perfectly''~\citep{pasula2003identity}. CiteSeer has been an early project
+concerned with citation matching~\citep{giles1998citeseer}. A taxonomy of
+potential issues common in the matching process has been compiled
+by~\citet{olensky2016evaluation}. Additional care is required when the
+citation matching process is done at scale~\citep{fedoryszak2013large}. The
+problem of heterogeneity has been discussed in the context of datasets
+by~\citet{mathiak2015challenges}.
+
+
+
+
+% There are a few large scale citation datasets available today. COCI, the
+% ``OpenCitations Index of Crossref open DOI-to-DOI citations'' was first
+% released 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
+% 2021-07-29, it contains
+% 1,094,394,688 citations across 65,835,422 bibliographic
+% resources~\citep{peroni2020opencitations}.
+%
+% The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
+% ``a Wikimedia initiative to develop open citations and linked bibliographic
+% data to serve free knowledge'' continuously adds citations to its database and
+% as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
+% publications\footnote{\url{http://wikicite.org/statistics.html}}.
+%
+% Microsoft Academic Graph~\citep{sinha2015overview} is comprised of a number of
+% entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
+% with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
+% 	\url{https://archive.org/details/mag-2021-06-07}}  the
+% \emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
+% bibliographic entities.
+%
+% Numerous other projects have been or are concerned with various aspects of
+% citation discovery and curation as part of their feature set, among them Semantic
+% Scholar~\citep{fricke2018semantic}, CiteSeerX~\citep{li2006citeseerx} or Aminer~\citep{tang2016aminer}.
+%
+% As mentioned in~\citep{hutchins2021tipping}, the number of openly available
+% citations is not expected to shrink in the future.
 
 
 \section{Dataset}
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib
index f927ea4..9cfb32b 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib
@@ -271,3 +271,79 @@
 	year={2014},
 	month={May}
 }
+
+@inproceedings{councill2008parscit,
+	title={{ParsCit}: An Open-Source {CRF} Reference String Parsing Package},
+	author={Councill, Isaac G. and Giles, C. Lee and Kan, Min-Yen},
+	booktitle={Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC} 2008)},
+	volume={8},
+	pages={661--667},
+	year={2008}
+}
+
+@inproceedings{lafferty2001conditional,
+	title={Conditional random fields: Probabilistic models for segmenting and labeling sequence data},
+	author={Lafferty, John and McCallum, Andrew and Pereira, Fernando C. N.},
+	booktitle={Proceedings of the Eighteenth International Conference on Machine Learning ({ICML} 2001)},
+	year={2001}
+}
+
+@inproceedings{tkaczyk2014cermine,
+	title={{CERMINE} -- Automatic Extraction of Metadata and References from Scientific Literature},
+	author={Tkaczyk, Dominika and Szostek, Pawe{\l} and Dendek, Piotr Jan and Fedoryszak, Mateusz and Bolikowski, {\L}ukasz},
+	booktitle={2014 11th {IAPR} International Workshop on Document Analysis Systems},
+	pages={217--221},
+	year={2014},
+	publisher={IEEE}
+}
+
+
+
+@inproceedings{hosseini2019excite,
+	title={{EXCITE} -- A Toolchain to Extract, Match and Publish Open Literature References},
+	author={Hosseini, Azam and Ghavimi, Behnam and Boukhers, Zeyd and Mayr, Philipp},
+	booktitle={2019 {ACM/IEEE} Joint Conference on Digital Libraries ({JCDL})},
+	pages={432--433},
+	year={2019},
+	publisher={IEEE}
+}
+
+
+@inproceedings{pasula2003identity,
+	title={Identity Uncertainty and Citation Matching},
+	author={Pasula, Hanna and Marthi, Bhaskara and Milch, Brian and Russell, Stuart J. and Shpitser, Ilya},
+	booktitle={Advances in Neural Information Processing Systems},
+	pages={1425--1432},
+	year={2003}
+}
+
+@article{olensky2016evaluation,
+	title={Evaluation of the Citation Matching Algorithms of {CWTS} and {iFQ} in Comparison to the {Web of Science}},
+	author={Olensky, Marlies and Schmidt, Marion and van Eck, Nees Jan},
+	journal={Journal of the Association for Information Science and Technology},
+	volume={67},
+	number={10},
+	pages={2550--2564},
+	year={2016},
+	publisher={Wiley Online Library}
+}
+
+@article{mathiak2015challenges,
+	author    = {Mathiak, Brigitte and Boland, Katarina},
+	title     = {Challenges in matching dataset citation strings to datasets in social science},
+	journal   = {D-Lib Magazine},
+	publisher = {Corporation for National Research Initiatives},
+	year      = {2015},
+	volume    = {21},
+	number    = {1/2},
+	pages     = {23--28},
+}
+
+@inproceedings{giles1998citeseer,
+	title={{CiteSeer}: An Automatic Citation Indexing System},
+	author={Giles, C. Lee and Bollacker, Kurt D. and Lawrence, Steve},
+	booktitle={Proceedings of the Third {ACM} Conference on Digital Libraries},
+	pages={89--98},
+	year={1998}
+}
+
-- 
cgit v1.2.3