From 171678d962a49d5ae05e586702d09ccac3f08525 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 6 Sep 2021 18:40:54 +0200 Subject: docs: related work --- docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 96426 -> 97899 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 86 +++++++++++++++++--------- docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib | 76 +++++++++++++++++++++++ 3 files changed, 133 insertions(+), 29 deletions(-) (limited to 'docs') diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf index 9fafcc0..6ff65d0 100644 Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index ab72699..2a60a77 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -87,9 +87,9 @@ Initiative for Open Citations\footnote{\url{https://i4oc.org}}~\citep{shotton201 In 2021, over one billion citations are publicly available, marking a ``tipping point'' for this category of data~\citep{hutchins2021tipping}. -While a paper will mainly cite other papers, more citable entities exist such -as books and web links and within links a variety of targets, such as web -sites, reference entries, protocols or datasets. References can be extracted +While a paper will often cite other papers, more citable entities exist such +as books or web links and within links a variety of targets, such as web +pages, reference entries, protocols or datasets. References can be extracted manually or through more automated methods, such as metadata access and structured data extraction from full text documents; the latter offering the benefits of scalability. The completeness of bibliographic metadata ranges from @@ -98,32 +98,60 @@ strings partially describing a publication. 
\section{Related Work} -There are a few large scale citation dataset available today. COCI, the -``OpenCitations Index of Crossref open DOI-to-DOI citations'' was first -released 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on -2021-07-29, it contains -1,094,394,688 citations across 65,835,422 bibliographic -resources~\citep{peroni2020opencitations}. - -The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project, -``a Wikimedia initiative to develop open citations and linked bibliographic -data to serve free knowledge'' continously adds citations to its database and -as of 2021-06-28 tracks 253,719,394 citations across 39,994,937 -publications\footnote{\url{http://wikicite.org/statistics.html}}. - -Microsoft Academic Graph~\citep{sinha2015overview} is comprised of a number of -entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}} -with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at - \url{https://archive.org/details/mag-2021-06-07}} the -\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466 -bibliographic entities. - -Numerous other projects have been or are concerned with various aspects of -citation discovery and curation as part their feature set, among them Semantic -Scholar~\citep{fricke2018semantic}, CiteSeerX~\citep{li2006citeseerx} or Aminer~\citep{tang2016aminer}. - -As mentioned in~\citep{hutchins2021tipping}, the number of openly available -citations is not expected to shrink in the future. +Typical problems arising in the process of compiling a citation graph dataset +are data acquisition and citation matching. 
Data acquisition itself can take +different forms: bibliographic metadata can contain explicit reference data as +provided by publishers and aggregators; this data can be relatively consistent +when looked at per source, but may vary in style and comprehensiveness when +looked at as a whole. Another way of acquiring bibliographic metadata is to +analyze a source document, such as a PDF (or its text), directly. Tools in this +category are often based on conditional random +fields~\citep{lafferty2001conditional} and have been implemented in projects +such as ParsCit~\citep{councill2008parscit}, +Cermine~\citep{tkaczyk2014cermine}, EXCITE~\citep{hosseini2019excite} +or GROBID~\citep{lopez2009grobid}. + +The problem of citation matching is relatively simple when common, persistent +identifiers are present in the data. Complications mount when there is +\emph{Identity Uncertainty}, that is ``objects are not labeled with unique +identifiers or when those identifiers may not be perceived +perfectly''~\citep{pasula2003identity}. CiteSeer has been an early project +concerned with citation matching~\citep{giles1998citeseer}. A taxonomy of +potential issues common in the matching process has been compiled +by~\citep{olensky2016evaluation}. Additional care is required when the +citation matching process is done at scale~\citep{fedoryszak2013large}. The +problem of heterogeneity has been discussed in the context of datasets +by~\citep{mathiak2015challenges}. + + + + +% There are a few large scale citation datasets available today. COCI, the +% ``OpenCitations Index of Crossref open DOI-to-DOI citations'' was first +% released 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on +% 2021-07-29, it contains +% 1,094,394,688 citations across 65,835,422 bibliographic +% resources~\citep{peroni2020opencitations}. 
+% +% The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project, +% ``a Wikimedia initiative to develop open citations and linked bibliographic +% data to serve free knowledge'' continuously adds citations to its database and +% as of 2021-06-28 tracks 253,719,394 citations across 39,994,937 +% publications\footnote{\url{http://wikicite.org/statistics.html}}. +% +% Microsoft Academic Graph~\citep{sinha2015overview} is comprised of a number of +% entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}} +% with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at +% \url{https://archive.org/details/mag-2021-06-07}} the +% \emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466 +% bibliographic entities. +% +% Numerous other projects have been or are concerned with various aspects of +% citation discovery and curation as part of their feature set, among them Semantic +% Scholar~\citep{fricke2018semantic}, CiteSeerX~\citep{li2006citeseerx} or Aminer~\citep{tang2016aminer}. +% +% As mentioned in~\citep{hutchins2021tipping}, the number of openly available +% citations is not expected to shrink in the future. 
\section{Dataset} diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib index f927ea4..9cfb32b 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib @@ -271,3 +271,79 @@ year={2014}, month={May} } + +@inproceedings{councill2008parscit, + title={ParsCit: an Open-source CRF Reference String Parsing Package.}, + author={Councill, Isaac G and Giles, C Lee and Kan, Min-Yen}, + booktitle={LREC}, + volume={8}, + pages={661--667}, + year={2008} +} + +@article{lafferty2001conditional, + title={Conditional random fields: Probabilistic models for segmenting and labeling sequence data}, + author={Lafferty, John and McCallum, Andrew and Pereira, Fernando CN}, + year={2001} +} + + +@inproceedings{tkaczyk2014cermine, + title={Cermine--automatic extraction of metadata and references from scientific literature}, + author={Tkaczyk, Dominika and Szostek, Pawel and Dendek, Piotr Jan and Fedoryszak, Mateusz and Bolikowski, Lukasz}, + booktitle={2014 11th IAPR International Workshop on Document Analysis Systems}, + pages={217--221}, + year={2014}, + organization={IEEE} +} + + + +@inproceedings{hosseini2019excite, + title={EXCITE--A toolchain to extract, match and publish open literature references}, + author={Hosseini, Azam and Ghavimi, Behnam and Boukhers, Zeyd and Mayr, Philipp}, + booktitle={2019 ACM/IEEE Joint Conference on Digital Libraries (JCDL)}, + pages={432--433}, + year={2019}, + organization={IEEE} +} + + +@inproceedings{pasula2003identity, + title={Identity uncertainty and citation matching}, + author={Pasula, Hanna and Marthi, Bhaskara and Milch, Brian and Russell, Stuart J and Shpitser, Ilya}, + booktitle={Advances in neural information processing systems}, + pages={1425--1432}, + year={2003} +} + +@article{olensky2016evaluation, + title={Evaluation of the citation matching algorithms of {CWTS} and {iFQ} in comparison to the {Web of Science}}, + author={Olensky, 
Marlies and Schmidt, Marion and van Eck, Nees Jan}, + journal={Journal of the Association for Information Science and Technology}, + volume={67}, + number={10}, + pages={2550--2564}, + year={2016}, + publisher={Wiley Online Library} +} + +@article{mathiak2015challenges, + title={Challenges in matching dataset citation strings to datasets in social science}, + author={Mathiak, Brigitte and Boland, Katarina}, + journal={D-Lib Magazine}, + volume={21}, + number={1/2}, + pages={23--28}, + year={2015}, + publisher={Corporation for National Research Initiatives} +} + +@inproceedings{giles1998citeseer, + title={CiteSeer: An automatic citation indexing system}, + author={Giles, C Lee and Bollacker, Kurt D and Lawrence, Steve}, + booktitle={Proceedings of the third ACM conference on Digital libraries}, + pages={89--98}, + year={1998} +} + -- cgit v1.2.3