From 9422fc5314ce6d1ba64cf32b0b88c9b76e96a0bd Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 18 Aug 2021 17:48:00 -0700 Subject: report: add some potential refs to .bib file --- docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib | 44 +++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib index 51a2f58..bcb09ec 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib @@ -226,3 +226,47 @@ year={2018} } +@book{ortega2014academic, + title={Academic search engines: A quantitative outlook}, + author={Ortega, Jos{\'e} Luis}, + year={2014}, + publisher={Elsevier} +} + +@article{fedoryszak2014efficient, + title={Efficient blocking method for a large scale citation matching}, + author={Fedoryszak, Mateusz and Bolikowski, {\L}ukasz}, + journal={D-Lib Magazine}, + volume={20}, + number={11/12}, + year={2014}, + publisher={Corporation for National Research Initiatives} +} + +@inproceedings{fedoryszak2013large, + title={Large scale citation matching using Apache Hadoop}, + author={Fedoryszak, Mateusz and Tkaczyk, Dominika and Bolikowski, {\L}ukasz}, + booktitle={International Conference on Theory and Practice of Digital Libraries}, + pages={362--365}, + year={2013}, + organization={Springer} +} + +@article{hendricks2020crossref, + title={Crossref: The sustainable source of community-owned scholarly metadata}, + author={Hendricks, Ginny and Tkaczyk, Dominika and Lin, Jennifer and Feeney, Patricia}, + journal={Quantitative Science Studies}, + volume={1}, + number={1}, + pages={414--427}, + year={2020}, + publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} +} + +@inproceedings{tkaczyk2018machine, + title={Machine learning vs. rules and out-of-the-box vs. 
retrained: An evaluation of open-source bibliographic reference and citation parsers}, + author={Tkaczyk, Dominika and Collins, Andrew and Sheridan, Paraic and Beel, Joeran}, + booktitle={Proceedings of the 18th ACM/IEEE on joint conference on digital libraries}, + pages={99--108}, + year={2018} +} -- cgit v1.2.3 From 8fb454839903ffd539438df90a79904322e25da5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 18 Aug 2021 17:49:04 -0700 Subject: report: title; fix field name; capitalization --- docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index 76f1456..21917fd 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -18,7 +18,7 @@ \begin{document} -\title{Fatcat Reference Dataset} +\title{REFCAT: The Fatcat Citation Graph} \author{Martin Czygan \\ \\ @@ -117,7 +117,7 @@ citations is not expected to shrink in the future. We release the first version of the \emph{refcat} dataset in an format used internally for storage and to serve queries (and which we call \emph{biblioref} or \emph{bref} for short). The dataset includes metadata from fatcat, the -Open Library Project and inbound links from the English Wikipedia. The fatcat +Open Library project and inbound links from the English Wikipedia. The fatcat project itself aggregates data from variety of open data sources, such as Crossref\citep{crossref}, PubMed\citep{canese2013pubmed}, DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others, @@ -196,7 +196,7 @@ Table~\ref{table:fields}. 
\toprule \bf{Fields} & \bf{Percentage} \\ \midrule - \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\ @@ -225,7 +225,7 @@ our target schema or perform additional operations such as deduplication or fusion of matched and unmatched references. The key derivation can be exact (via an identifier like DOI, PMID, etc) or -based on a value normalization, like slugifying a title string. For identifier +based on a value normalization, like ``slugifying'' a title string. For identifier based matches we can generate the target schema directly. For fuzzy matching candidates, we pass possible match pairs through a verification procedure, which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a -- cgit v1.2.3 From 90b90d45ce2ef0318d563cb4423f91de50f1a172 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 18 Aug 2021 17:51:16 -0700 Subject: report: turn a bunch of citations to footnote URLs --- docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 16 +++++++------- docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib | 30 --------------------------- 2 files changed, 8 insertions(+), 38 deletions(-) diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index 21917fd..a5536d8 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -75,12 +75,12 @@ were first devised, living on in existing commercial knowledge bases today. 
Open alternatives were started such as the Open Citations Corpus (OCC) in 2010 - the first version of which contained 6,325,178 individual references\citep{shotton2013publishing}. Other notable early projects -include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last +include CiteSeerX\citep{wu2019citeseerx} and CitEc\footnote{\url{https://citec.repec.org}}. The last decade has seen the emergence of more openly available, large scale citation projects like Microsoft Academic\citep{sinha2015overview} or the -Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021, -according to \citep{hutchins2021tipping} over 1B citations are publicly -available, marking a tipping point for this category of data. +Initiative for Open Citations\footnote{\url{https://i4oc.org}}\citep{shotton2018funders}. +In 2021, over one billion citations are publicly available, marking a ``tipping point'' +for this category of data\citep{hutchins2021tipping}. \section{Related Work} @@ -119,14 +119,14 @@ internally for storage and to serve queries (and which we call \emph{biblioref} or \emph{bref} for short). The dataset includes metadata from fatcat, the Open Library project and inbound links from the English Wikipedia. The fatcat project itself aggregates data from variety of open data sources, such as -Crossref\citep{crossref}, PubMed\citep{canese2013pubmed}, -DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others, +Crossref\footnote{\url{https://crossref.org}}, PubMed\footnote{\url{https://pubmed.ncbi.nlm.nih.gov/}}, +DataCite\footnote{\url{https://datacite.org}}, Directory of Open Access Journals (DOAJ)\footnote{\url{https://doaj.org}}, dblp\citep{ley2002dblp} and others, as well as metadata generated from analysis of data preserved at the Internet Archive and active crawls of publication sites on the web. 
The dataset is integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users -to explore inbound and outbound references\cite{fatcatguidereferencegraph}. +to explore inbound and outbound references\footnote{\url{https://guide.fatcat.wiki/reference_graph.html}}. The format records source and target (fatcat release and work) identifiers, a few attributes from the metadata (such as year or release stage) as well as @@ -196,7 +196,7 @@ Table~\ref{table:fields}. \toprule \bf{Fields} & \bf{Percentage} \\ \midrule - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib index bcb09ec..33b5360 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib @@ -64,12 +64,6 @@ year={2013} } -@misc{CitEc, - title = {Citations in Economics}, - howpublished = {\url{https://citec.repec.org/}}, - note = {Accessed: 2021-07-30} -} - @inproceedings{wu2019citeseerx, title={CiteSeerX: 20 years of service to scholarly big data}, author={Wu, Jian and Kim, Kunho and Giles, C Lee}, @@ -95,30 +89,6 @@ year={2015} } -@misc{i4oc, - title = {Initiative for Open Citations}, - howpublished = {\url{https://i4oc.org/}}, - note = {Accessed: 2021-07-30} -} - -@misc{fatcatguidereferencegraph, - title = {The Fatcat Guide: Reference Graph (refcat)}, - howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}}, - note = {Accessed: 2021-08-08} -} - -@misc{crossref, - title = {Crossref}, - howpublished = {\url{https://crossref.org}}, - note = 
{Accessed: 2021-08-08} -} - -@misc{doaj, - title = {Directory of Open Access Journals}, - howpublished = {\url{https://doaj.org}}, - note = {Accessed: 2021-08-08} -} - @inproceedings{ley2002dblp, title={The DBLP computer science bibliography: Evolution, research issues, perspectives}, author={Ley, Michael}, -- cgit v1.2.3 From cb6d5f3b17a201d57b26bea11b2728300c370c2c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 18 Aug 2021 17:51:38 -0700 Subject: report: bib reformatted --- docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib | 60 +++++++++++++-------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib index 33b5360..4dd7f6a 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib @@ -197,46 +197,46 @@ } @book{ortega2014academic, - title={Academic search engines: A quantitative outlook}, - author={Ortega, Jos{\'e} Luis}, - year={2014}, - publisher={Elsevier} + title={Academic search engines: A quantitative outlook}, + author={Ortega, Jos{\'e} Luis}, + year={2014}, + publisher={Elsevier} } @article{fedoryszak2014efficient, - title={Efficient blocking method for a large scale citation matching}, - author={Fedoryszak, Mateusz and Bolikowski, {\L}ukasz}, - journal={D-Lib Magazine}, - volume={20}, - number={11/12}, - year={2014}, - publisher={Corporation for National Research Initiatives} + title={Efficient blocking method for a large scale citation matching}, + author={Fedoryszak, Mateusz and Bolikowski, {\L}ukasz}, + journal={D-Lib Magazine}, + volume={20}, + number={11/12}, + year={2014}, + publisher={Corporation for National Research Initiatives} } @inproceedings{fedoryszak2013large, - title={Large scale citation matching using Apache Hadoop}, - author={Fedoryszak, Mateusz and Tkaczyk, Dominika and Bolikowski, {\L}ukasz}, - booktitle={International Conference on Theory and Practice of Digital 
Libraries}, - pages={362--365}, - year={2013}, - organization={Springer} + title={Large scale citation matching using Apache Hadoop}, + author={Fedoryszak, Mateusz and Tkaczyk, Dominika and Bolikowski, {\L}ukasz}, + booktitle={International Conference on Theory and Practice of Digital Libraries}, + pages={362--365}, + year={2013}, + organization={Springer} } @article{hendricks2020crossref, - title={Crossref: The sustainable source of community-owned scholarly metadata}, - author={Hendricks, Ginny and Tkaczyk, Dominika and Lin, Jennifer and Feeney, Patricia}, - journal={Quantitative Science Studies}, - volume={1}, - number={1}, - pages={414--427}, - year={2020}, - publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} + title={Crossref: The sustainable source of community-owned scholarly metadata}, + author={Hendricks, Ginny and Tkaczyk, Dominika and Lin, Jennifer and Feeney, Patricia}, + journal={Quantitative Science Studies}, + volume={1}, + number={1}, + pages={414--427}, + year={2020}, + publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} } @inproceedings{tkaczyk2018machine, - title={Machine learning vs. rules and out-of-the-box vs. retrained: An evaluation of open-source bibliographic reference and citation parsers}, - author={Tkaczyk, Dominika and Collins, Andrew and Sheridan, Paraic and Beel, Joeran}, - booktitle={Proceedings of the 18th ACM/IEEE on joint conference on digital libraries}, - pages={99--108}, - year={2018} + title={Machine learning vs. rules and out-of-the-box vs. retrained: An evaluation of open-source bibliographic reference and citation parsers}, + author={Tkaczyk, Dominika and Collins, Andrew and Sheridan, Paraic and Beel, Joeran}, + booktitle={Proceedings of the 18th ACM/IEEE on joint conference on digital libraries}, + pages={99--108}, + year={2018} } -- cgit v1.2.3