aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-08-08 14:02:44 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-08-08 14:02:44 +0200
commit1c21e964b421e642fca4054f67ba7586bfcb683d (patch)
tree8d311c2b5234ef4e52604b62d602e9375d5734ef
parenteb057b6e2a1f74b9a74a164e13c9042332bb1244 (diff)
downloadrefcat-1c21e964b421e642fca4054f67ba7586bfcb683d.tar.gz
refcat-1c21e964b421e642fca4054f67ba7586bfcb683d.zip
wip: indent latex
-rw-r--r--docs/Simple/main.pdfbin92494 -> 92494 bytes
-rw-r--r--docs/Simple/main.tex304
2 files changed, 152 insertions, 152 deletions
diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf
index 9545257..71ab5ca 100644
--- a/docs/Simple/main.pdf
+++ b/docs/Simple/main.pdf
Binary files differ
diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex
index 2c3001d..6871020 100644
--- a/docs/Simple/main.tex
+++ b/docs/Simple/main.tex
@@ -21,17 +21,17 @@
\title{Fatcat Reference Dataset}
\author{Martin Czygan \\
-\\
-Internet Archive \\
-San Francisco, California, USA \\
-martin@archive.org \\
-\and
-Bryan Newbold \\
-\\
-Internet Archive \\
-San Francisco, California, USA \\
-bnewbold@archive.org \\
-\\
+ \\
+ Internet Archive \\
+ San Francisco, California, USA \\
+ martin@archive.org \\
+ \and
+ Bryan Newbold \\
+ \\
+ Internet Archive \\
+ San Francisco, California, USA \\
+ bnewbold@archive.org \\
+ \\
}
@@ -40,18 +40,18 @@ bnewbold@archive.org \\
\begin{abstract}
-As part of its scholarly data efforts, the Internet Archive releases a first version of a citation
-graph dataset, named \emph{refcat}, derived from scholarly publications and
-additional data sources. It is composed of data gathered by the fatcat
-cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale
-crawls targeting primary and secondary scholarly outputs, as well as metadata
-from the Open Library\footnote{\url{https://openlibrary.org}} project and
-Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
-graph consists of 1,323,423,672 citations. We release this dataset under a CC0
-Public Domain Dedication, accessible through an archive
-collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
-code used in the derivation process is released under an MIT
-license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
+ As part of its scholarly data efforts, the Internet Archive releases a first version of a citation
+ graph dataset, named \emph{refcat}, derived from scholarly publications and
+ additional data sources. It is composed of data gathered by the fatcat
+ cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale
+ crawls targeting primary and secondary scholarly outputs, as well as metadata
+ from the Open Library\footnote{\url{https://openlibrary.org}} project and
+ Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
+ graph consists of 1,323,423,672 citations. We release this dataset under a CC0
+ Public Domain Dedication, accessible through an archive
+ collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
+ code used in the derivation process is released under an MIT
+ license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
\end{abstract}
\keywords{Citation Graph, Web Archiving}
@@ -99,7 +99,7 @@ publications\footnote{\url{http://wikicite.org/statistics.html}}.
Microsoft Academic Graph\citep{sinha2015overview} is comprised of a number of
entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
-\url{https://archive.org/details/mag-2021-06-07}} the
+ \url{https://archive.org/details/mag-2021-06-07}} the
\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
bibliographic entities.
@@ -132,25 +132,25 @@ The majority of DOI based matches between \emph{refcat} and COCI overlap, as can
be seen in~Table~\ref{table:cocicmp}.
\begin{table}[]
- \begin{center}
- \begin{tabular}{ll}
-\toprule
-\bf{Set} & \bf{Count} \\
-
-\midrule
- COCI (C) & 1,094,394,688 \\
- \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
- C $\cap$ R & 1,007,539,966 \\
- C $\setminus$ R & 86,854,309 \\
- R $\setminus$ C & 295,884,246
- \end{tabular}
- \vspace*{2mm}
- \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
-\emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246
-references only in \emph{refcat-doi} come from links between datasets (GBIF,
-DOI prefix: 10.15468).}
- \label{table:cocicmp}
- \end{center}
+ \begin{center}
+ \begin{tabular}{ll}
+ \toprule
+ \bf{Set} & \bf{Count} \\
+
+ \midrule
+ COCI (C) & 1,094,394,688 \\
+ \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+ C $\cap$ R & 1,007,539,966 \\
+ C $\setminus$ R & 86,854,309 \\
+ R $\setminus$ C & 295,884,246
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
+ \emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246
+ references only in \emph{refcat-doi} come from links between datasets (GBIF,
+ DOI prefix: 10.15468).}
+ \label{table:cocicmp}
+ \end{center}
\end{table}
% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
@@ -178,32 +178,32 @@ from a set of eight field set manifestations, as listed in
Table~\ref{table:fields}.
\begin{table}[]
- \begin{center}
- \begin{tabular}{ll}
-\toprule
- \bf{Fields} & \bf{Percentage} \\
-\midrule
- \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
- \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
- \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
- \end{tabular}
- \vspace*{2mm}
- \caption{Top 8 combinations of available fields in raw reference data
- accounting for about 53\% of the total data (CN = container name, CRN =
-contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
-issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
- \label{table:fields}
-\end{center}
+ \begin{center}
+ \begin{tabular}{ll}
+ \toprule
+ \bf{Fields} & \bf{Percentage} \\
+ \midrule
+ \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
+ \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
+ \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
+ \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Top 8 combinations of available fields in raw reference data
+ accounting for about 53\% of the total data (CN = container name, CRN =
+ contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+ issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
+ \label{table:fields}
+ \end{center}
\end{table}
Overall, a map-reduce style\citep{dean2010mapreduce} approach is
followed\footnote{While the operations are similar, the processing is not
-distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows
+ distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows
for some
uniformity in the overall processing. We extract (key, document) tuples (as
TSV) from the raw JSON data and sort by key. We then group documents with the
@@ -229,8 +229,8 @@ candidate generation phase in order to improve recall, but we are strict during
verification, in order to control precision. Quality assurance for verification is
implemented through a growing list of test cases of real examples from the catalog and
their expected or desired match status\footnote{The list can be found under:
-\url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
-It is helpful to keep this test suite independent of any specific programming language.}.
+ \url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
+ It is helpful to keep this test suite independent of any specific programming language.}.
\section{Limitations and Future Work}
@@ -238,32 +238,32 @@ It is helpful to keep this test suite independent of any specific programming la
As with other datasets in this field, we expect this dataset to be iterated upon.
\begin{itemize}
- \item The fatcat catalog updates its metadata
- continuously\footnote{A changelog can currently be followed here:
- \url{https://fatcat.wiki/changelog}} and web crawls are conducted
- regularly. Current processing pipelines cover raw reference snapshot
- creation and derivation of the graph structure, which allows us to rerun
- processing based on updated data as it becomes available.
-
- \item Metadata extraction from PDFs depends on supervised machine learning
- models, which in turn depend on available training sets. With additional crawls and
- metadata available we hope to improve models used for metadata
- extraction, improving yield and reducing data extraction artifacts in
- the process.
-
- \item As of this version, a number of raw reference
- docs remain unmatched, which means that neither exact nor fuzzy matching
- can detect a link to a known entity. On the one
- hand, this can hint at missing metadata. However, parts of the data
- will contain a reference to a catalogued entity, but in a specific,
- dense and harder to recover form.
- This also includes improvements to the fuzzy matching approach.
- \end{itemize}
+ \item The fatcat catalog updates its metadata
+ continuously\footnote{A changelog can currently be followed here:
+ \url{https://fatcat.wiki/changelog}} and web crawls are conducted
+ regularly. Current processing pipelines cover raw reference snapshot
+ creation and derivation of the graph structure, which allows us to rerun
+ processing based on updated data as it becomes available.
+
+ \item Metadata extraction from PDFs depends on supervised machine learning
+ models, which in turn depend on available training sets. With additional crawls and
+ metadata available we hope to improve models used for metadata
+ extraction, improving yield and reducing data extraction artifacts in
+ the process.
+
+ \item As of this version, a number of raw reference
+ docs remain unmatched, which means that neither exact nor fuzzy matching
+ can detect a link to a known entity. On the one
+ hand, this can hint at missing metadata. However, parts of the data
+ will contain a reference to a catalogued entity, but in a specific,
+ dense and harder to recover form.
+ This also includes improvements to the fuzzy matching approach.
+\end{itemize}
\section{Acknowledgements}
This work is partially supported by a grant from the \emph{Andrew W. Mellon
-Foundation}.
+ Foundation}.
\section{Appendix A}
@@ -276,69 +276,69 @@ especially for fuzzy matching to be able to zoom in on systematic errors
more easily (see~Table~\ref{table:matches}).
\begin{table}[]
- \footnotesize
- \captionsetup{font=normalsize}
- \begin{center}
-\begin{tabular}{@{}rlll@{}}
-\toprule
-\textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule
-934932865 & crossref & exact & doi \\
-151366108 & fatcat-datacite & exact & doi \\
-65345275 & fatcat-pubmed & exact & pmid \\
-48778607 & fuzzy & strong & jaccardauthors \\
-42465250 & grobid & exact & doi \\
-29197902 & fatcat-pubmed & exact & doi \\
-19996327 & fatcat-crossref & exact & doi \\
-11996694 & fuzzy & strong & slugtitleauthormatch \\
-9157498 & fuzzy & strong & tokenizedauthors \\
-3547594 & grobid & exact & arxiv \\
-2310025 & fuzzy & exact & titleauthormatch \\
-1496515 & grobid & exact & pmid \\
-680722 & crossref & strong & jaccardauthors \\
-476331 & fuzzy & strong & versioneddoi \\
-449271 & grobid & exact & isbn \\
-230645 & fatcat-crossref & strong & jaccardauthors \\
-190578 & grobid & strong & jaccardauthors \\
-156657 & crossref & exact & isbn \\
-123681 & fatcat-pubmed & strong & jaccardauthors \\
-79328 & crossref & exact & arxiv \\
-57414 & crossref & strong & tokenizedauthors \\
-53480 & fuzzy & strong & pmiddoipair \\
-52453 & fuzzy & strong & dataciterelatedid \\
-47119 & grobid & strong & slugtitleauthormatch \\
-36774 & fuzzy & strong & arxivversion \\
-35311 & fuzzy & strong & customieeearxiv \\
-33863 & grobid & exact & pmcid \\
-23504 & crossref & strong & slugtitleauthormatch \\
-22753 & fatcat-crossref & strong & tokenizedauthors \\
-17720 & grobid & exact & titleauthormatch \\
-14656 & crossref & exact & titleauthormatch \\
-14438 & grobid & strong & tokenizedauthors \\
-7682 & fatcat-crossref & exact & arxiv \\
-5972 & fatcat-crossref & exact & isbn \\
-5525 & fatcat-pubmed & exact & arxiv \\
-4290 & fatcat-pubmed & strong & tokenizedauthors \\
-2745 & fatcat-pubmed & exact & isbn \\
-2342 & fatcat-pubmed & strong & slugtitleauthormatch \\
-2273 & fatcat-crossref & strong & slugtitleauthormatch \\
-1960 & fuzzy & exact & workid \\
-1150 & fatcat-crossref & exact & titleauthormatch \\
-1041 & fatcat-pubmed & exact & titleauthormatch \\
-895 & fuzzy & strong & figshareversion \\
-317 & fuzzy & strong & titleartifact \\
-82 & grobid & strong & titleartifact \\
-33 & crossref & strong & titleartifact \\
-5 & fuzzy & strong & custombsiundated \\
-1 & fuzzy & strong & custombsisubdoc \\
-1 & fatcat & exact & doi \\ \bottomrule
-\end{tabular}
- \vspace*{2mm}
- \caption{Table of match counts, reference provenance, match status and
-match reason. The match reason identifiers encode specific rules in the domain
-dependent verification process and are included for completeness - we do not
-include the details of each rule in this report.}
- \label{table:matches}
-\end{center}
+ \footnotesize
+ \captionsetup{font=normalsize}
+ \begin{center}
+ \begin{tabular}{@{}rlll@{}}
+ \toprule
+ \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule
+ 934932865 & crossref & exact & doi \\
+ 151366108 & fatcat-datacite & exact & doi \\
+ 65345275 & fatcat-pubmed & exact & pmid \\
+ 48778607 & fuzzy & strong & jaccardauthors \\
+ 42465250 & grobid & exact & doi \\
+ 29197902 & fatcat-pubmed & exact & doi \\
+ 19996327 & fatcat-crossref & exact & doi \\
+ 11996694 & fuzzy & strong & slugtitleauthormatch \\
+ 9157498 & fuzzy & strong & tokenizedauthors \\
+ 3547594 & grobid & exact & arxiv \\
+ 2310025 & fuzzy & exact & titleauthormatch \\
+ 1496515 & grobid & exact & pmid \\
+ 680722 & crossref & strong & jaccardauthors \\
+ 476331 & fuzzy & strong & versioneddoi \\
+ 449271 & grobid & exact & isbn \\
+ 230645 & fatcat-crossref & strong & jaccardauthors \\
+ 190578 & grobid & strong & jaccardauthors \\
+ 156657 & crossref & exact & isbn \\
+ 123681 & fatcat-pubmed & strong & jaccardauthors \\
+ 79328 & crossref & exact & arxiv \\
+ 57414 & crossref & strong & tokenizedauthors \\
+ 53480 & fuzzy & strong & pmiddoipair \\
+ 52453 & fuzzy & strong & dataciterelatedid \\
+ 47119 & grobid & strong & slugtitleauthormatch \\
+ 36774 & fuzzy & strong & arxivversion \\
+ 35311 & fuzzy & strong & customieeearxiv \\
+ 33863 & grobid & exact & pmcid \\
+ 23504 & crossref & strong & slugtitleauthormatch \\
+ 22753 & fatcat-crossref & strong & tokenizedauthors \\
+ 17720 & grobid & exact & titleauthormatch \\
+ 14656 & crossref & exact & titleauthormatch \\
+ 14438 & grobid & strong & tokenizedauthors \\
+ 7682 & fatcat-crossref & exact & arxiv \\
+ 5972 & fatcat-crossref & exact & isbn \\
+ 5525 & fatcat-pubmed & exact & arxiv \\
+ 4290 & fatcat-pubmed & strong & tokenizedauthors \\
+ 2745 & fatcat-pubmed & exact & isbn \\
+ 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\
+ 2273 & fatcat-crossref & strong & slugtitleauthormatch \\
+ 1960 & fuzzy & exact & workid \\
+ 1150 & fatcat-crossref & exact & titleauthormatch \\
+ 1041 & fatcat-pubmed & exact & titleauthormatch \\
+ 895 & fuzzy & strong & figshareversion \\
+ 317 & fuzzy & strong & titleartifact \\
+ 82 & grobid & strong & titleartifact \\
+ 33 & crossref & strong & titleartifact \\
+ 5 & fuzzy & strong & custombsiundated \\
+ 1 & fuzzy & strong & custombsisubdoc \\
+ 1 & fatcat & exact & doi \\ \bottomrule
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Table of match counts, reference provenance, match status and
+ match reason. The match reason identifiers encode specific rules in the domain
+ dependent verification process and are included for completeness - we do not
+ include the details of each rule in this report.}
+ \label{table:matches}
+ \end{center}
\end{table}
\bibliographystyle{abbrv}