wip: indent latex

author: Martin Czygan <martin.czygan@gmail.com> 2021-08-08 14:02:44 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-08-08 14:02:44 +0200
commit: 1c21e964b421e642fca4054f67ba7586bfcb683d (patch)
tree: 8d311c2b5234ef4e52604b62d602e9375d5734ef
parent: eb057b6e2a1f74b9a74a164e13c9042332bb1244 (diff)
download: refcat-1c21e964b421e642fca4054f67ba7586bfcb683d.tar.gz
refcat-1c21e964b421e642fca4054f67ba7586bfcb683d.zip
2 files changed, 152 insertions, 152 deletions
diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf
index 9545257..71ab5ca 100644
--- a/docs/Simple/main.pdf
+++ b/docs/Simple/main.pdf
diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex
index 2c3001d..6871020 100644
--- a/docs/Simple/main.tex
+++ b/docs/Simple/main.tex
@@ -21,17 +21,17 @@
 \title{Fatcat Reference Dataset}
 
 \author{Martin Czygan \\
-\\
-Internet Archive \\
-San Francisco, California, USA \\
-martin@archive.org  \\
-\and
-Bryan Newbold \\
-\\
-Internet Archive \\
-San Francisco, California, USA \\
-bnewbold@archive.org  \\
-\\
+	\\
+	Internet Archive \\
+	San Francisco, California, USA \\
+	martin@archive.org  \\
+	\and
+	Bryan Newbold \\
+	\\
+	Internet Archive \\
+	San Francisco, California, USA \\
+	bnewbold@archive.org  \\
+	\\
 }
 
 
@@ -40,18 +40,18 @@ bnewbold@archive.org  \\
 
 
 \begin{abstract}
-As part of its scholarly data efforts, the Internet Archive releases a first version of a citation
-graph dataset, named \emph{refcat}, derived from scholarly publications and
-additional data sources. It is composed of data gathered by the fatcat
-cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale
-crawls targeting primary and secondary scholarly outputs, as well as metadata
-from the Open Library\footnote{\url{https://openlibrary.org}} project and
-Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
-graph consists of 1,323,423,672 citations. We release this dataset under a CC0
-Public Domain Dedication, accessible through an archive
-collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
-code used in the derivation process is released under an MIT
-license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
+	As part of its scholarly data efforts, the Internet Archive releases a first version of a citation
+	graph dataset, named \emph{refcat}, derived from scholarly publications and
+	additional data sources. It is composed of data gathered by the fatcat
+	cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale
+	crawls targeting primary and secondary scholarly outputs, as well as metadata
+	from the Open Library\footnote{\url{https://openlibrary.org}} project and
+	Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
+	graph consists of 1,323,423,672 citations. We release this dataset under a CC0
+	Public Domain Dedication, accessible through an archive
+	collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
+	code used in the derivation process is released under an MIT
+	license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
 \end{abstract}
 
 \keywords{Citation Graph, Web Archiving}
@@ -99,7 +99,7 @@ publications\footnote{\url{http://wikicite.org/statistics.html}}.
 Microsoft Academic Graph\citep{sinha2015overview} is comprised of a number of
 entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
 with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
-\url{https://archive.org/details/mag-2021-06-07}}  the
+	\url{https://archive.org/details/mag-2021-06-07}}  the
 \emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
 bibliographic entities.
 
@@ -132,25 +132,25 @@ The majority of DOI based matches between \emph{refcat} and COCI overlap, as can
 seen in~Table~\ref{table:cocicmp}.
 
 \begin{table}[]
-    \begin{center}
-    \begin{tabular}{ll}
-\toprule
-\bf{Set}          & \bf{Count} \\
-
-\midrule
-        COCI (C)        &   1,094,394,688    \\
-        \emph{refcat-doi} (R)   &   1,303,424,212    \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
-        C $\cap$ R      &   1,007,539,966    \\
-        C $\setminus$ R &      86,854,309  \\
-        R $\setminus$ C & 295,884,246
-    \end{tabular}
-    \vspace*{2mm}
-	\caption{Comparison between COCI and \emph{refcat-doi}, a subset of
-\emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246
-references only in \emph{refcat-doi} come from links between datasets (GBIF,
-DOI prefix: 10.15468).}
-     \label{table:cocicmp}
-    \end{center}
+	\begin{center}
+		\begin{tabular}{ll}
+			\toprule
+			\bf{Set}              & \bf{Count}    \\
+
+			\midrule
+			COCI (C)              & 1,094,394,688 \\
+			\emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+			C $\cap$ R            & 1,007,539,966 \\
+			C $\setminus$ R       & 86,854,309    \\
+			R $\setminus$ C       & 295,884,246
+		\end{tabular}
+		\vspace*{2mm}
+		\caption{Comparison between COCI and \emph{refcat-doi}, a subset of
+			\emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246
+			references only in \emph{refcat-doi} come from links between datasets (GBIF,
+			DOI prefix: 10.15468).}
+		\label{table:cocicmp}
+	\end{center}
 \end{table}
 
 % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
@@ -178,32 +178,32 @@ from a set of eight field set manifestations, as listed in
 Table~\ref{table:fields}.
 
 \begin{table}[]
-    \begin{center}
-    \begin{tabular}{ll}
-\toprule
-        \bf{Fields}                                    & \bf{Percentage} \\
-\midrule
-    \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$  U $\cdot$  V $\cdot$ Y}    & 14\%                              \\
-    \multicolumn{1}{l}{\textbf{DOI}}                 & 14\%                              \\
-        \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\%                               \\
-        \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y}    & 4\%                               \\
-        \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U}              & 4\%                               \\
-        \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y}    & 4\%                               \\
-        \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y}            & 4\%                               \\
-        \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y}      & 4\%                               \\
-    \end{tabular}
-    \vspace*{2mm}
-    \caption{Top 8 combinations of available fields in raw reference data
-        accounting for about 53\% of the total data (CN = container name, CRN =
-contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
-issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
-    \label{table:fields}
-\end{center}
+	\begin{center}
+		\begin{tabular}{ll}
+			\toprule
+			\bf{Fields}                                                                                     & \bf{Percentage} \\
+			\midrule
+			\multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$  U $\cdot$  V $\cdot$ Y}           & 14\%            \\
+			\multicolumn{1}{l}{\textbf{DOI}}                                                                & 14\%            \\
+			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\%             \\
+			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y}           & 4\%             \\
+			\multicolumn{1}{l}{\textbf{PMID} $\cdot$ U}                                                     & 4\%             \\
+			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y}           & 4\%             \\
+			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y}                                                    & 4\%             \\
+			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y}                     & 4\%             \\
+		\end{tabular}
+		\vspace*{2mm}
+		\caption{Top 8 combinations of available fields in raw reference data
+			accounting for about 53\% of the total data (CN = container name, CRN =
+			contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+			issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.}
+		\label{table:fields}
+	\end{center}
 \end{table}
 
 Overall, a map-reduce style\citep{dean2010mapreduce} approach is
 followed\footnote{While the operations are similar, the processing is not
-distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows
+	distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows
 for some
 uniformity in the overall processing. We extract (key, document) tuples (as
 TSV) from the raw JSON data and sort by key. We then group documents with the
@@ -229,8 +229,8 @@ candidate generation phase in order to improve recall, but we are strict during
 verification, in order to control precision. Quality assurance for verification is
 implemented through a growing list of test cases of real examples from the catalog and
 their expected or desired match status\footnote{The list can be found under:
-\url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
-It is helpful to keep this test suite independent of any specific programming language.}.
+	\url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
+	It is helpful to keep this test suite independent of any specific programming language.}.
 
 
 \section{Limitations and Future Work}
@@ -238,32 +238,32 @@ It is helpful to keep this test suite independent of any specific programming la
 As other dataset in this field we expect this dataset to be iterated upon.
 
 \begin{itemize}
-    \item The fatcat catalog updates its metadata
-        continously\footnote{A changelog can currenly be followed here:
-        \url{https://fatcat.wiki/changelog}} and web crawls are conducted
-        regularly.  Current processing pipelines cover raw reference snapshot
-        creation and derivation the graph structure, which allows to rerun
-        processing based on updated data as it becomes available.
-
-    \item Metadata extraction from PDFs depends on supervised machine learning
-        models, which in turn depends of available training sets. With additional crawls and
-        metadata available we hope to improve models used for metadata
-        extraction, improving yield and reducing data extraction artifacts in
-        the process.
-
-    \item As of this version, a number of raw reference
-        docs remain unmatched, which means that neither exact nor fuzzy matching
-        can detect a link to a known entity. On the one
-        hand, this can hint at missing metadata. However, parts of the data
-        will contain a reference to a catalogued entity, but in a specific,
-        dense and harder to recover form.
-        This also include improvements to the fuzzy matching approach.
-    \end{itemize}
+	\item The fatcat catalog updates its metadata
+	      continuously\footnote{A changelog can currently be followed here:
+		      \url{https://fatcat.wiki/changelog}} and web crawls are conducted
+	      regularly.  Current processing pipelines cover raw reference snapshot
+	      creation and derivation of the graph structure, which allows us to rerun
+	      processing based on updated data as it becomes available.
+
+	\item Metadata extraction from PDFs depends on supervised machine learning
+	      models, which in turn depend on available training sets. With additional crawls and
+	      metadata available we hope to improve models used for metadata
+	      extraction, improving yield and reducing data extraction artifacts in
+	      the process.
+
+	\item As of this version, a number of raw reference
+	      docs remain unmatched, which means that neither exact nor fuzzy matching
+	      can detect a link to a known entity. On the one
+	      hand, this can hint at missing metadata. However, parts of the data
+	      will contain a reference to a catalogued entity, but in a specific,
+	      dense and harder to recover form.
+	      This also includes improvements to the fuzzy matching approach.
+\end{itemize}
 
 \section{Acknowledgements}
 
 This work is partially supported by a grant from the \emph{Andrew W. Mellon
-Foundation}.
+	Foundation}.
 
 
 \section{Appendix A}
@@ -276,69 +276,69 @@ especially for fuzzy matching to be able to zoom in on systematic errors
 more easily (see~Table~\ref{table:matches}).
 
 \begin{table}[]
-    \footnotesize
-    \captionsetup{font=normalsize}
-    \begin{center}
-\begin{tabular}{@{}rlll@{}}
-\toprule
-\textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule
-934932865                  & crossref                  & exact                 & doi                   \\
-151366108                  & fatcat-datacite           & exact                 & doi                   \\
-65345275                   & fatcat-pubmed             & exact                 & pmid                  \\
-48778607                   & fuzzy                     & strong                & jaccardauthors        \\
-42465250                   & grobid                    & exact                 & doi                   \\
-29197902                   & fatcat-pubmed             & exact                 & doi                   \\
-19996327                   & fatcat-crossref           & exact                 & doi                   \\
-11996694                   & fuzzy                     & strong                & slugtitleauthormatch  \\
-9157498                    & fuzzy                     & strong                & tokenizedauthors      \\
-3547594                    & grobid                    & exact                 & arxiv                 \\
-2310025                    & fuzzy                     & exact                 & titleauthormatch      \\
-1496515                    & grobid                    & exact                 & pmid                  \\
-680722                     & crossref                  & strong                & jaccardauthors        \\
-476331                     & fuzzy                     & strong                & versioneddoi          \\
-449271                     & grobid                    & exact                 & isbn                  \\
-230645                     & fatcat-crossref           & strong                & jaccardauthors        \\
-190578                     & grobid                    & strong                & jaccardauthors        \\
-156657                     & crossref                  & exact                 & isbn                  \\
-123681                     & fatcat-pubmed             & strong                & jaccardauthors        \\
-79328                      & crossref                  & exact                 & arxiv                 \\
-57414                      & crossref                  & strong                & tokenizedauthors      \\
-53480                      & fuzzy                     & strong                & pmiddoipair           \\
-52453                      & fuzzy                     & strong                & dataciterelatedid     \\
-47119                      & grobid                    & strong                & slugtitleauthormatch  \\
-36774                      & fuzzy                     & strong                & arxivversion          \\
-35311                      & fuzzy                     & strong                & customieeearxiv       \\
-33863                      & grobid                    & exact                 & pmcid                 \\
-23504                      & crossref                  & strong                & slugtitleauthormatch  \\
-22753                      & fatcat-crossref           & strong                & tokenizedauthors      \\
-17720                      & grobid                    & exact                 & titleauthormatch      \\
-14656                      & crossref                  & exact                 & titleauthormatch      \\
-14438                      & grobid                    & strong                & tokenizedauthors      \\
-7682                       & fatcat-crossref           & exact                 & arxiv                 \\
-5972                       & fatcat-crossref           & exact                 & isbn                  \\
-5525                       & fatcat-pubmed             & exact                 & arxiv                 \\
-4290                       & fatcat-pubmed             & strong                & tokenizedauthors      \\
-2745                       & fatcat-pubmed             & exact                 & isbn                  \\
-2342                       & fatcat-pubmed             & strong                & slugtitleauthormatch  \\
-2273                       & fatcat-crossref           & strong                & slugtitleauthormatch  \\
-1960                       & fuzzy                     & exact                 & workid                \\
-1150                       & fatcat-crossref           & exact                 & titleauthormatch      \\
-1041                       & fatcat-pubmed             & exact                 & titleauthormatch      \\
-895                        & fuzzy                     & strong                & figshareversion       \\
-317                        & fuzzy                     & strong                & titleartifact         \\
-82                         & grobid                    & strong                & titleartifact         \\
-33                         & crossref                  & strong                & titleartifact         \\
-5                          & fuzzy                     & strong                & custombsiundated      \\
-1                          & fuzzy                     & strong                & custombsisubdoc       \\
-1                          & fatcat                    & exact                 & doi                   \\ \bottomrule
-\end{tabular}
-    \vspace*{2mm}
-    \caption{Table of match counts, reference provenance, match status and
-match reason. The match reason identifier encode a specific rule in the domain
-dependent verification process and are included for completeness - we do not
-include the details of each rule in this report.}
-    \label{table:matches}
-\end{center}
+	\footnotesize
+	\captionsetup{font=normalsize}
+	\begin{center}
+		\begin{tabular}{@{}rlll@{}}
+			\toprule
+			\textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason}      \\ \midrule
+			934932865      & crossref            & exact           & doi                  \\
+			151366108      & fatcat-datacite     & exact           & doi                  \\
+			65345275       & fatcat-pubmed       & exact           & pmid                 \\
+			48778607       & fuzzy               & strong          & jaccardauthors       \\
+			42465250       & grobid              & exact           & doi                  \\
+			29197902       & fatcat-pubmed       & exact           & doi                  \\
+			19996327       & fatcat-crossref     & exact           & doi                  \\
+			11996694       & fuzzy               & strong          & slugtitleauthormatch \\
+			9157498        & fuzzy               & strong          & tokenizedauthors     \\
+			3547594        & grobid              & exact           & arxiv                \\
+			2310025        & fuzzy               & exact           & titleauthormatch     \\
+			1496515        & grobid              & exact           & pmid                 \\
+			680722         & crossref            & strong          & jaccardauthors       \\
+			476331         & fuzzy               & strong          & versioneddoi         \\
+			449271         & grobid              & exact           & isbn                 \\
+			230645         & fatcat-crossref     & strong          & jaccardauthors       \\
+			190578         & grobid              & strong          & jaccardauthors       \\
+			156657         & crossref            & exact           & isbn                 \\
+			123681         & fatcat-pubmed       & strong          & jaccardauthors       \\
+			79328          & crossref            & exact           & arxiv                \\
+			57414          & crossref            & strong          & tokenizedauthors     \\
+			53480          & fuzzy               & strong          & pmiddoipair          \\
+			52453          & fuzzy               & strong          & dataciterelatedid    \\
+			47119          & grobid              & strong          & slugtitleauthormatch \\
+			36774          & fuzzy               & strong          & arxivversion         \\
+			35311          & fuzzy               & strong          & customieeearxiv      \\
+			33863          & grobid              & exact           & pmcid                \\
+			23504          & crossref            & strong          & slugtitleauthormatch \\
+			22753          & fatcat-crossref     & strong          & tokenizedauthors     \\
+			17720          & grobid              & exact           & titleauthormatch     \\
+			14656          & crossref            & exact           & titleauthormatch     \\
+			14438          & grobid              & strong          & tokenizedauthors     \\
+			7682           & fatcat-crossref     & exact           & arxiv                \\
+			5972           & fatcat-crossref     & exact           & isbn                 \\
+			5525           & fatcat-pubmed       & exact           & arxiv                \\
+			4290           & fatcat-pubmed       & strong          & tokenizedauthors     \\
+			2745           & fatcat-pubmed       & exact           & isbn                 \\
+			2342           & fatcat-pubmed       & strong          & slugtitleauthormatch \\
+			2273           & fatcat-crossref     & strong          & slugtitleauthormatch \\
+			1960           & fuzzy               & exact           & workid               \\
+			1150           & fatcat-crossref     & exact           & titleauthormatch     \\
+			1041           & fatcat-pubmed       & exact           & titleauthormatch     \\
+			895            & fuzzy               & strong          & figshareversion      \\
+			317            & fuzzy               & strong          & titleartifact        \\
+			82             & grobid              & strong          & titleartifact        \\
+			33             & crossref            & strong          & titleartifact        \\
+			5              & fuzzy               & strong          & custombsiundated     \\
+			1              & fuzzy               & strong          & custombsisubdoc      \\
+			1              & fatcat              & exact           & doi                  \\ \bottomrule
+		\end{tabular}
+		\vspace*{2mm}
+		\caption{Table of match counts, reference provenance, match status and
+			match reason. The match reason identifiers encode specific rules in the domain
+			dependent verification process and are included for completeness---we do not
+			include the details of each rule in this report.}
+		\label{table:matches}
+	\end{center}
 \end{table}
 
 \bibliographystyle{abbrv}
author	Martin Czygan <martin.czygan@gmail.com>	2021-08-08 14:02:44 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-08-08 14:02:44 +0200
commit	1c21e964b421e642fca4054f67ba7586bfcb683d (patch)
tree	8d311c2b5234ef4e52604b62d602e9375d5734ef
parent	eb057b6e2a1f74b9a74a164e13c9042332bb1244 (diff)
download	refcat-1c21e964b421e642fca4054f67ba7586bfcb683d.tar.gz refcat-1c21e964b421e642fca4054f67ba7586bfcb683d.zip