diff options
-rw-r--r-- | docs/Simple/main.pdf | bin | 92494 -> 92494 bytes | |||
-rw-r--r-- | docs/Simple/main.tex | 304 |
2 files changed, 152 insertions, 152 deletions
diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf Binary files differindex 9545257..71ab5ca 100644 --- a/docs/Simple/main.pdf +++ b/docs/Simple/main.pdf diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex index 2c3001d..6871020 100644 --- a/docs/Simple/main.tex +++ b/docs/Simple/main.tex @@ -21,17 +21,17 @@ \title{Fatcat Reference Dataset} \author{Martin Czygan \\ -\\ -Internet Archive \\ -San Francisco, California, USA \\ -martin@archive.org \\ -\and -Bryan Newbold \\ -\\ -Internet Archive \\ -San Francisco, California, USA \\ -bnewbold@archive.org \\ -\\ + \\ + Internet Archive \\ + San Francisco, California, USA \\ + martin@archive.org \\ + \and + Bryan Newbold \\ + \\ + Internet Archive \\ + San Francisco, California, USA \\ + bnewbold@archive.org \\ + \\ } @@ -40,18 +40,18 @@ bnewbold@archive.org \\ \begin{abstract} -As part of its scholarly data efforts, the Internet Archive releases a first version of a citation -graph dataset, named \emph{refcat}, derived from scholarly publications and -additional data sources. It is composed of data gathered by the fatcat -cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale -crawls targeting primary and secondary scholarly outputs, as well as metadata -from the Open Library\footnote{\url{https://openlibrary.org}} project and -Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the -graph consists of 1,323,423,672 citations. We release this dataset under a CC0 -Public Domain Dedication, accessible through an archive -collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All -code used in the derivation process is released under an MIT -license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}. + As part of its scholarly data efforts, the Internet Archive releases a first version of a citation + graph dataset, named \emph{refcat}, derived from scholarly publications and + additional data sources. 
It is composed of data gathered by the fatcat + cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale + crawls targeting primary and secondary scholarly outputs, as well as metadata + from the Open Library\footnote{\url{https://openlibrary.org}} project and + Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the + graph consists of 1,323,423,672 citations. We release this dataset under a CC0 + Public Domain Dedication, accessible through an archive + collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All + code used in the derivation process is released under an MIT + license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}. \end{abstract} \keywords{Citation Graph, Web Archiving} @@ -99,7 +99,7 @@ publications\footnote{\url{http://wikicite.org/statistics.html}}. Microsoft Academic Graph\citep{sinha2015overview} is comprised of a number of entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}} with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at -\url{https://archive.org/details/mag-2021-06-07}} the + \url{https://archive.org/details/mag-2021-06-07}} the \emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466 bibliographic entities. @@ -132,25 +132,25 @@ The majority of DOI based matches between \emph{refcat} and COCI overlap, as can be seen in~Table~\ref{table:cocicmp}. 
\begin{table}[] - \begin{center} - \begin{tabular}{ll} -\toprule -\bf{Set} & \bf{Count} \\ - -\midrule - COCI (C) & 1,094,394,688 \\ - \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst - C $\cap$ R & 1,007,539,966 \\ - C $\setminus$ R & 86,854,309 \\ - R $\setminus$ C & 295,884,246 - \end{tabular} - \vspace*{2mm} - \caption{Comparison between COCI and \emph{refcat-doi}, a subset of -\emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246 -references only in \emph{refcat-doi} come from links between datasets (GBIF, -DOI prefix: 10.15468).} - \label{table:cocicmp} - \end{center} + \begin{center} + \begin{tabular}{ll} + \toprule + \bf{Set} & \bf{Count} \\ + + \midrule + COCI (C) & 1,094,394,688 \\ + \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst + C $\cap$ R & 1,007,539,966 \\ + C $\setminus$ R & 86,854,309 \\ + R $\setminus$ C & 295,884,246 + \end{tabular} + \vspace*{2mm} + \caption{Comparison between COCI and \emph{refcat-doi}, a subset of + \emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246 + references only in \emph{refcat-doi} come from links between datasets (GBIF, + DOI prefix: 10.15468).} + \label{table:cocicmp} + \end{center} \end{table} % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst @@ -178,32 +178,32 @@ from a set of eight field set manifestations, as listed in Table~\ref{table:fields}. 
\begin{table}[] - \begin{center} - \begin{tabular}{ll} -\toprule - \bf{Fields} & \bf{Percentage} \\ -\midrule - \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ - \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\ - \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\ - \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\ - \end{tabular} - \vspace*{2mm} - \caption{Top 8 combinations of available fields in raw reference data - accounting for about 53\% of the total data (CN = container name, CRN = -contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS = -issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. 
Identifiers emphasized.} - \label{table:fields} -\end{center} + \begin{center} + \begin{tabular}{ll} + \toprule + \bf{Fields} & \bf{Percentage} \\ + \midrule + \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\ + \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\ + \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\ + \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\ + \end{tabular} + \vspace*{2mm} + \caption{Top 8 combinations of available fields in raw reference data + accounting for about 53\% of the total data (CN = container name, CRN = + contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS = + issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value. Identifiers emphasized.} + \label{table:fields} + \end{center} \end{table} Overall, a map-reduce style\citep{dean2010mapreduce} approach is followed\footnote{While the operations are similar, the processing is not -distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows + distributed but runs on a single machine. For space efficiency, zstd\citep{collet2018zstandard} is used to compress raw data and derivations.}, which allows for some uniformity in the overall processing. We extract (key, document) tuples (as TSV) from the raw JSON data and sort by key. We then group documents with the @@ -229,8 +229,8 @@ candidate generation phase in order to improve recall, but we are strict during verification, in order to control precision. 
Quality assurance for verification is implemented through a growing list of test cases of real examples from the catalog and their expected or desired match status\footnote{The list can be found under: -\url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}. -It is helpful to keep this test suite independent of any specific programming language.}. + \url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}. + It is helpful to keep this test suite independent of any specific programming language.}. \section{Limitations and Future Work} @@ -238,32 +238,32 @@ It is helpful to keep this test suite independent of any specific programming la As with other datasets in this field, we expect this dataset to be iterated upon. \begin{itemize} - \item The fatcat catalog updates its metadata - continuously\footnote{A changelog can currently be followed here: - \url{https://fatcat.wiki/changelog}} and web crawls are conducted - regularly. Current processing pipelines cover raw reference snapshot - creation and derivation of the graph structure, which allows us to rerun - processing based on updated data as it becomes available. - - \item Metadata extraction from PDFs depends on supervised machine learning - models, which in turn depend on available training sets. With additional crawls and - metadata available we hope to improve models used for metadata - extraction, improving yield and reducing data extraction artifacts in - the process. - - \item As of this version, a number of raw reference - docs remain unmatched, which means that neither exact nor fuzzy matching - can detect a link to a known entity. On the one - hand, this can hint at missing metadata. However, parts of the data - will contain a reference to a catalogued entity, but in a specific, - dense and harder to recover form. - This also includes improvements to the fuzzy matching approach. 
- \end{itemize} + \item The fatcat catalog updates its metadata + continuously\footnote{A changelog can currently be followed here: + \url{https://fatcat.wiki/changelog}} and web crawls are conducted + regularly. Current processing pipelines cover raw reference snapshot + creation and derivation of the graph structure, which allows us to rerun + processing based on updated data as it becomes available. + + \item Metadata extraction from PDFs depends on supervised machine learning + models, which in turn depend on available training sets. With additional crawls and + metadata available we hope to improve models used for metadata + extraction, improving yield and reducing data extraction artifacts in + the process. + + \item As of this version, a number of raw reference + docs remain unmatched, which means that neither exact nor fuzzy matching + can detect a link to a known entity. On the one + hand, this can hint at missing metadata. However, parts of the data + will contain a reference to a catalogued entity, but in a specific, + dense and harder to recover form. + This also includes improvements to the fuzzy matching approach. +\end{itemize} \section{Acknowledgements} This work is partially supported by a grant from the \emph{Andrew W. Mellon -Foundation}. + Foundation}. \section{Appendix A} @@ -276,69 +276,69 @@ especially for fuzzy matching to be able to zoom in on systematic errors more easily (see~Table~\ref{table:matches}). 
\begin{table}[] - \footnotesize - \captionsetup{font=normalsize} - \begin{center} -\begin{tabular}{@{}rlll@{}} -\toprule -\textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule -934932865 & crossref & exact & doi \\ -151366108 & fatcat-datacite & exact & doi \\ -65345275 & fatcat-pubmed & exact & pmid \\ -48778607 & fuzzy & strong & jaccardauthors \\ -42465250 & grobid & exact & doi \\ -29197902 & fatcat-pubmed & exact & doi \\ -19996327 & fatcat-crossref & exact & doi \\ -11996694 & fuzzy & strong & slugtitleauthormatch \\ -9157498 & fuzzy & strong & tokenizedauthors \\ -3547594 & grobid & exact & arxiv \\ -2310025 & fuzzy & exact & titleauthormatch \\ -1496515 & grobid & exact & pmid \\ -680722 & crossref & strong & jaccardauthors \\ -476331 & fuzzy & strong & versioneddoi \\ -449271 & grobid & exact & isbn \\ -230645 & fatcat-crossref & strong & jaccardauthors \\ -190578 & grobid & strong & jaccardauthors \\ -156657 & crossref & exact & isbn \\ -123681 & fatcat-pubmed & strong & jaccardauthors \\ -79328 & crossref & exact & arxiv \\ -57414 & crossref & strong & tokenizedauthors \\ -53480 & fuzzy & strong & pmiddoipair \\ -52453 & fuzzy & strong & dataciterelatedid \\ -47119 & grobid & strong & slugtitleauthormatch \\ -36774 & fuzzy & strong & arxivversion \\ -35311 & fuzzy & strong & customieeearxiv \\ -33863 & grobid & exact & pmcid \\ -23504 & crossref & strong & slugtitleauthormatch \\ -22753 & fatcat-crossref & strong & tokenizedauthors \\ -17720 & grobid & exact & titleauthormatch \\ -14656 & crossref & exact & titleauthormatch \\ -14438 & grobid & strong & tokenizedauthors \\ -7682 & fatcat-crossref & exact & arxiv \\ -5972 & fatcat-crossref & exact & isbn \\ -5525 & fatcat-pubmed & exact & arxiv \\ -4290 & fatcat-pubmed & strong & tokenizedauthors \\ -2745 & fatcat-pubmed & exact & isbn \\ -2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ -2273 & fatcat-crossref & strong & slugtitleauthormatch \\ -1960 & fuzzy & exact 
& workid \\ -1150 & fatcat-crossref & exact & titleauthormatch \\ -1041 & fatcat-pubmed & exact & titleauthormatch \\ -895 & fuzzy & strong & figshareversion \\ -317 & fuzzy & strong & titleartifact \\ -82 & grobid & strong & titleartifact \\ -33 & crossref & strong & titleartifact \\ -5 & fuzzy & strong & custombsiundated \\ -1 & fuzzy & strong & custombsisubdoc \\ -1 & fatcat & exact & doi \\ \bottomrule -\end{tabular} - \vspace*{2mm} - \caption{Table of match counts, reference provenance, match status and -match reason. The match reason identifiers encode a specific rule in the domain -dependent verification process and are included for completeness - we do not -include the details of each rule in this report.} - \label{table:matches} -\end{center} + \footnotesize + \captionsetup{font=normalsize} + \begin{center} + \begin{tabular}{@{}rlll@{}} + \toprule + \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule + 934932865 & crossref & exact & doi \\ + 151366108 & fatcat-datacite & exact & doi \\ + 65345275 & fatcat-pubmed & exact & pmid \\ + 48778607 & fuzzy & strong & jaccardauthors \\ + 42465250 & grobid & exact & doi \\ + 29197902 & fatcat-pubmed & exact & doi \\ + 19996327 & fatcat-crossref & exact & doi \\ + 11996694 & fuzzy & strong & slugtitleauthormatch \\ + 9157498 & fuzzy & strong & tokenizedauthors \\ + 3547594 & grobid & exact & arxiv \\ + 2310025 & fuzzy & exact & titleauthormatch \\ + 1496515 & grobid & exact & pmid \\ + 680722 & crossref & strong & jaccardauthors \\ + 476331 & fuzzy & strong & versioneddoi \\ + 449271 & grobid & exact & isbn \\ + 230645 & fatcat-crossref & strong & jaccardauthors \\ + 190578 & grobid & strong & jaccardauthors \\ + 156657 & crossref & exact & isbn \\ + 123681 & fatcat-pubmed & strong & jaccardauthors \\ + 79328 & crossref & exact & arxiv \\ + 57414 & crossref & strong & tokenizedauthors \\ + 53480 & fuzzy & strong & pmiddoipair \\ + 52453 & fuzzy & strong & dataciterelatedid \\ + 47119 & 
grobid & strong & slugtitleauthormatch \\ + 36774 & fuzzy & strong & arxivversion \\ + 35311 & fuzzy & strong & customieeearxiv \\ + 33863 & grobid & exact & pmcid \\ + 23504 & crossref & strong & slugtitleauthormatch \\ + 22753 & fatcat-crossref & strong & tokenizedauthors \\ + 17720 & grobid & exact & titleauthormatch \\ + 14656 & crossref & exact & titleauthormatch \\ + 14438 & grobid & strong & tokenizedauthors \\ + 7682 & fatcat-crossref & exact & arxiv \\ + 5972 & fatcat-crossref & exact & isbn \\ + 5525 & fatcat-pubmed & exact & arxiv \\ + 4290 & fatcat-pubmed & strong & tokenizedauthors \\ + 2745 & fatcat-pubmed & exact & isbn \\ + 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ + 2273 & fatcat-crossref & strong & slugtitleauthormatch \\ + 1960 & fuzzy & exact & workid \\ + 1150 & fatcat-crossref & exact & titleauthormatch \\ + 1041 & fatcat-pubmed & exact & titleauthormatch \\ + 895 & fuzzy & strong & figshareversion \\ + 317 & fuzzy & strong & titleartifact \\ + 82 & grobid & strong & titleartifact \\ + 33 & crossref & strong & titleartifact \\ + 5 & fuzzy & strong & custombsiundated \\ + 1 & fuzzy & strong & custombsisubdoc \\ + 1 & fatcat & exact & doi \\ \bottomrule + \end{tabular} + \vspace*{2mm} + \caption{Table of match counts, reference provenance, match status and + match reason. The match reason identifiers encode a specific rule in the domain + dependent verification process and are included for completeness - we do not + include the details of each rule in this report.} + \label{table:matches} + \end{center} \end{table} \bibliographystyle{abbrv} |