From de967bf895745b8a666061639a88b60d895b9372 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Sun, 8 Aug 2021 15:16:15 +0200
Subject: wip: paper tweaks

---
 docs/Simple/main.pdf | Bin 92494 -> 95632 bytes
 docs/Simple/main.tex | 119 +++++++++++++++++++++++++++++----------------------
 docs/Simple/refs.bib |  48 +++++++++++++++++++++
 3 files changed, 115 insertions(+), 52 deletions(-)

diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf
index b03ce61..e28c4c3 100644
Binary files a/docs/Simple/main.pdf and b/docs/Simple/main.pdf differ

diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex
index 6871020..e4febd9 100644
--- a/docs/Simple/main.tex
+++ b/docs/Simple/main.tex
@@ -49,7 +49,7 @@ Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the
     graph consists of 1,323,423,672 citations. We release this dataset under a
     CC0 Public Domain Dedication, accessible through an archive
-    collection\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
+    item\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
     code used in the derivation process is released under an MIT
     license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
 \end{abstract}
@@ -113,22 +113,33 @@ citations is not expected to shrink in the future.

 \section{Dataset}

-We release the first version of the \emph{refcat} dataset
-in an format used internally for storage and to serve queries (and which we
-call \emph{biblioref} or \emph{bref} for short). The dataset includes metadata
-from fatcat and the Open Library Project and inbound links from the English Wikipedia.
-
-The format contains source and target (fatcat release and work) identifiers, a
+We release the first version of the \emph{refcat} dataset in a format used
+internally for storage and to serve queries (which we call \emph{biblioref},
+or \emph{bref} for short). The dataset includes metadata from fatcat, the
+Open Library Project and inbound links from the English Wikipedia. The fatcat
+project itself aggregates data from a variety of open data sources, such as
+Crossref\citep{crossref}, PubMed\citep{canese2013pubmed},
+DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others,
+as well as metadata generated from analysis of data preserved at the Internet
+Archive and active crawls of publication sites on the web.
+
+The dataset is
+integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users
+to explore inbound and outbound references\cite{fatcatguidereferencegraph}.
+
+The format records source and target (fatcat release and work) identifiers, a
 few attributes from the metadata (such as year or release stage), as well as
 information about the match status and provenance.

 The dataset currently contains 1,323,423,672 citations across 76,327,662
-entities (55,123,635 unique source and 60,244,206 unique target work identifiers).
+entities (55,123,635 unique source and 60,244,206 unique target work
+identifiers; for 1,303,424,212 - or 98.49\% of all citations - we have a DOI
+for both source and target).
 The majority of matches - 1,250,523,321 - are established through identifier
 based matching (DOI, PMID, PMCID, ARXIV, ISBN). 72,900,351 citations are
-established through fuzzy matching.
+established through fuzzy matching techniques.

-The majority of DOI based matches between \emph{refcat} and COCI overlap, as can be
+The majority of citations between \emph{refcat} and COCI overlap, as can be
 seen in~Table~\ref{table:cocicmp}.

 \begin{table}[]
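The COCI comparison above reduces to set operations over (citing DOI, cited DOI) pairs. The following is a minimal, hypothetical sketch in Python of that comparison; the file names and column names (source_doi/target_doi for refcat-doi, citing/cited for COCI) are assumptions for illustration and do not describe the actual export formats or the pipeline used to produce the table below.

    # Hypothetical sketch: compare DOI-to-DOI citation pairs from two CSV dumps.
    # File and column names are assumptions, not the refcat or COCI schemas.
    import csv

    def doi_pairs(path, citing_field, cited_field):
        """Yield normalized (citing DOI, cited DOI) pairs from a CSV dump."""
        with open(path, newline="", encoding="utf-8") as handle:
            for row in csv.DictReader(handle):
                citing = (row.get(citing_field) or "").strip().lower()
                cited = (row.get(cited_field) or "").strip().lower()
                if citing and cited:
                    yield citing, cited

    refcat = set(doi_pairs("refcat-doi.csv", "source_doi", "target_doi"))
    coci = set(doi_pairs("coci.csv", "citing", "cited"))

    print("only in refcat-doi:", len(refcat - coci))
    print("only in COCI:", len(coci - refcat))
    print("in both:", len(refcat & coci))

At the actual scale of more than a billion pairs per side, the same comparison would run over externally sorted files or a distributed join rather than in-memory sets; the sketch only shows the counting logic.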
@@ -146,9 +157,10 @@ seen in~Table~\ref{table:cocicmp}.
     \end{tabular}
     \vspace*{2mm}
     \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
-    \emph{refcat} where entities have a known DOI. At least 50\% of the 295,884,246
-    references only in \emph{refcat-doi} come from links between datasets (GBIF,
-    DOI prefix: 10.15468).}
+    \emph{refcat} where entities have a known DOI. At least 50\% of the
+    295,884,246 references only in \emph{refcat-doi} come from links
+    recorded within a specific dataset provider (GBIF, DOI prefix:
+    10.15468).}
     \label{table:cocicmp}
     \end{center}
 \end{table}
@@ -162,14 +174,14 @@ seen in~Table~\ref{table:cocicmp}.

 The constraints for the system design are informed by the volume and the
 variety of the data. The capability to run the whole graph derivation on a
-single machine was a minor goal as well. In total, the raw inputs amount to a few
-TB of textual content, mostly newline delimited JSON. More importantly, while
-the number of data fields is low, certain schemas are very partial with
-hundreds of different combinations of available field values found in the raw
-reference data. This is most likely caused by aggregators passing on reference
-data coming from hundreds of sources, each of which not necessarily agreeing on
-a common granularity for citation data and from artifacts of machine learning
-based structured data extraction tools.
+single machine was a minor goal as well. In total, the raw inputs amount to a
+few terabytes of textual content, mostly newline delimited JSON. More
+importantly, while the number of data fields is low, certain schemas are very
+partial, with hundreds of different combinations of available field values found
+in the raw reference data. This is most likely caused by aggregators passing on
+reference data coming from hundreds of sources, each of which does not
+necessarily agree on a common granularity for citation data, and by artifacts
+of machine learning based structured data extraction tools.

 Each combination of fields may require a slightly different processing path.
 For example, references with an Arxiv identifier can be processed differently
@@ -211,14 +223,14 @@ same key and apply a function on each group in order to generate our target
 schema or perform additional operations such as deduplication or fusion of
 matched and unmatched references.

-The key derivation can be exact (like an identifier like DOI, PMID, etc) or
+The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
 based on a value normalization, like slugifying a title string. For identifier
 based matches we can generate the target schema directly. For fuzzy matching
 candidates, we pass possible match pairs through a verification procedure,
-which is implemented for \emph{release entity} pairs. This procedure is a
+which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a
 domain dependent rule based verification, able to identify different versions
 of a publication, preprint-published pairs and documents, which are
-are similar by various metrics calculated over title and authors. The fuzzy matching
+similar by various metrics calculated over title and author fields. The fuzzy matching
 approach is applied on all reference documents without identifier (a title is
 currently required).

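The derive-a-key-and-group step described above can be illustrated with a short, hypothetical Python sketch. The field names (doi, title), the slug rules and the in-memory sort are assumptions made for illustration only; the actual pipeline operates on sorted newline delimited JSON files and applies the verification procedure to fuzzy candidates.

    # Hypothetical sketch of "derive a key, then group" over reference documents.
    # Field names and slug rules are illustrative assumptions.
    import itertools
    import json
    import re
    import unicodedata

    def slugify_title(title):
        """Lowercase, strip accents and non-alphanumerics to form a grouping key."""
        normalized = unicodedata.normalize("NFKD", title)
        ascii_only = normalized.encode("ascii", "ignore").decode("ascii")
        return re.sub(r"[^a-z0-9]", "", ascii_only.lower())

    def derive_key(doc):
        """Prefer an exact identifier; fall back to a normalized title slug."""
        if doc.get("doi"):
            return ("doi", doc["doi"].strip().lower())
        if doc.get("title"):
            return ("slug", slugify_title(doc["title"]))
        return None  # unmatched: neither identifier nor title available

    def grouped(lines):
        """Sort by derived key, then group -- a single-machine stand-in for a shuffle."""
        keyed = [(derive_key(json.loads(line)), line) for line in lines]
        keyed = [(key, line) for key, line in keyed if key is not None]
        keyed.sort(key=lambda pair: pair[0])
        return itertools.groupby(keyed, key=lambda pair: pair[0])

    sample = [
        json.dumps({"doi": "10.1000/XYZ123", "title": "A Paper"}),
        json.dumps({"doi": "10.1000/xyz123"}),
        json.dumps({"title": "An  Unidentified   Report!"}),
    ]
    for key, members in grouped(sample):
        print(key, "->", len(list(members)), "documents")

Groups keyed by an identifier can be mapped to the target schema directly, while groups keyed by a title slug would be passed pairwise through the verification step described in the hunk above.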
@@ -242,22 +254,25 @@ As with other datasets in this field, we expect this dataset to be iterated upon.
     continuously\footnote{A changelog can currently be followed here:
     \url{https://fatcat.wiki/changelog}} and web crawls are conducted
     regularly. Current processing pipelines cover raw reference snapshot
-    creation and derivation the graph structure, which allows to rerun
+    creation and derivation of the graph structure, which allows us to rerun
     processing based on updated data as it becomes available.
     \item Metadata extraction from PDFs depends on supervised machine learning
-    models, which in turn depends of available training sets. With additional crawls and
+    models, which in turn depend on available training datasets. With additional crawls and
     metadata available we hope to improve models used for metadata
     extraction, improving yield and reducing data extraction artifacts in
     the process.
     \item As of this version, a number of raw reference docs remain
     unmatched, which means that neither exact nor fuzzy matching
-    can detect a link to a known entity. On the one
+    has detected a link to a known entity. On the one
     hand, this can hint at missing metadata. However, parts of the data
     will contain a reference to a catalogued entity, but in a specific,
     dense and harder to recover form. This also includes improvements to the
     fuzzy matching approach.
+    \item The reference dataset contains millions of URLs and their integration
+    into the graph has been implemented as a prototype. A full implementation
+    requires a few data cleanup and normalization steps.
 \end{itemize}

 \section{Acknowledgements}
@@ -307,33 +322,33 @@ more easily (see~Table~\ref{table:matches}).
     52453 & fuzzy & strong & dataciterelatedid \\
     47119 & grobid & strong & slugtitleauthormatch \\
     36774 & fuzzy & strong & arxivversion \\
-    35311 & fuzzy & strong & customieeearxiv \\
-    33863 & grobid & exact & pmcid \\
-    23504 & crossref & strong & slugtitleauthormatch \\
-    22753 & fatcat-crossref & strong & tokenizedauthors \\
-    17720 & grobid & exact & titleauthormatch \\
-    14656 & crossref & exact & titleauthormatch \\
-    14438 & grobid & strong & tokenizedauthors \\
-    7682 & fatcat-crossref & exact & arxiv \\
-    5972 & fatcat-crossref & exact & isbn \\
-    5525 & fatcat-pubmed & exact & arxiv \\
-    4290 & fatcat-pubmed & strong & tokenizedauthors \\
-    2745 & fatcat-pubmed & exact & isbn \\
-    2342 & fatcat-pubmed & strong & slugtitleauthormatch \\
-    2273 & fatcat-crossref & strong & slugtitleauthormatch \\
-    1960 & fuzzy & exact & workid \\
-    1150 & fatcat-crossref & exact & titleauthormatch \\
-    1041 & fatcat-pubmed & exact & titleauthormatch \\
-    895 & fuzzy & strong & figshareversion \\
-    317 & fuzzy & strong & titleartifact \\
-    82 & grobid & strong & titleartifact \\
-    33 & crossref & strong & titleartifact \\
-    5 & fuzzy & strong & custombsiundated \\
-    1 & fuzzy & strong & custombsisubdoc \\
-    1 & fatcat & exact & doi \\ \bottomrule
+    % 35311 & fuzzy & strong & customieeearxiv \\
+    % 33863 & grobid & exact & pmcid \\
+    % 23504 & crossref & strong & slugtitleauthormatch \\
+    % 22753 & fatcat-crossref & strong & tokenizedauthors \\
+    % 17720 & grobid & exact & titleauthormatch \\
+    % 14656 & crossref & exact & titleauthormatch \\
+    % 14438 & grobid & strong & tokenizedauthors \\
+    % 7682 & fatcat-crossref & exact & arxiv \\
+    % 5972 & fatcat-crossref & exact & isbn \\
+    % 5525 & fatcat-pubmed & exact & arxiv \\
+    % 4290 & fatcat-pubmed & strong & tokenizedauthors \\
+    % 2745 & fatcat-pubmed & exact & isbn \\
+    % 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\
+    % 2273 & fatcat-crossref & strong & slugtitleauthormatch \\
+    % 1960 & fuzzy & exact & workid \\
+    % 1150 & fatcat-crossref & exact & titleauthormatch \\
+    % 1041 & fatcat-pubmed & exact & titleauthormatch \\
+    % 895 & fuzzy & strong & figshareversion \\
+    % 317 & fuzzy & strong & titleartifact \\
+    % 82 & grobid & strong & titleartifact \\
+    % 33 & crossref & strong & titleartifact \\
+    % 5 & fuzzy & strong & custombsiundated \\
+    % 1 & fuzzy & strong & custombsisubdoc \\
+    % 1 & fatcat & exact & doi \\ \bottomrule
     \end{tabular}
     \vspace*{2mm}
-    \caption{Table of match counts, reference provenance, match status and
+    \caption{Table of match counts (top 25), reference provenance, match status and
     match reason. The match reason identifiers encode specific rules in the
     domain dependent verification process and are included for completeness;
     we do not include the details of each rule in this report.}
diff --git a/docs/Simple/refs.bib b/docs/Simple/refs.bib
index 5ae3fc8..c61021e 100644
--- a/docs/Simple/refs.bib
+++ b/docs/Simple/refs.bib
@@ -101,6 +101,54 @@ howpublished = {\url{https://i4oc.org/}},
 note = {Accessed: 2021-07-30}
 }

+@misc{fatcatguidereferencegraph,
+  title = {The Fatcat Guide: Reference Graph (refcat)},
+  howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}},
+  note = {Accessed: 2021-08-08}
+}
+
+@misc{crossref,
+  title = {Crossref},
+  howpublished = {\url{https://crossref.org}},
+  note = {Accessed: 2021-08-08}
+}
+
+@misc{doaj,
+  title = {Directory of Open Access Journals},
+  howpublished = {\url{https://doaj.org}},
+  note = {Accessed: 2021-08-08}
+}
+
+@inproceedings{ley2002dblp,
+  title = {The DBLP computer science bibliography: Evolution, research issues, perspectives},
+  author = {Ley, Michael},
+  booktitle = {International Symposium on String Processing and Information Retrieval},
+  pages = {1--10},
+  year = {2002},
+  organization = {Springer}
+}
+
+@inproceedings{brase2009datacite,
+  title = {DataCite - A global registration agency for research data},
+  author = {Brase, Jan},
+  booktitle = {2009 Fourth International Conference on Cooperation and Promotion of Information Resources in Science and Technology},
+  pages = {257--261},
+  year = {2009},
+  organization = {IEEE}
+}
+
+@article{canese2013pubmed,
+  title = {PubMed: The Bibliographic Database},
+  author = {Canese, Kathi and Weis, Sarah},
+  journal = {The NCBI Handbook},
+  volume = {2},
+  pages = {1},
+  year = {2013},
+  publisher = {National Center for Biotechnology Information (US)}
+}
+
 @article{shotton2018funders,
 title={Funders should mandate open citations.},
 author={Shotton, David},