From 3a79551dfe54ba668f7eee9de88625a0d33d9c7f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Sep 2021 20:34:21 +0200 Subject: docs: version with refs_types graph --- docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 93825 -> 140144 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 35 +++++++++++++++++---- .../TR-20210808100000-IA-WDS-REFCAT/refs_types.dot | 18 +++++++++++ .../TR-20210808100000-IA-WDS-REFCAT/refs_types.png | Bin 0 -> 42338 bytes 4 files changed, 47 insertions(+), 6 deletions(-) create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.dot create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.png (limited to 'docs') diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf index f4273e4..97cfc56 100644 Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index b278149..b020a47 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -322,10 +322,12 @@ is processed continously or in batches). Reference data from PDF documents has been extracted with GROBID\footnote{GROBID \href{https://github.com/kermitt2/grobid/releases/tag/0.5.5}{v0.5.5}}, with the TEI-XML results being cached locally in a key-value store accessible with an S3 -API. Archived PDF documents result from dedicated web-scale crawls of scholarly -domains conducted with -Heritrix\footnote{\href{https://github.com/internetarchive/heritrix3}{https://github.com/internetarchive/heritrix3}} (and -other crawl technologies) and a variety of seed lists targeting journal +API\footnote{Currently, + \href{https://github.com/chrislusf/seaweedfs}{https://github.com/chrislusf/seaweedfs} + is used}. 
Archived PDF documents result from dedicated web-scale crawls of +scholarly domains conducted with +Heritrix\footnote{\href{https://github.com/internetarchive/heritrix3}{https://github.com/internetarchive/heritrix3}} +(and other crawl technologies) and a variety of seed lists targeting journal homepages, repositories, dataset providers, aggregators, web archives and other venues. A processing pipeline merges catalog data from the primary database and cached data from the key-value store and generates the set of about 2.5B @@ -367,6 +369,13 @@ derivations, as data dependencies are encoded with the help of the orchestrator. Within the tasks, we also utilize classic platform tools such as \emph{sort}~\citep{mcilroy1971research}. +During a last processing step, we fuse reference matches and unmatched items +into a single, indexable file. This step includes deduplication of different +matching methods (e.g. prefer exact matches over fuzzy matches). This file is +indexed into a search index and serves both matched and unmatched references +for the web application, allowing for further collection of feedback on match +quality and possible improvements. + With a few schema conversions, fuzzy matching can be applied to Wikipedia articles and Open Library (edition) records as well. The aspect of precision and recall are represented by the two stages: we are generous in the match @@ -416,6 +425,19 @@ This work is partially supported by a grant from the \emph{Andrew W. Mellon \section{Appendix A} +Figure~\ref{fig:types} shows a schematic of actual and possible reference entities. 
+ +\begin{figure}[h] + \centering + \includegraphics[width=0.45\textwidth]{refs_types.png} + \caption{Schematic of the main reference entities; green: included in the + corpus; orange: currently in development; gray: planned, but not in + development; red: long-term desiderata.} + \label{fig:types} +\end{figure} + +\section{Appendix B} + A note on data quality: While we implement various data quality measures, real-world data, especially coming from many different sources will contain @@ -491,7 +513,8 @@ more easily (see~Table~\ref{table:matches}). \end{center} \end{table} -\bibliographystyle{abbrv} -% \bibliographystyle{plainnat} + +% \bibliographystyle{abbrv} +\bibliographystyle{plainnat} \bibliography{refs} \end{document} diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.dot b/docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.dot new file mode 100644 index 0000000..ec8cc00 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.dot @@ -0,0 +1,18 @@ +// dot -Tpng refs_types.dot > refs_types.png +digraph { + Papers[label="Papers\n(fatcat; incl. datasets)",shape=rect] + Books[label="Books\n(openlibrary)",shape=rect] + Encyclopedia[label="Encyclopedia\n(wikipedia)",shape=rect] + Web[label="Web\n(wayback)",shape=rect] + // OtherMedia[label="Other Media? 
TODO\n(archive.org)", color=orange] + Unmatched[label="Unknown/Unmatched", color=grey] + Papers -> Papers[color=green,label="1.3 billion",penwidth=2] + Papers -> Unmatched[color=orange,label="0.7 billion",penwidth=2] + Papers -> Books[color=green,label="20 million"] + Papers -> Web[color=orange,label="WIP / 25 million"] + Books -> Papers[color=red] + Books -> Web[color=red] + Books -> Books[color=red] + Encyclopedia -> Papers[color=green,label="1.3 million"] + Encyclopedia -> Books[color=gray,label="TODO"] +} diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.png b/docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.png new file mode 100644 index 0000000..d3cd278 Binary files /dev/null and b/docs/TR-20210808100000-IA-WDS-REFCAT/refs_types.png differ -- cgit v1.2.3