From a23c867858792d84747e2fd9f93a0cd13251b40f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 10 Sep 2021 09:47:51 +0200 Subject: docs: output structure table stub --- docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 103256 -> 103881 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 28 +++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'docs') diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf index 0ba95d9..e1f367a 100644 Binary files a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex index ae93e47..b729d48 100644 --- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -215,6 +215,32 @@ in~Table~\ref{table:cocicmp}. % TODO: some more numbers on the structure +* doi-to-doi +* only source doi +* only target doi +* paper-to-book (OL) +* wikipedia-to-paper (WI) + +\begin{table}[] + \begin{center} + \begin{tabular}{ll} + \toprule + \bf{Class} & \bf{Count} \\ + \midrule + total & \\ + doi-doi & \\ + source-only doi & \\ + target-only doi & \\ + edge w/o doi & \\ + target-open-library & \\ + source-wikipedia & \\ + \end{tabular} + \vspace*{2mm} + \caption{Output structure, e.g. edges between documents that both have a doi (doi-doi).} + \label{table:structure} + \end{center} +\end{table} + \section{System Design} \subsection{Constraints} @@ -268,7 +294,7 @@ harvests and imports web accessible sources such as Crossref, Pubmed, Arxiv, Datacite, DOAJ, dblp and others into its catalog (as the source permits, data is processed continuously or in batches). 
Reference data from PDF documents has been extracted with GROBID\footnote{GROBID -\href{https://github.com/kermitt2/grobid/releases/tag/0.5.5}{v0.5.5}}, with the + \href{https://github.com/kermitt2/grobid/releases/tag/0.5.5}{v0.5.5}}, with the TEI-XML results being cached locally in a key-value store accessible with an S3 API. Archived PDF documents result from dedicated web-scale crawls of scholarly domains conducted with -- cgit v1.2.3