aboutsummaryrefslogtreecommitdiffstats
path: root/docs/TR-20210808100000-IA-WDS-REFCAT
diff options
context:
space:
mode:
Diffstat (limited to 'docs/TR-20210808100000-IA-WDS-REFCAT')
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.pdfbin103256 -> 103881 bytes
-rw-r--r--docs/TR-20210808100000-IA-WDS-REFCAT/main.tex28
2 files changed, 27 insertions, 1 deletions
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
index 0ba95d9..e1f367a 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf
Binary files differ
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
index ae93e47..b729d48 100644
--- a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
+++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex
@@ -215,6 +215,32 @@ in~Table~\ref{table:cocicmp}.
% TODO: some more numbers on the structure
+* doi-to-doi
+* only source doi
+* only target doi
+* paper-to-book (OL)
+* wikipedia-to-paper (WI)
+
+\begin{table}[]
+ \begin{center}
+ \begin{tabular}{ll}
+ \toprule
+ \textbf{Class} & \textbf{Count} \\
+ \midrule
+ total & \\
+ doi-doi & \\
+ source-only doi & \\
+ target-only doi & \\
+ edge w/o doi & \\
+ target-open-library & \\
+ source-wikipedia & \\
+ \end{tabular}
+ \vspace*{2mm}
+ \caption{Output structure, e.g.\ edges between documents that both have a doi (doi-doi).}
+ \label{table:structure}
+ \end{center}
+\end{table}
+
\section{System Design}
\subsection{Constraints}
@@ -268,7 +294,7 @@ harvests and imports web accessible sources such as Crossref, Pubmed, Arxiv,
Datacite, DOAJ, dblp and others into its catalog (as the source permits, data
is processed continuously or in batches). Reference data from PDF documents has
been extracted with GROBID\footnote{GROBID
-\href{https://github.com/kermitt2/grobid/releases/tag/0.5.5}{v0.5.5}}, with the
+ \href{https://github.com/kermitt2/grobid/releases/tag/0.5.5}{v0.5.5}}, with the
TEI-XML results being cached locally in a key-value store accessible with an S3
API. Archived PDF documents result from dedicated web-scale crawls of scholarly
domains conducted with