From bd66b58cded2c2c7e7b7e5d374434d6531dd70de Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 8 Aug 2021 15:18:29 +0200 Subject: docs: cleanup and naming --- docs/Simple/.gitignore | 5 - docs/Simple/LICENSE | 21 - docs/Simple/Makefile | 17 - docs/Simple/README.md | 2 - docs/Simple/figure.pdf | Bin 215353 -> 0 bytes docs/Simple/main.pdf | Bin 95636 -> 0 bytes docs/Simple/main.tex | 362 ----------------- docs/Simple/refs.bib | 228 ----------- docs/Simple/simpleConference.sty | 136 ------- docs/TR-20210730212057-IA-WDS-CG/.gitignore | 5 - docs/TR-20210730212057-IA-WDS-CG/Makefile | 9 - docs/TR-20210730212057-IA-WDS-CG/README.md | 49 --- docs/TR-20210730212057-IA-WDS-CG/arxiv.sty | 262 ------------ docs/TR-20210730212057-IA-WDS-CG/main.pdf | Bin 99346 -> 0 bytes docs/TR-20210730212057-IA-WDS-CG/main.tex | 442 --------------------- docs/TR-20210730212057-IA-WDS-CG/references.bib | 123 ------ docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore | 5 + docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE | 21 + docs/TR-20210808100000-IA-WDS-REFCAT/Makefile | 17 + docs/TR-20210808100000-IA-WDS-REFCAT/README.md | 2 + docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf | Bin 0 -> 215353 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf | Bin 0 -> 95636 bytes docs/TR-20210808100000-IA-WDS-REFCAT/main.tex | 362 +++++++++++++++++ docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib | 228 +++++++++++ .../simpleConference.sty | 136 +++++++ 25 files changed, 771 insertions(+), 1661 deletions(-) delete mode 100644 docs/Simple/.gitignore delete mode 100644 docs/Simple/LICENSE delete mode 100644 docs/Simple/Makefile delete mode 100644 docs/Simple/README.md delete mode 100644 docs/Simple/figure.pdf delete mode 100644 docs/Simple/main.pdf delete mode 100644 docs/Simple/main.tex delete mode 100644 docs/Simple/refs.bib delete mode 100644 docs/Simple/simpleConference.sty delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/.gitignore delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/Makefile delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/README.md delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/arxiv.sty delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/main.pdf delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/main.tex delete mode 100644 docs/TR-20210730212057-IA-WDS-CG/references.bib create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/Makefile create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/README.md create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/main.tex create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib create mode 100644 docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty diff --git a/docs/Simple/.gitignore b/docs/Simple/.gitignore deleted file mode 100644 index 5040d53..0000000 --- a/docs/Simple/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.log -*.aux -*.bbl -*.blg -*.out diff --git a/docs/Simple/LICENSE b/docs/Simple/LICENSE deleted file mode 100644 index 9f5c70f..0000000 --- a/docs/Simple/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2017 Ruoho Ruotsi - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to 
use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/docs/Simple/Makefile b/docs/Simple/Makefile deleted file mode 100644 index 11264f8..0000000 --- a/docs/Simple/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -main.pdf: main.tex - latexindent -w main.tex && rm -f main.bak* - pdflatex main.tex - bibtex main - pdflatex main.tex - pdflatex main.tex - - -.PHONY: clean -clean: - rm -f main.pdf - rm -f main.aux - rm -f main.log - rm -f main.bbl - rm -f main.blg - rm -f main.out - diff --git a/docs/Simple/README.md b/docs/Simple/README.md deleted file mode 100644 index 3a56517..0000000 --- a/docs/Simple/README.md +++ /dev/null @@ -1,2 +0,0 @@ -# latex-template-arxiv-preprint -A simple LaTeX template for Technical Reports, arXiv preprints & 2-column Conference papers diff --git a/docs/Simple/figure.pdf b/docs/Simple/figure.pdf deleted file mode 100644 index b21876a..0000000 Binary files a/docs/Simple/figure.pdf and /dev/null differ diff --git a/docs/Simple/main.pdf b/docs/Simple/main.pdf deleted file mode 100644 index 3b431cc..0000000 Binary files a/docs/Simple/main.pdf and /dev/null differ diff --git a/docs/Simple/main.tex b/docs/Simple/main.tex deleted file mode 100644 index e4febd9..0000000 --- a/docs/Simple/main.tex +++ /dev/null @@ -1,362 +0,0 @@ -\documentclass[hidelinks,10pt,twocolumn]{article} -\usepackage{simpleConference} -\usepackage[utf8]{inputenc} -\usepackage{times} -\usepackage{graphicx} -\usepackage{natbib} -\usepackage{doi} -\usepackage{amssymb} -\usepackage{url,hyperref} -\usepackage{booktabs} % professional-quality tables -\usepackage{amsfonts} % blackboard math symbols -\usepackage{nicefrac} % compact symbols for 1/2, etc. -\usepackage{caption} - -\usepackage{datetime} -\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1} -\setlength{\parindent}{0pt} - -\begin{document} - -\title{Fatcat Reference Dataset} - -\author{Martin Czygan \\ - \\ - Internet Archive \\ - San Francisco, California, USA \\ - martin@archive.org \\ - \and - Bryan Newbold \\ - \\ - Internet Archive \\ - San Francisco, California, USA \\ - bnewbold@archive.org \\ - \\ -} - - -\maketitle -\thispagestyle{empty} - - -\begin{abstract} - As part of its scholarly data efforts, the Internet Archive releases a first version of a citation - graph dataset, named \emph{refcat}, derived from scholarly publications and - additional data sources. It is composed of data gathered by the fatcat - cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale - crawls targeting primary and secondary scholarly outputs, as well as metadata - from the Open Library\footnote{\url{https://openlibrary.org}} project and - Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the - graph consists of 1,323,423,672 citations. 
We release this dataset under a CC0
-	Public Domain Dedication, accessible through an archive
-	item\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All
-	code used in the derivation process is released under an MIT
-	license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}.
-\end{abstract}
-
-\keywords{Citation Graph, Web Archiving}
-
-\section{Introduction}
-
-
-The Internet Archive releases a first version of a citation graph dataset
-derived from a raw corpus of about 2.5B references gathered from metadata and
-data obtained by PDF extraction tools such as
-GROBID\cite{lopez2009grobid}. Additionally, we consider integration with
-metadata from Open Library and Wikipedia.
-The goal of this report is to briefly describe the current contents and the
-derivation of the dataset. We expect
-this dataset to be iterated upon, with changes both in content and processing.
-
-Modern citation indexes can be traced back to the early computing age, when
-projects like the Science Citation Index (1955)\citep{garfield2007evolution}
-were first devised, living on in existing commercial knowledge bases today.
-Open alternatives such as the Open Citations Corpus (OCC) were started in 2010
-- the first version of which contained 6,325,178 individual
-references\citep{shotton2013publishing}. Other notable early projects
-include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last
-decade has seen the emergence of more openly available, large scale
-citation projects like Microsoft Academic\citep{sinha2015overview} or the
-Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021,
-according to \citep{hutchins2021tipping}, over 1B citations are publicly
-available, marking a tipping point for this category of data.
-
-\section{Related Work}
-
-There are a few large scale citation datasets available today. COCI, the
-``OpenCitations Index of Crossref open DOI-to-DOI citations'', was first
-released on 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
-2021-07-29, it contains
-1,094,394,688 citations across 65,835,422 bibliographic
-resources\citep{peroni2020opencitations}.
-
-The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
-``a Wikimedia initiative to develop open citations and linked bibliographic
-data to serve free knowledge'', continuously adds citations to its database and
-as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
-publications\footnote{\url{http://wikicite.org/statistics.html}}.
-
-Microsoft Academic Graph\citep{sinha2015overview} comprises a number of
-entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
-with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
-	\url{https://archive.org/details/mag-2021-06-07}} the
-\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
-bibliographic entities.
-
-Numerous other projects have been or are concerned with various aspects of
-citation discovery and curation as part of their feature set, among them Semantic
-Scholar\citep{fricke2018semantic}, CiteSeerX\citep{li2006citeseerx} and Aminer\citep{tang2016aminer}.
-
-As mentioned in \citep{hutchins2021tipping}, the number of openly available
-citations is not expected to shrink in the future.
-
-
-\section{Dataset}
-
-We release the first version of the \emph{refcat} dataset in a format used
-internally for storage and to serve queries (and which we call \emph{biblioref}
-or \emph{bref} for short). The dataset includes metadata from fatcat, the
-Open Library Project and inbound links from the English Wikipedia. The fatcat
-project itself aggregates data from a variety of open data sources, such as
-Crossref\citep{crossref}, PubMed\citep{canese2013pubmed},
-DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others,
-as well as metadata generated from analysis of data preserved at the Internet
-Archive and active crawls of publication sites on the web.
-
-The dataset is
-integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users
-to explore inbound and outbound references\cite{fatcatguidereferencegraph}.
-
-The format records source and target (fatcat release and work) identifiers, a
-few attributes from the metadata (such as year or release stage) as well as
-information about the match status and provenance.
-
-The dataset currently contains 1,323,423,672 citations across 76,327,662
-entities (55,123,635 unique source and 60,244,206 unique target work
-identifiers; for 1,303,424,212 - or 98.49\% of all citations - we have a DOI
-for both source and target).
-The majority of matches - 1,250,523,321 - are established through identifier
-based matching (DOI, PMID, PMCID, ARXIV, ISBN). 72,900,351 citations are
-established through fuzzy matching techniques.
-
-The majority of citations between \emph{refcat} and COCI overlap, as can be
-seen in~Table~\ref{table:cocicmp}.
-
-\begin{table}[]
-	\begin{center}
-		\begin{tabular}{ll}
-			\toprule
-			\bf{Set}              & \bf{Count}    \\
-
-			\midrule
-			COCI (C)              & 1,094,394,688 \\
-			\emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
-			C $\cap$ R            & 1,007,539,966 \\
-			C $\setminus$ R       & 86,854,309    \\
-			R $\setminus$ C       & 295,884,246
-		\end{tabular}
-		\vspace*{2mm}
-		\caption{Comparison between COCI and \emph{refcat-doi}, a subset of
-			\emph{refcat} where entities have a known DOI. At least 50\% of the
-			295,884,246 references only in \emph{refcat-doi} come from links
-			recorded within a specific dataset provider (GBIF, DOI prefix:
-			10.15468).}
-		\label{table:cocicmp}
-	\end{center}
-\end{table}
-
-% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
-% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
-% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
-
-
-\section{System Design}
-
-The constraints for the systems design are informed by the volume and the
-variety of the data. The capability to run the whole graph derivation on a
-single machine was a minor goal as well. In total, the raw inputs amount to a
-few terabytes of textual content, mostly newline delimited JSON. More
-importantly, while the number of data fields is low, certain schemas are very
-partial, with hundreds of different combinations of available field values found
-in the raw reference data.
-This is most likely caused by aggregators passing on reference data coming
-from hundreds of sources, which do not necessarily agree on a common
-granularity for citation data, and by artifacts of machine learning based
-structured data extraction tools.
-
-Each combination of fields may require a slightly different processing path.
-For example, references with an Arxiv identifier can be processed differently
-from references with only a title. Over 50\% of the raw reference data comes
-from a set of eight field combinations, as listed in
-Table~\ref{table:fields}.
-
-\begin{table}[]
-	\begin{center}
-		\begin{tabular}{ll}
-			\toprule
-			\bf{Fields} & \bf{Percentage} \\
-			\midrule
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
-			\multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
-			\multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
-			\multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
-		\end{tabular}
-		\vspace*{2mm}
-		\caption{Top 8 combinations of available fields in raw reference data
-			accounting for about 53\% of the total data (CN = container name, CRN =
-			contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
-			issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain
-			any value. Identifiers emphasized.}
-		\label{table:fields}
-	\end{center}
-\end{table}
-
-Overall, a map-reduce style\citep{dean2010mapreduce} approach is
-followed\footnote{While the operations are similar, the processing is not
-	distributed but runs on a single machine. For space efficiency,
-	zstd\citep{collet2018zstandard} is used to compress raw data and
-	derivations.}, which allows for some uniformity in the overall processing.
-We extract (key, document) tuples (as TSV) from the raw JSON data and sort by
-key. We then group documents with the same key and apply a function on each
-group in order to generate our target schema or perform additional operations
-such as deduplication or fusion of matched and unmatched references.
-
-The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
-based on a value normalization, like slugifying a title string. For identifier
-based matches we can generate the target schema directly. For fuzzy matching
-candidates, we pass possible match pairs through a verification procedure,
-which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a
-domain dependent, rule based verification, able to identify different versions
-of a publication, preprint-published pairs and documents which are similar by
-various metrics calculated over title and author fields. The fuzzy matching
-approach is applied to all reference documents without an identifier (a title
-is currently required).
-
-With a few schema conversions, fuzzy matching can be applied to Wikipedia
-articles and Open Library (edition) records as well.
-The aspects of precision and recall are represented by the two stages: we are
-generous in the match candidate generation phase in order to improve recall,
-but we are strict during verification, in order to control precision. Quality
-assurance for verification is implemented through a growing list of test cases
-of real examples from the catalog and their expected or desired match
-status\footnote{The list can be found under:
-	\url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
-	It is helpful to keep this test suite independent of any specific programming language.}.
-
-
-\section{Limitations and Future Work}
-
-As with other datasets in this field, we expect this dataset to be iterated upon.
-
-\begin{itemize}
-	\item The fatcat catalog updates its metadata
-	      continuously\footnote{A changelog can currently be followed here:
-		      \url{https://fatcat.wiki/changelog}} and web crawls are conducted
-	      regularly. Current processing pipelines cover raw reference snapshot
-	      creation and derivation of the graph structure, which allows processing
-	      to be rerun based on updated data as it becomes available.
-
-	\item Metadata extraction from PDFs depends on supervised machine learning
-	      models, which in turn depend on available training datasets. With
-	      additional crawls and metadata available we hope to improve models used
-	      for metadata extraction, improving yield and reducing data extraction
-	      artifacts in the process.
-
-	\item As of this version, a number of raw reference docs remain unmatched,
-	      which means that neither exact nor fuzzy matching has detected a link to
-	      a known entity. On the one hand, this can hint at missing metadata. On
-	      the other hand, parts of the data will contain a reference to a
-	      catalogued entity, but in a specific, dense and harder to recover form.
-	      Addressing these cases also includes improvements to the fuzzy matching
-	      approach.
-	\item The reference dataset contains millions of URLs and their integration
-	      into the graph has been implemented as a prototype. A full implementation
-	      requires a few data cleanup and normalization steps.
-\end{itemize}
-
-\section{Acknowledgements}
-
-This work is partially supported by a grant from the \emph{Andrew W. Mellon
-	Foundation}.
-
-
-\section{Appendix A}
-
-
-A note on data quality: while we implement various data quality measures,
-real-world data, especially data coming from many different sources, will
-contain issues. Among other measures, we keep track of match reasons,
-especially for fuzzy matching, to be able to zoom in on systematic errors
-more easily (see~Table~\ref{table:matches}).
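-
-For illustration, a breakdown similar to Table~\ref{table:matches} can be
-derived with a short aggregation over the released records. The following is
-a minimal sketch only: it assumes newline delimited JSON on standard input and
-the field names \texttt{match\_provenance}, \texttt{match\_status} and
-\texttt{match\_reason}, which may differ from the actual \emph{bref} schema.
-
-\begin{verbatim}
-#!/usr/bin/env python3
-# Sketch: count (provenance, status, reason) combinations in
-# newline delimited JSON records; field names are assumptions.
-import collections
-import fileinput
-import json
-
-counts = collections.Counter()
-
-for line in fileinput.input():
-    doc = json.loads(line)
-    key = (doc.get("match_provenance"),
-           doc.get("match_status"),
-           doc.get("match_reason"))
-    counts[key] += 1
-
-for (prov, status, reason), n in counts.most_common(25):
-    print(f"{n}\t{prov}\t{status}\t{reason}")
-\end{verbatim}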
- -\begin{table}[] - \footnotesize - \captionsetup{font=normalsize} - \begin{center} - \begin{tabular}{@{}rlll@{}} - \toprule - \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule - 934932865 & crossref & exact & doi \\ - 151366108 & fatcat-datacite & exact & doi \\ - 65345275 & fatcat-pubmed & exact & pmid \\ - 48778607 & fuzzy & strong & jaccardauthors \\ - 42465250 & grobid & exact & doi \\ - 29197902 & fatcat-pubmed & exact & doi \\ - 19996327 & fatcat-crossref & exact & doi \\ - 11996694 & fuzzy & strong & slugtitleauthormatch \\ - 9157498 & fuzzy & strong & tokenizedauthors \\ - 3547594 & grobid & exact & arxiv \\ - 2310025 & fuzzy & exact & titleauthormatch \\ - 1496515 & grobid & exact & pmid \\ - 680722 & crossref & strong & jaccardauthors \\ - 476331 & fuzzy & strong & versioneddoi \\ - 449271 & grobid & exact & isbn \\ - 230645 & fatcat-crossref & strong & jaccardauthors \\ - 190578 & grobid & strong & jaccardauthors \\ - 156657 & crossref & exact & isbn \\ - 123681 & fatcat-pubmed & strong & jaccardauthors \\ - 79328 & crossref & exact & arxiv \\ - 57414 & crossref & strong & tokenizedauthors \\ - 53480 & fuzzy & strong & pmiddoipair \\ - 52453 & fuzzy & strong & dataciterelatedid \\ - 47119 & grobid & strong & slugtitleauthormatch \\ - 36774 & fuzzy & strong & arxivversion \\ - % 35311 & fuzzy & strong & customieeearxiv \\ - % 33863 & grobid & exact & pmcid \\ - % 23504 & crossref & strong & slugtitleauthormatch \\ - % 22753 & fatcat-crossref & strong & tokenizedauthors \\ - % 17720 & grobid & exact & titleauthormatch \\ - % 14656 & crossref & exact & titleauthormatch \\ - % 14438 & grobid & strong & tokenizedauthors \\ - % 7682 & fatcat-crossref & exact & arxiv \\ - % 5972 & fatcat-crossref & exact & isbn \\ - % 5525 & fatcat-pubmed & exact & arxiv \\ - % 4290 & fatcat-pubmed & strong & tokenizedauthors \\ - % 2745 & fatcat-pubmed & exact & isbn \\ - % 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ - % 2273 & fatcat-crossref & strong & slugtitleauthormatch \\ - % 1960 & fuzzy & exact & workid \\ - % 1150 & fatcat-crossref & exact & titleauthormatch \\ - % 1041 & fatcat-pubmed & exact & titleauthormatch \\ - % 895 & fuzzy & strong & figshareversion \\ - % 317 & fuzzy & strong & titleartifact \\ - % 82 & grobid & strong & titleartifact \\ - % 33 & crossref & strong & titleartifact \\ - % 5 & fuzzy & strong & custombsiundated \\ - % 1 & fuzzy & strong & custombsisubdoc \\ - % 1 & fatcat & exact & doi \\ \bottomrule - \end{tabular} - \vspace*{2mm} - \caption{Table of match counts (top 25), reference provenance, match status and - match reason. 
The match reason identifier encode a specific rule in the domain - dependent verification process and are included for completeness - we do not - include the details of each rule in this report.} - \label{table:matches} - \end{center} -\end{table} - -\bibliographystyle{abbrv} -% \bibliographystyle{plainnat} -\bibliography{refs} -\end{document} diff --git a/docs/Simple/refs.bib b/docs/Simple/refs.bib deleted file mode 100644 index c61021e..0000000 --- a/docs/Simple/refs.bib +++ /dev/null @@ -1,228 +0,0 @@ -@inproceedings{kour2014real, - title={Real-time segmentation of on-line handwritten arabic script}, - author={Kour, George and Saabne, Raid}, - booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on}, - pages={417--422}, - year={2014}, - organization={IEEE} -} - -@inproceedings{kour2014fast, - title={Fast classification of handwritten on-line Arabic characters}, - author={Kour, George and Saabne, Raid}, - booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of}, - pages={312--318}, - year={2014}, - organization={IEEE}, - doi={10.1109/SOCPAR.2014.7008025} -} - -@article{hadash2018estimate, - title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications}, - author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon}, - journal={arXiv preprint arXiv:1804.09028}, - year={2018} -} - -@article{garfield1955citation, - title={Citation indexes for science}, - author={Garfield, Eugene}, - journal={Science}, - volume={122}, - number={3159}, - pages={108--111}, - year={1955}, - publisher={JSTOR} -} - -@inproceedings{lopez2009grobid, - title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications}, - author={Lopez, Patrice}, - booktitle={International conference on theory and practice of digital libraries}, - pages={473--474}, - year={2009}, - organization={Springer} -} - -@article{garfield2007evolution, - title={The evolution of the science citation index}, - author={Garfield, Eugene}, - journal={International microbiology}, - volume={10}, - number={1}, - pages={65}, - year={2007} -} - -@article{shotton2013publishing, - title={Publishing: open citations}, - author={Shotton, David}, - journal={Nature News}, - volume={502}, - number={7471}, - pages={295}, - year={2013} -} - -@misc{CitEc, - title = {Citations in Economics}, - howpublished = {\url{https://citec.repec.org/}}, - note = {Accessed: 2021-07-30} -} - -@inproceedings{wu2019citeseerx, - title={CiteSeerX: 20 years of service to scholarly big data}, - author={Wu, Jian and Kim, Kunho and Giles, C Lee}, - booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse}, - pages={1--4}, - year={2019} -} - -@inproceedings{li2006citeseerx, - title={CiteSeerx: an architecture and web service design for an academic document search engine}, - author={Li, Huajing and Councill, Isaac and Lee, Wang-Chien and Giles, C Lee}, - booktitle={Proceedings of the 15th international conference on World Wide Web}, - pages={883--884}, - year={2006} -} - - -@inproceedings{sinha2015overview, - title={An overview of microsoft academic service (mas) and applications}, - author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan}, - booktitle={Proceedings of the 24th international conference on world wide web}, - pages={243--246}, - year={2015} -} - 
-@misc{i4oc, - title = {Initiative for Open Citations}, -howpublished = {\url{https://i4oc.org/}}, -note = {Accessed: 2021-07-30} -} - -@misc{fatcatguidereferencegraph, -title = {The Fatcat Guide: Reference Graph (refcat)}, -howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}}, -note = {Accessed: 2021-08-08} -} - -@misc{crossref, -title = {Crossref}, -howpublished = {\url{https://crossref.org}}, -note = {Accessed: 2021-08-08} -} - -@misc{doaj, -title = {Directory of Open Access Journals}, -howpublished = {\url{https://doaj.org}}, -note = {Accessed: 2021-08-08} -} - -@inproceedings{ley2002dblp, - title={The DBLP computer science bibliography: Evolution, research issues, perspectives}, - author={Ley, Michael}, - booktitle={International symposium on string processing and information retrieval}, - pages={1--10}, - year={2002}, - organization={Springer} -} - - -@inproceedings{brase2009datacite, - title={DataCite-A global registration agency for research data}, - author={Brase, Jan}, - booktitle={2009 fourth international conference on cooperation and promotion of information resources in science and technology}, - pages={257--261}, - year={2009}, - organization={IEEE} -} - -@article{canese2013pubmed, - title={PubMed: the bibliographic database}, - author={Canese, Kathi and Weis, Sarah}, - journal={The NCBI Handbook}, - volume={2}, - pages={1}, - year={2013}, - publisher={National Center for Biotechnology Information (US)} -} - - -@article{shotton2018funders, - title={Funders should mandate open citations.}, - author={Shotton, David}, - journal={Nature}, - volume={553}, - number={7686}, - pages={129--130}, - year={2018}, - publisher={Nature Publishing Group} -} - -@article{hutchins2021tipping, - title={A tipping point for open citation data}, - author={Hutchins, B Ian}, - journal={Quantitative Science Studies}, - pages={1--5}, - year={2021} -} - -@article{silbert1970world, - title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)}, - author={Silbert, Jeffrey M}, - journal={Criminology}, - volume={8}, - pages={107}, - year={1970}, - publisher={HeinOnline} -} - -@article{peroni2020opencitations, - title={OpenCitations, an infrastructure organization for open scholarship}, - author={Peroni, Silvio and Shotton, David}, - journal={Quantitative Science Studies}, - volume={1}, - number={1}, - pages={428--444}, - year={2020}, - publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} -} - -@article{fricke2018semantic, - title={Semantic scholar}, - author={Fricke, Suzanne}, - journal={Journal of the Medical Library Association: JMLA}, - volume={106}, - number={1}, - pages={145}, - year={2018}, - publisher={Medical Library Association} -} - -@inproceedings{tang2016aminer, - title={AMiner: Toward understanding big scholar data}, - author={Tang, Jie}, - booktitle={Proceedings of the ninth ACM international conference on web search and data mining}, - pages={467--467}, - year={2016} -} - -@article{dean2010mapreduce, - title={MapReduce: a flexible data processing tool}, - author={Dean, Jeffrey and Ghemawat, Sanjay}, - journal={Communications of the ACM}, - volume={53}, - number={1}, - pages={72--77}, - year={2010}, - publisher={ACM New York, NY, USA} -} - -@article{collet2018zstandard, - title={Zstandard Compression and the application/zstd Media Type}, - author={Collet, Yann and Kucherawy, Murray}, - journal={RFC 8478}, - year={2018} -} - diff --git 
a/docs/Simple/simpleConference.sty b/docs/Simple/simpleConference.sty deleted file mode 100644 index d4d4764..0000000 --- a/docs/Simple/simpleConference.sty +++ /dev/null @@ -1,136 +0,0 @@ -% --------------------------------------------------------------- -% Style file for simple, two column conference papers. -% Based on latex8.sty by Paolo.Ienne@di.epfl.ch -% --------------------------------------------------------------- -% Use with LaTeX2e as: -% \documentclass[times,10pt,twocolumn]{article} -% \usepackage{simpleConference} -% \usepackage{times} -% --------------------------------------------------------------- -% specify references as -% \bibliographystyle{simpleConference} -% \bibliography{...your files...} -% -% use Section{} and SubSection{} instead of standard section{} -% and subsection{} to obtain headings in the form -% "1.3. My heading" -% --------------------------------------------------------------- -% ten point helvetica bold required for captions -% in some sites the name of the helvetica bold font may differ, -% change the name here: -\font\tenhv = phvb at 10pt - -% eleven point times bold required for second-order headings -\font\elvbf = ptmb scaled 1100 - -% set dimensions of columns, gap between columns, and paragraph indent -\setlength{\textheight}{8.875in} -\setlength{\textwidth}{6.875in} -\setlength{\columnsep}{0.3125in} -\setlength{\topmargin}{0in} -\setlength{\headheight}{0in} -\setlength{\headsep}{0in} -\setlength{\parindent}{1pc} -\setlength{\oddsidemargin}{-.304in} -\setlength{\evensidemargin}{-.304in} - -% memento from size10.clo -% \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} -% \small{\@setfontsize\small\@ixpt{11}} -% \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} -% \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} -% \tiny{\@setfontsize\tiny\@vpt\@vipt} -% \large{\@setfontsize\large\@xiipt{14}} -% \Large{\@setfontsize\Large\@xivpt{18}} -% \LARGE{\@setfontsize\LARGE\@xviipt{22}} -% \huge{\@setfontsize\huge\@xxpt{25}} -% \Huge{\@setfontsize\Huge\@xxvpt{30}} - -\def\@maketitle - { - \newpage - \null - \vskip .375in - \begin{center} - {\Large \bf \@title \par} - % additional two empty lines at the end of the title - \vspace*{24pt} - { - \large - \lineskip .5em - \begin{tabular}[t]{c} - \@author - \end{tabular} - \par - } - % additional small space at the end of the author name - \vskip .5em - { - \large - \begin{tabular}[t]{c} - \@affiliation - \end{tabular} - \par - \ifx \@empty \@email - \else - \begin{tabular}{r@{~}l} - E-mail: & {\tt \@email} - \end{tabular} - \par - \fi - } - % additional empty line at the end of the title block - \vspace*{12pt} - \end{center} - } - -\def\abstract - {% - \centerline{\large\bf Abstract}% - \vspace*{12pt}% -% \it% %%%% iroro - commenting out italicized abstract - } - -\def\endabstract - { - % additional empty line at the end of the abstract - \vspace*{12pt} - } - -\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} - -\def\email#1{\gdef\@email{#1}} -\gdef\@email{} - -\newlength{\@ctmp} -\newlength{\@figindent} -\setlength{\@figindent}{1pc} - -\long\def\@makecaption#1#2{ - \vskip 10pt - \setbox\@tempboxa\hbox{\tenhv\noindent #1.~#2} - \setlength{\@ctmp}{\hsize} - \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} - % IF longer than one indented paragraph line - \ifdim \wd\@tempboxa >\@ctmp - % THEN set as an indented paragraph - \begin{list}{}{\leftmargin\@figindent \rightmargin\leftmargin} - \item[]\tenhv #1.~#2\par - \end{list} - \else - % ELSE center - \hbox 
to\hsize{\hfil\box\@tempboxa\hfil} - \fi} - -% correct heading spacing and type -\def\section{\@startsection {section}{1}{\z@} - {14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}} -\def\subsection{\@startsection {subsection}{2}{\z@} - {13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}} - -% add the period after section numbers -\newcommand{\Section}[1]{\section{\hskip -1em.~#1}} -\newcommand{\SubSection}[1]{\subsection{\hskip -1em.~#1}} - -% end of file latex8.sty -% --------------------------------------------------------------- diff --git a/docs/TR-20210730212057-IA-WDS-CG/.gitignore b/docs/TR-20210730212057-IA-WDS-CG/.gitignore deleted file mode 100644 index 5040d53..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -*.log -*.aux -*.bbl -*.blg -*.out diff --git a/docs/TR-20210730212057-IA-WDS-CG/Makefile b/docs/TR-20210730212057-IA-WDS-CG/Makefile deleted file mode 100644 index 9996575..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -main.pdf: main.tex - pdflatex main.tex - bibtex main - pdflatex main.tex - - -.PHONY: clean -clean: - rm -f main.pdf diff --git a/docs/TR-20210730212057-IA-WDS-CG/README.md b/docs/TR-20210730212057-IA-WDS-CG/README.md deleted file mode 100644 index 54de590..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/README.md +++ /dev/null @@ -1,49 +0,0 @@ - -## Description: - -The project hosts an aesthetic and simple LaTeX style suitable for "preprint" publications such as arXiv and bio-arXiv, etc. -It is based on the [**nips_2018.sty**](https://media.nips.cc/Conferences/NIPS2018/Styles/nips_2018.sty) style. - -This styling maintains the esthetic of NIPS but adding and changing features to make it (IMO) even better and more suitable for preprints. -The result looks fairly different from NIPS style so that readers won't get confused to think that the preprint was published in NIPS. - -### Why NIPS? -Because the NIPS styling is a comfortable single column format that is very esthetic and convenient for reading. - -## Usage: -1. Use Document class **article**. -2. Copy **arxiv.sty** to the folder containing your tex file. -3. add `\usepackage{arxiv}` after `\documentclass{article}`. -4. The only packages used in the style file are **geometry** and **fancyheader**. Do not reimport them. - -See **template.tex** - -## Project files: -1. **arxiv.sty** - the style file. -2. **template.tex** - a sample template that uses the **arxiv style**. -3. **references.bib** - the bibliography source file for template.tex. -4. **template.pdf** - a sample output of the template file that demonstrated the design provided by the arxiv style. - - -## Handling References when submitting to arXiv.org -The most convenient way to manage references is using an external BibTeX file and pointing to it from the main file. -However, this requires running the [bibtex](http://www.bibtex.org/) tool to "compile" the `.bib` file and create `.bbl` file containing "bibitems" that can be directly inserted in the main tex file. -However, unfortunately the arXiv Tex environment ([Tex Live](https://www.tug.org/texlive/)) do not do that. -So easiest way when submitting to arXiv is to create a single self-contained .tex file that contains the references. -This can be done by running the BibTeX command on your machine and insert the content of the generated `.bbl` file into the `.tex` file and commenting out the `\bibliography{references}` that point to the external references file. 
- -Below are the commands that should be run in the project folder: -1. Run `$ latex template` -2. Run `$ bibtex template` -3. A `template.bbl` file will be generated (make sure it is there) -4. Copy the `template.bbl` file content to `template.tex` into the `\begin{thebibliography}` command. -5. Comment out the `\bibliography{references}` command in `template.tex`. -6. You ready to submit to arXiv.org. - - -## General Notes: -1. For help, comments, praises, bug reporting or change requests, you can contact the author at: kourgeorge/at/gmail.com. -2. You can use, redistribute and do whatever with this project, however, the author takes no responsibility on whatever usage of this project. -3. If you start another project based on this project, it would be nice to mention/link to this project. -4. You are very welcome to contribute to this project. -5. A good looking 2 column template can be found in https://github.com/brenhinkeller/preprint-template.tex. diff --git a/docs/TR-20210730212057-IA-WDS-CG/arxiv.sty b/docs/TR-20210730212057-IA-WDS-CG/arxiv.sty deleted file mode 100644 index ccb7feb..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/arxiv.sty +++ /dev/null @@ -1,262 +0,0 @@ -\NeedsTeXFormat{LaTeX2e} - -\ProcessOptions\relax - -% fonts -\renewcommand{\rmdefault}{ptm} -\renewcommand{\sfdefault}{phv} - -% set page geometry -\usepackage[verbose=true,letterpaper]{geometry} -\AtBeginDocument{ - \newgeometry{ - textheight=9in, - textwidth=6.5in, - top=1in, - headheight=14pt, - headsep=25pt, - footskip=30pt - } -} - -\widowpenalty=10000 -\clubpenalty=10000 -\flushbottom -\sloppy - - - -\newcommand{\headeright}{A Preprint} -\newcommand{\undertitle}{A Preprint} -\newcommand{\shorttitle}{\@title} - -\usepackage{fancyhdr} -\fancyhf{} -\pagestyle{fancy} -\renewcommand{\headrulewidth}{0.4pt} -\fancyheadoffset{0pt} -\rhead{\scshape \footnotesize \headeright} -\chead{\shorttitle} -\cfoot{\thepage} - - -%Handling Keywords -\def\keywordname{{\bfseries \emph{Keywords}}}% -\def\keywords#1{\par\addvspace\medskipamount{\rightskip=0pt plus1cm -\def\and{\ifhmode\unskip\nobreak\fi\ $\cdot$ -}\noindent\keywordname\enspace\ignorespaces#1\par}} - -% font sizes with reduced leading -\renewcommand{\normalsize}{% - \@setfontsize\normalsize\@xpt\@xipt - \abovedisplayskip 7\p@ \@plus 2\p@ \@minus 5\p@ - \abovedisplayshortskip \z@ \@plus 3\p@ - \belowdisplayskip \abovedisplayskip - \belowdisplayshortskip 4\p@ \@plus 3\p@ \@minus 3\p@ -} -\normalsize -\renewcommand{\small}{% - \@setfontsize\small\@ixpt\@xpt - \abovedisplayskip 6\p@ \@plus 1.5\p@ \@minus 4\p@ - \abovedisplayshortskip \z@ \@plus 2\p@ - \belowdisplayskip \abovedisplayskip - \belowdisplayshortskip 3\p@ \@plus 2\p@ \@minus 2\p@ -} -\renewcommand{\footnotesize}{\@setfontsize\footnotesize\@ixpt\@xpt} -\renewcommand{\scriptsize}{\@setfontsize\scriptsize\@viipt\@viiipt} -\renewcommand{\tiny}{\@setfontsize\tiny\@vipt\@viipt} -\renewcommand{\large}{\@setfontsize\large\@xiipt{14}} -\renewcommand{\Large}{\@setfontsize\Large\@xivpt{16}} -\renewcommand{\LARGE}{\@setfontsize\LARGE\@xviipt{20}} -\renewcommand{\huge}{\@setfontsize\huge\@xxpt{23}} -\renewcommand{\Huge}{\@setfontsize\Huge\@xxvpt{28}} - -% sections with less space -\providecommand{\section}{} -\renewcommand{\section}{% - \@startsection{section}{1}{\z@}% - {-2.0ex \@plus -0.5ex \@minus -0.2ex}% - { 1.5ex \@plus 0.3ex \@minus 0.2ex}% - {\large\bf\raggedright}% -} -\providecommand{\subsection}{} -\renewcommand{\subsection}{% - \@startsection{subsection}{2}{\z@}% - {-1.8ex \@plus -0.5ex \@minus -0.2ex}% - { 
0.8ex \@plus 0.2ex}% - {\normalsize\bf\raggedright}% -} -\providecommand{\subsubsection}{} -\renewcommand{\subsubsection}{% - \@startsection{subsubsection}{3}{\z@}% - {-1.5ex \@plus -0.5ex \@minus -0.2ex}% - { 0.5ex \@plus 0.2ex}% - {\normalsize\bf\raggedright}% -} -\providecommand{\paragraph}{} -\renewcommand{\paragraph}{% - \@startsection{paragraph}{4}{\z@}% - {1.5ex \@plus 0.5ex \@minus 0.2ex}% - {-1em}% - {\normalsize\bf}% -} -\providecommand{\subparagraph}{} -\renewcommand{\subparagraph}{% - \@startsection{subparagraph}{5}{\z@}% - {1.5ex \@plus 0.5ex \@minus 0.2ex}% - {-1em}% - {\normalsize\bf}% -} -\providecommand{\subsubsubsection}{} -\renewcommand{\subsubsubsection}{% - \vskip5pt{\noindent\normalsize\rm\raggedright}% -} - -% float placement -\renewcommand{\topfraction }{0.85} -\renewcommand{\bottomfraction }{0.4} -\renewcommand{\textfraction }{0.1} -\renewcommand{\floatpagefraction}{0.7} - -\newlength{\@abovecaptionskip}\setlength{\@abovecaptionskip}{7\p@} -\newlength{\@belowcaptionskip}\setlength{\@belowcaptionskip}{\z@} - -\setlength{\abovecaptionskip}{\@abovecaptionskip} -\setlength{\belowcaptionskip}{\@belowcaptionskip} - -% swap above/belowcaptionskip lengths for tables -\renewenvironment{table} - {\setlength{\abovecaptionskip}{\@belowcaptionskip}% - \setlength{\belowcaptionskip}{\@abovecaptionskip}% - \@float{table}} - {\end@float} - -% footnote formatting -\setlength{\footnotesep }{6.65\p@} -\setlength{\skip\footins}{9\p@ \@plus 4\p@ \@minus 2\p@} -\renewcommand{\footnoterule}{\kern-3\p@ \hrule width 12pc \kern 2.6\p@} -\setcounter{footnote}{0} - -% paragraph formatting -\setlength{\parindent}{\z@} -\setlength{\parskip }{5.5\p@} - -% list formatting -\setlength{\topsep }{4\p@ \@plus 1\p@ \@minus 2\p@} -\setlength{\partopsep }{1\p@ \@plus 0.5\p@ \@minus 0.5\p@} -\setlength{\itemsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} -\setlength{\parsep }{2\p@ \@plus 1\p@ \@minus 0.5\p@} -\setlength{\leftmargin }{3pc} -\setlength{\leftmargini }{\leftmargin} -\setlength{\leftmarginii }{2em} -\setlength{\leftmarginiii}{1.5em} -\setlength{\leftmarginiv }{1.0em} -\setlength{\leftmarginv }{0.5em} -\def\@listi {\leftmargin\leftmargini} -\def\@listii {\leftmargin\leftmarginii - \labelwidth\leftmarginii - \advance\labelwidth-\labelsep - \topsep 2\p@ \@plus 1\p@ \@minus 0.5\p@ - \parsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ - \itemsep \parsep} -\def\@listiii{\leftmargin\leftmarginiii - \labelwidth\leftmarginiii - \advance\labelwidth-\labelsep - \topsep 1\p@ \@plus 0.5\p@ \@minus 0.5\p@ - \parsep \z@ - \partopsep 0.5\p@ \@plus 0\p@ \@minus 0.5\p@ - \itemsep \topsep} -\def\@listiv {\leftmargin\leftmarginiv - \labelwidth\leftmarginiv - \advance\labelwidth-\labelsep} -\def\@listv {\leftmargin\leftmarginv - \labelwidth\leftmarginv - \advance\labelwidth-\labelsep} -\def\@listvi {\leftmargin\leftmarginvi - \labelwidth\leftmarginvi - \advance\labelwidth-\labelsep} - -% create title -\providecommand{\maketitle}{} -\renewcommand{\maketitle}{% - \par - \begingroup - \renewcommand{\thefootnote}{\fnsymbol{footnote}} - % for perfect author name centering - \renewcommand{\@makefnmark}{\hbox to \z@{$^{\@thefnmark}$\hss}} - % The footnote-mark was overlapping the footnote-text, - % added the following to fix this problem (MK) - \long\def\@makefntext##1{% - \parindent 1em\noindent - \hbox to 1.8em{\hss $\m@th ^{\@thefnmark}$}##1 - } - \thispagestyle{empty} - \@maketitle - \@thanks - %\@notice - \endgroup - \let\maketitle\relax - \let\thanks\relax -} - -% rules for title box at top of first page -\newcommand{\@toptitlebar}{ 
- \hrule height 2\p@ - \vskip 0.25in - \vskip -\parskip% -} -\newcommand{\@bottomtitlebar}{ - \vskip 0.29in - \vskip -\parskip - \hrule height 2\p@ - \vskip 0.09in% -} - -% create title (includes both anonymized and non-anonymized versions) -\providecommand{\@maketitle}{} -\renewcommand{\@maketitle}{% - \vbox{% - \hsize\textwidth - \linewidth\hsize - \vskip 0.1in - \@toptitlebar - \centering - {\LARGE\sc \@title\par} - \@bottomtitlebar - \textsc{\undertitle}\\ - \vskip 0.1in - \def\And{% - \end{tabular}\hfil\linebreak[0]\hfil% - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% - } - \def\AND{% - \end{tabular}\hfil\linebreak[4]\hfil% - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\ignorespaces% - } - \begin{tabular}[t]{c}\bf\rule{\z@}{24\p@}\@author\end{tabular}% - \vskip 0.4in \@minus 0.1in \center{\@date} \vskip 0.2in - } -} - -% add conference notice to bottom of first page -\newcommand{\ftype@noticebox}{8} -\newcommand{\@notice}{% - % give a bit of extra room back to authors on first page - \enlargethispage{2\baselineskip}% - \@float{noticebox}[b]% - \footnotesize\@noticestring% - \end@float% -} - -% abstract styling -\renewenvironment{abstract} -{ - \centerline - {\large \bfseries \scshape Abstract} - \begin{quote} -} -{ - \end{quote} -} - -\endinput diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.pdf b/docs/TR-20210730212057-IA-WDS-CG/main.pdf deleted file mode 100644 index c8bb5a3..0000000 Binary files a/docs/TR-20210730212057-IA-WDS-CG/main.pdf and /dev/null differ diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.tex b/docs/TR-20210730212057-IA-WDS-CG/main.tex deleted file mode 100644 index a7edac3..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/main.tex +++ /dev/null @@ -1,442 +0,0 @@ -\documentclass{article} - - - -\usepackage{arxiv} - -\usepackage[utf8]{inputenc} % allow utf-8 input -\usepackage[T1]{fontenc} % use 8-bit T1 fonts -\usepackage{hyperref} % hyperlinks -\usepackage{url} % simple URL typesetting -\usepackage{booktabs} % professional-quality tables -\usepackage{amsfonts} % blackboard math symbols -\usepackage{nicefrac} % compact symbols for 1/2, etc. 
-\usepackage{microtype} % microtypography -\usepackage{lipsum} % Can be removed after putting your text content -\usepackage{graphicx} -\usepackage{natbib} -\usepackage{doi} - -\title{Internet Archive Scholar Citation Graph Dataset} - -\date{August 10, 2021} % Here you can change the date presented in the paper title -%\date{} % Or removing it - -\author{ Martin Czygan \\ - Internet Archive\\ - San Francisco, CA 94118 \\ - \texttt{martin@archive.org} \\ - %% examples of more authors - \And - Bryan Newbold \\ - Internet Archive\\ - San Francisco, CA 94118 \\ - \texttt{bnewbold@archive.org} \\ - % \And - % Helge Holzmann \\ - % Internet Archive\\ - % San Francisco, CA 94118 \\ - % \texttt{helge@archive.org} \\ - % \And - % Jefferson Bailey \\ - % Internet Archive\\ - % San Francisco, CA 94118 \\ - % \texttt{jefferson@archive.org} \\ - %% \AND - %% Coauthor \\ - %% Affiliation \\ - %% Address \\ - %% \texttt{email} \\ - %% \And - %% Coauthor \\ - %% Affiliation \\ - %% Address \\ - %% \texttt{email} \\ - %% \And - %% Coauthor \\ - %% Affiliation \\ - %% Address \\ - %% \texttt{email} \\ -} - -% Uncomment to remove the date -%\date{} - -% Uncomment to override the `A preprint' in the header -\renewcommand{\headeright}{Technical Report} -\renewcommand{\undertitle}{Technical Report} -% \renewcommand{\shorttitle}{\textit{arXiv} Template} - -%%% Add PDF metadata to help others organize their library -%%% Once the PDF is generated, you can check the metadata with -%%% $ pdfinfo template.pdf -\hypersetup{ -pdftitle={Internet Archive Scholar Citation Graph Dataset}, -pdfsubject={cs.DL, cs.IR}, -pdfauthor={Martin Czygan, Bryan Newbold, Helge Holzmann, Jefferson Bailey}, -pdfkeywords={Web Archiving, Citation Graph}, -} - -\begin{document} -\maketitle - -\begin{abstract} -As part of its scholarly data efforts, the Internet Archive releases a citation -graph dataset derived from scholarly publications and additional data sources. It is -composed of data gathered by the \href{https://fatcat.wiki}{fatcat cataloging project} and related -web-scale crawls targeting primary and secondary scholarly outputs. In -addition, relations are worked out between scholarly publications, web pages -and their archived copies, books from the Open Library project as well as -Wikipedia articles. This first version of the graph consists of over X nodes -and over Y edges. We release this dataset under a Z open license under the -collection at \href{https://archive.org/details/TODO-citation\_graph}{https://archive.org/details/TODO-citation\_graph}, as well as all code -used for derivation under an MIT license. -\end{abstract} - - -% keywords can be removed -\keywords{Citation Graph \and Scholarly Communications \and Web Archiving} - - -\section{Introduction} - -The Internet Archive releases a first version of a citation graph dataset -derived from a raw corpus of about 2.5B references gathered from metadata and -from data obtained by PDF extraction tools such as GROBID\citep{lopez2009grobid}. -The goal of this report is to describe briefly the current contents and the -derivation of the Archive Scholar Citations Dataset (ASC). We expect -this dataset to be iterated upon, with changes both in content and processing. - -Modern citation indexes can be traced back to the early computing age, when -projects like the Science Citation Index (1955)\citep{garfield2007evolution} -were first devised, living on in existing commercial knowledge bases today. 
-Open alternatives were started such as the Open Citations Corpus (OCC) in 2010 -- the first version of which contained 6,325,178 individual -references\citep{shotton2013publishing}. Other notable sources from that time -include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last -decade has seen an increase of more openly available reference dataset and -citation projects, like Microsoft Academic\citep{sinha2015overview} and -Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021, -according to \citep{hutchins2021tipping} over 1B citations are publicly -available, marking a tipping point for open citations. - - - -\section{Citation Graph Contents} - - - -% * edges -% * edges exact -% * edges fuzzy -% * edges fuzzy reason (table) -% * number of source docs -% * number of target docs -% * refs to papers -% * refs to books -% * refs to web pages -% * refs to web pages that have been archived -% * refs to web pages that have been archived but not on liveweb any more -% -% Overlaps -% -% * how many edges can be found in COCI as well -% * how many edges can be found in MAG as well -% * how many unique to us edges -% -% Additional numbers -% -% * number of unparsed refs -% * "biblio" field distribution of unparted refs -% -% Potential routes -% -% * journal abbreviation parsing with suffix arrays -% * lookup by name, year and journal - - -\section{System Design} - -The constraints for the systems design are informed by the volume and the -variety of the data. In total, the raw inputs amount to a few TB of textual -content, mostly newline delimited JSON. More importantly, while the number of -data fields is low, certain schemas are very partial with hundreds of different -combinations of available field values found in the raw reference data. This is -most likely caused by aggregators passing on reference data coming from -hundreds of sources, each of which not necessarily agreeing on a common -granularity for citation data and from artifacts of machine learning based -structured data extraction tools. - -Each combination of fields may require a slightly different processing path. -For example, references with an Arxiv identifier can be processed differently -from references with only a title. Over 50\% of the raw reference data comes -from a set of eight field manifestations, as listed in -Table~\ref{table:fields}. - -\begin{table}[] - \begin{center} - \begin{tabular}{ll} -\toprule - \bf{Fields} & \bf{Share} \\ -\midrule - \multicolumn{1}{l}{CN|CRN|P|T|U|V|Y} & 14\% \\ - \multicolumn{1}{l}{DOI} & 14\% \\ - \multicolumn{1}{l}{CN|CRN|IS|P|T|U|V|Y} & 5\% \\ - \multicolumn{1}{l}{CN|CRN|DOI|U|V|Y} & 4\% \\ - \multicolumn{1}{l}{PMID|U} & 4\% \\ - \multicolumn{1}{l}{CN|CRN|DOI|T|V|Y} & 4\% \\ - \multicolumn{1}{l}{CN|CRN|Y} & 4\% \\ - \multicolumn{1}{l}{CN|CRN|DOI|V|Y} & 4\% \\ - \end{tabular} - \vspace*{2mm} - \caption{Top 8 combinations of available fields in raw reference data - accounting for about 53\% of the total data (CN = container name, CRN = -contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS = -issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any value.} - \label{table:fields} -\end{center} -\end{table} - -Overall, a map-reduce style approach is followed, which allows for some -uniformity in the overall processing. We extract (key, document) tuples (as -TSV) from the raw JSON data and sort by key. 
We then group documents with the same key and apply a function on each group
-in order to generate our target schema (currently named biblioref, or bref for
-short) or perform additional operations (such as deduplication).
-
-The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
-based on a normalization procedure, like a slugified title string. For
-identifier based matches we can generate the target biblioref schema directly.
-For fuzzy matching candidates, we pass possible match pairs through a
-verification procedure, which is implemented for release entity schema pairs.
-The current verification procedure is a domain dependent, rule based
-verification, able to identify different versions of a publication,
-preprint-published pairs or other kinds of similar documents by calculating
-similarity metrics across title and authors. The fuzzy matching approach is
-applied to all reference documents that have a title but no identifier.
-
-With a few schema conversions, fuzzy matching can be applied to Wikipedia
-articles and Open Library (edition) records as well. The aspects of precision
-and recall are represented by the two stages: we are generous in the match
-candidate generation phase in order to improve recall, but we are strict during
-verification, in order to control precision.
-
-\section{Fuzzy Matching Approach}
-
-% Take sample of 100 docs, report some precision, recall, F1 on a hand curated
-% small subset.
-
-The fuzzy matching approach currently implemented works in two phases: match
-candidate generation and verification. For candidate generation, we map each
-document to a key. We implemented a number of algorithms to form these
-clusters, e.g. title normalizations (including lowercasing, whitespace removal,
-unicode normalization and other measures) or transformations like
-NYSIIS\citep{silbert1970world}.
-
-The verification approach is based on a set of rules, which are tested
-sequentially, yielding a match signal from weak to exact. We use a suite of
-over 300 manually curated match examples\footnote{The table can be found here:
-\href{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}}
-as part of a unit test suite to allow for a controlled, continuous adjustment
-of the verification procedure. If the verification yields either an exact or
-strong signal, we consider it a match.
-
-We try to keep the processing steps performant to keep the overall derivation
-time limited. Map and reduce operations are parallelized and certain processing
-steps can process 100K documents per second or even more on commodity hardware
-with spinning disks.
-
-\section{Quality Assurance}
-
-Understanding data quality plays a role, as the data comes from a myriad of
-sources, each with possible idiosyncratic features or missing values. We employ
-a few QA measures during the process. First, we try to pass each data item
-through only one processing pipeline (e.g. items matched by any identifier
-should not even be considered for fuzzy matching). If duplicate links appear in
-the final dataset nonetheless, we remove them, preferring exact over fuzzy
-matches.
-
-We employ a couple of data cleaning techniques, e.g. to find and verify
-identifiers like ISBN or to sanitize URLs found in the data. Many of these
-artifacts stem from the fact that large chunks of the raw data come from
-heuristic data extraction from PDF documents.
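-
-As an example of such a cleaning step, the following is a minimal sketch of an
-ISBN-13 check digit verification for identifier candidates found in raw
-reference strings; it is illustrative only and not necessarily the exact
-procedure used in the pipeline.
-
-\begin{verbatim}
-#!/usr/bin/env python3
-# Sketch: verify an ISBN-13 candidate via its check digit; the
-# actual cleanup code in the pipeline may differ.
-import re
-
-def is_valid_isbn13(raw: str) -> bool:
-    digits = re.sub(r"[^0-9]", "", raw)
-    if len(digits) != 13 or not digits.startswith(("978", "979")):
-        return False
-    total = sum((1 if i % 2 == 0 else 3) * int(d)
-                for i, d in enumerate(digits))
-    return total % 10 == 0
-
-assert is_valid_isbn13("978-3-16-148410-0") is True
-assert is_valid_isbn13("978-3-16-148410-9") is False
-\end{verbatim}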
- - -\section{Discussion} - -% need to iterate - -%\lipsum[2] %\lipsum[3] - - -% \section{Headings: first level} % \label{sec:headings} -% -% \lipsum[4] See Section \ref{sec:headings}. -% -% \subsection{Headings: second level} -% \lipsum[5] -% \begin{equation} -% \xi _{ij}(t)=P(x_{t}=i,x_{t+1}=j|y,v,w;\theta)= {\frac {\alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}{\sum _{i=1}^{N} \sum _{j=1}^{N} \alpha _{i}(t)a^{w_t}_{ij}\beta _{j}(t+1)b^{v_{t+1}}_{j}(y_{t+1})}} -% \end{equation} -% -% \subsubsection{Headings: third level} -% \lipsum[6] -% -% \paragraph{Paragraph} -% \lipsum[7] -% -% -% -% \section{Examples of citations, figures, tables, references} -% \label{sec:others} -% -% \subsection{Citations} -% Citations use \verb+natbib+. The documentation may be found at -% \begin{center} -% \url{http://mirrors.ctan.org/macros/latex/contrib/natbib/natnotes.pdf} -% \end{center} -% -% Here is an example usage of the two main commands (\verb+citet+ and \verb+citep+): Some people thought a thing \citep{kour2014real, hadash2018estimate} but other people thought something else \citep{kour2014fast}. Many people have speculated that if we knew exactly why \citet{kour2014fast} thought this\dots -% -% \subsection{Figures} -% \lipsum[10] -% See Figure \ref{fig:fig1}. Here is how you add footnotes. \footnote{Sample of the first footnote.} -% \lipsum[11] -% -% \begin{figure} -% \centering -% \fbox{\rule[-.5cm]{4cm}{4cm} \rule[-.5cm]{4cm}{0cm}} -% \caption{Sample figure caption.} -% \label{fig:fig1} -% \end{figure} -% -% \subsection{Tables} -% See awesome Table~\ref{tab:table}. -% -% The documentation for \verb+booktabs+ (`Publication quality tables in LaTeX') is available from: -% \begin{center} -% \url{https://www.ctan.org/pkg/booktabs} -% \end{center} -% -% -% \begin{table} -% \caption{Sample table title} -% \centering -% \begin{tabular}{lll} -% \toprule -% \multicolumn{2}{c}{Part} \\ -% \cmidrule(r){1-2} -% Name & Description & Size ($\mu$m) \\ -% \midrule -% Dendrite & Input terminal & $\sim$100 \\ -% Axon & Output terminal & $\sim$10 \\ -% Soma & Cell body & up to $10^6$ \\ -% \bottomrule -% \end{tabular} -% \label{tab:table} -% \end{table} -% -% \subsection{Lists} -% \begin{itemize} -% \item Lorem ipsum dolor sit amet -% \item consectetur adipiscing elit. -% \item Aliquam dignissim blandit est, in dictum tortor gravida eget. In ac rutrum magna. -% \end{itemize} - - -\bibliographystyle{unsrtnat} -\bibliography{references} %%% Uncomment this line and comment out the ``thebibliography'' section below to use the external .bib file (using bibtex) . - - -%%% Uncomment this section and comment out the \bibliography{references} line above to use inline references. -% \begin{thebibliography}{1} - -% \bibitem{kour2014real} -% George Kour and Raid Saabne. -% \newblock Real-time segmentation of on-line handwritten arabic script. -% \newblock In {\em Frontiers in Handwriting Recognition (ICFHR), 2014 14th -% International Conference on}, pages 417--422. IEEE, 2014. - -% \bibitem{kour2014fast} -% George Kour and Raid Saabne. -% \newblock Fast classification of handwritten on-line arabic characters. -% \newblock In {\em Soft Computing and Pattern Recognition (SoCPaR), 2014 6th -% International Conference of}, pages 312--318. IEEE, 2014. - -% \bibitem{hadash2018estimate} -% Guy Hadash, Einat Kermany, Boaz Carmeli, Ofer Lavi, George Kour, and Alon -% Jacovi. -% \newblock Estimate and replace: A novel approach to integrating deep neural -% networks with existing applications. 
-% \newblock {\em arXiv preprint arXiv:1804.09028}, 2018. - -% \end{thebibliography} - -\section{Appendix} - -% Please add the following required packages to your document preamble: -\begin{table}[] - \begin{center} -\begin{tabular}{@{}rlll@{}} -\toprule -\textbf{Number of matches} & \textbf{Citation Provenance} & \textbf{Match Status} & \textbf{Match Reason} \\ \midrule -934932865 & crossref & exact & doi \\ -151366108 & fatcat-datacite & exact & doi \\ -65345275 & fatcat-pubmed & exact & pmid \\ -48778607 & fuzzy & strong & jaccardauthors \\ -42465250 & grobid & exact & doi \\ -29197902 & fatcat-pubmed & exact & doi \\ -19996327 & fatcat-crossref & exact & doi \\ -11996694 & fuzzy & strong & slugtitleauthormatch \\ -9157498 & fuzzy & strong & tokenizedauthors \\ -3547594 & grobid & exact & arxiv \\ -2310025 & fuzzy & exact & titleauthormatch \\ -1496515 & grobid & exact & pmid \\ -680722 & crossref & strong & jaccardauthors \\ -476331 & fuzzy & strong & versioneddoi \\ -449271 & grobid & exact & isbn \\ -230645 & fatcat-crossref & strong & jaccardauthors \\ -190578 & grobid & strong & jaccardauthors \\ -156657 & crossref & exact & isbn \\ -123681 & fatcat-pubmed & strong & jaccardauthors \\ -79328 & crossref & exact & arxiv \\ -57414 & crossref & strong & tokenizedauthors \\ -53480 & fuzzy & strong & pmiddoipair \\ -52453 & fuzzy & strong & dataciterelatedid \\ -47119 & grobid & strong & slugtitleauthormatch \\ -36774 & fuzzy & strong & arxivversion \\ -35311 & fuzzy & strong & customieeearxiv \\ -33863 & grobid & exact & pmcid \\ -23504 & crossref & strong & slugtitleauthormatch \\ -22753 & fatcat-crossref & strong & tokenizedauthors \\ -17720 & grobid & exact & titleauthormatch \\ -14656 & crossref & exact & titleauthormatch \\ -14438 & grobid & strong & tokenizedauthors \\ -7682 & fatcat-crossref & exact & arxiv \\ -5972 & fatcat-crossref & exact & isbn \\ -5525 & fatcat-pubmed & exact & arxiv \\ -4290 & fatcat-pubmed & strong & tokenizedauthors \\ -2745 & fatcat-pubmed & exact & isbn \\ -2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ -2273 & fatcat-crossref & strong & slugtitleauthormatch \\ -1960 & fuzzy & exact & workid \\ -1150 & fatcat-crossref & exact & titleauthormatch \\ -1041 & fatcat-pubmed & exact & titleauthormatch \\ -895 & fuzzy & strong & figshareversion \\ -317 & fuzzy & strong & titleartifact \\ -82 & grobid & strong & titleartifact \\ -33 & crossref & strong & titleartifact \\ -5 & fuzzy & strong & custombsiundated \\ -1 & fuzzy & strong & custombsisubdoc \\ -1 & fatcat & exact & doi \\ \bottomrule -\end{tabular} - \vspace*{2mm} - \caption{Table of match counts, reference provenance, match status and -match reason. 
The match reason identifier encode a specific rule in the domain -dependent verification process and are included for completeness - we do not -include the details of each rule in this report.} - \label{table:fields} -\end{center} -\end{table} - - -\end{document} diff --git a/docs/TR-20210730212057-IA-WDS-CG/references.bib b/docs/TR-20210730212057-IA-WDS-CG/references.bib deleted file mode 100644 index bcb8a16..0000000 --- a/docs/TR-20210730212057-IA-WDS-CG/references.bib +++ /dev/null @@ -1,123 +0,0 @@ -@inproceedings{kour2014real, - title={Real-time segmentation of on-line handwritten arabic script}, - author={Kour, George and Saabne, Raid}, - booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on}, - pages={417--422}, - year={2014}, - organization={IEEE} -} - -@inproceedings{kour2014fast, - title={Fast classification of handwritten on-line Arabic characters}, - author={Kour, George and Saabne, Raid}, - booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of}, - pages={312--318}, - year={2014}, - organization={IEEE}, - doi={10.1109/SOCPAR.2014.7008025} -} - -@article{hadash2018estimate, - title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications}, - author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon}, - journal={arXiv preprint arXiv:1804.09028}, - year={2018} -} - -@article{garfield1955citation, - title={Citation indexes for science}, - author={Garfield, Eugene}, - journal={Science}, - volume={122}, - number={3159}, - pages={108--111}, - year={1955}, - publisher={JSTOR} -} - -@inproceedings{lopez2009grobid, - title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications}, - author={Lopez, Patrice}, - booktitle={International conference on theory and practice of digital libraries}, - pages={473--474}, - year={2009}, - organization={Springer} -} - -@article{garfield2007evolution, - title={The evolution of the science citation index}, - author={Garfield, Eugene}, - journal={International microbiology}, - volume={10}, - number={1}, - pages={65}, - year={2007} -} - -@article{shotton2013publishing, - title={Publishing: open citations}, - author={Shotton, David}, - journal={Nature News}, - volume={502}, - number={7471}, - pages={295}, - year={2013} -} - -@misc{CitEc, - title = {Citations in Economics}, - howpublished = {\url{https://citec.repec.org/}}, - note = {Accessed: 2021-07-30} -} - -@inproceedings{wu2019citeseerx, - title={CiteSeerX: 20 years of service to scholarly big data}, - author={Wu, Jian and Kim, Kunho and Giles, C Lee}, - booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse}, - pages={1--4}, - year={2019} -} - -@inproceedings{sinha2015overview, - title={An overview of microsoft academic service (mas) and applications}, - author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan}, - booktitle={Proceedings of the 24th international conference on world wide web}, - pages={243--246}, - year={2015} -} - -@misc{i4oc, - title = {Initiative for Open Citations}, -howpublished = {\url{https://i4oc.org/}}, -note = {Accessed: 2021-07-30} -} - -@article{shotton2018funders, - title={Funders should mandate open citations.}, - author={Shotton, David}, - journal={Nature}, - volume={553}, - number={7686}, - pages={129--130}, - year={2018}, - publisher={Nature 
Publishing Group} -} - -@article{hutchins2021tipping, - title={A tipping point for open citation data}, - author={Hutchins, B Ian}, - journal={Quantitative Science Studies}, - pages={1--5}, - year={2021} -} - -@article{silbert1970world, - title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)}, - author={Silbert, Jeffrey M}, - journal={Criminology}, - volume={8}, - pages={107}, - year={1970}, - publisher={HeinOnline} -} - diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore b/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore new file mode 100644 index 0000000..5040d53 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/.gitignore @@ -0,0 +1,5 @@ +*.log +*.aux +*.bbl +*.blg +*.out diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE b/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE new file mode 100644 index 0000000..9f5c70f --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Ruoho Ruotsi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile b/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile new file mode 100644 index 0000000..11264f8 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/Makefile @@ -0,0 +1,17 @@ +main.pdf: main.tex + latexindent -w main.tex && rm -f main.bak* + pdflatex main.tex + bibtex main + pdflatex main.tex + pdflatex main.tex + + +.PHONY: clean +clean: + rm -f main.pdf + rm -f main.aux + rm -f main.log + rm -f main.bbl + rm -f main.blg + rm -f main.out + diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/README.md b/docs/TR-20210808100000-IA-WDS-REFCAT/README.md new file mode 100644 index 0000000..3a56517 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/README.md @@ -0,0 +1,2 @@ +# latex-template-arxiv-preprint +A simple LaTeX template for Technical Reports, arXiv preprints & 2-column Conference papers diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf new file mode 100644 index 0000000..b21876a Binary files /dev/null and b/docs/TR-20210808100000-IA-WDS-REFCAT/figure.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf new file mode 100644 index 0000000..3b431cc Binary files /dev/null and b/docs/TR-20210808100000-IA-WDS-REFCAT/main.pdf differ diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex new file mode 100644 index 0000000..e4febd9 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/main.tex @@ -0,0 +1,362 @@ +\documentclass[hidelinks,10pt,twocolumn]{article} +\usepackage{simpleConference} +\usepackage[utf8]{inputenc} +\usepackage{times} +\usepackage{graphicx} +\usepackage{natbib} +\usepackage{doi} +\usepackage{amssymb} +\usepackage{url,hyperref} +\usepackage{booktabs} % professional-quality tables +\usepackage{amsfonts} % blackboard math symbols +\usepackage{nicefrac} % compact symbols for 1/2, etc. +\usepackage{caption} + +\usepackage{datetime} +\providecommand{\keywords}[1]{\textbf{\textit{Index terms---}} #1} +\setlength{\parindent}{0pt} + +\begin{document} + +\title{Fatcat Reference Dataset} + +\author{Martin Czygan \\ + \\ + Internet Archive \\ + San Francisco, California, USA \\ + martin@archive.org \\ + \and + Bryan Newbold \\ + \\ + Internet Archive \\ + San Francisco, California, USA \\ + bnewbold@archive.org \\ + \\ +} + + +\maketitle +\thispagestyle{empty} + + +\begin{abstract} + As part of its scholarly data efforts, the Internet Archive releases a first version of a citation + graph dataset, named \emph{refcat}, derived from scholarly publications and + additional data sources. It is composed of data gathered by the fatcat + cataloging project\footnote{\url{https://fatcat.wiki}}, related web-scale + crawls targeting primary and secondary scholarly outputs, as well as metadata + from the Open Library\footnote{\url{https://openlibrary.org}} project and + Wikipedia\footnote{\url{https://wikipedia.org}}. This first version of the + graph consists of 1,323,423,672 citations. We release this dataset under a CC0 + Public Domain Dedication, accessible through an archive + item\footnote{\url{https://archive.org/details/refcat_2021-07-28}}. All + code used in the derivation process is released under an MIT + license\footnote{\url{https://gitlab.com/internetarchive/cgraph}}. 
+\end{abstract}
+
+\keywords{Citation Graph, Web Archiving}
+
+\section{Introduction}
+
+
+The Internet Archive releases a first version of a citation graph dataset
+derived from a raw corpus of about 2.5B references gathered from metadata and
+from data obtained by PDF extraction tools such as
+GROBID\citep{lopez2009grobid}. Additionally, we consider integration with
+metadata from Open Library and Wikipedia.
+The goal of this report is to briefly describe the current contents of the
+dataset and its derivation. We expect
+this dataset to be iterated upon, with changes both in content and processing.
+
+Modern citation indexes can be traced back to the early computing age, when
+projects like the Science Citation Index (1955)\citep{garfield2007evolution}
+were first devised, living on in existing commercial knowledge bases today.
+Open alternatives, such as the Open Citations Corpus (OCC) started in 2010,
+followed; the first version of the OCC contained 6,325,178 individual
+references\citep{shotton2013publishing}. Other notable early projects
+include CiteSeerX\citep{wu2019citeseerx} and CitEc\citep{CitEc}. The last
+decade has seen the emergence of more openly available, large scale
+citation projects like Microsoft Academic\citep{sinha2015overview} or the
+Initiative for Open Citations\citep{i4oc, shotton2018funders}. In 2021,
+according to \citep{hutchins2021tipping}, over 1B citations are publicly
+available, marking a tipping point for this category of data.
+
+\section{Related Work}
+
+There are a few large scale citation datasets available today. COCI, the
+``OpenCitations Index of Crossref open DOI-to-DOI citations'', was first
+released on 2018-07-29. As of its most recent release\footnote{\url{https://opencitations.net/download}}, on
+2021-07-29, it contains
+1,094,394,688 citations across 65,835,422 bibliographic
+resources\citep{peroni2020opencitations}.
+
+The WikiCite\footnote{\url{https://meta.wikimedia.org/wiki/WikiCite}} project,
+``a Wikimedia initiative to develop open citations and linked bibliographic
+data to serve free knowledge'', continuously adds citations to its database and
+as of 2021-06-28 tracks 253,719,394 citations across 39,994,937
+publications\footnote{\url{http://wikicite.org/statistics.html}}.
+
+Microsoft Academic Graph\citep{sinha2015overview} comprises a number of
+entities\footnote{\url{https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema}}
+with \emph{PaperReferences} being one relation among many others. As of 2021-06-07\footnote{A recent copy has been preserved at
+  \url{https://archive.org/details/mag-2021-06-07}} the
+\emph{PaperReferences} relation contains 1,832,226,781 rows (edges) across 123,923,466
+bibliographic entities.
+
+Numerous other projects have been or are concerned with various aspects of
+citation discovery and curation as part of their feature set, among them Semantic
+Scholar\citep{fricke2018semantic}, CiteSeerX\citep{li2006citeseerx} and AMiner\citep{tang2016aminer}.
+
+As mentioned in \citep{hutchins2021tipping}, the number of openly available
+citations is not expected to shrink in the future.
+
+
+\section{Dataset}
+
+We release the first version of the \emph{refcat} dataset in a format used
+internally for storage and to serve queries (and which we call \emph{biblioref}
+or \emph{bref} for short). The dataset includes metadata from fatcat, the
+Open Library Project and inbound links from the English Wikipedia.
+The fatcat
+project itself aggregates data from a variety of open data sources, such as
+Crossref\citep{crossref}, PubMed\citep{canese2013pubmed},
+DataCite\citep{brase2009datacite}, DOAJ\citep{doaj}, dblp\citep{ley2002dblp} and others,
+as well as metadata generated from analysis of data preserved at the Internet
+Archive and active crawls of publication sites on the web.
+
+The dataset is
+integrated into the \href{https://fatcat.wiki}{fatcat website} and allows users
+to explore inbound and outbound references\citep{fatcatguidereferencegraph}.
+
+The format records source and target (fatcat release and work) identifiers, a
+few attributes from the metadata (such as year or release stage) as well as
+information about the match status and provenance.
+
+The dataset currently contains 1,323,423,672 citations across 76,327,662
+entities (55,123,635 unique source and 60,244,206 unique target work
+identifiers; for 1,303,424,212 - or 98.49\% of all citations - we have a DOI
+for both source and target).
+The majority of matches - 1,250,523,321 - are established through identifier
+based matching (DOI, PMID, PMCID, arXiv, ISBN). 72,900,351 citations are
+established through fuzzy matching techniques.
+
+The majority of citations between \emph{refcat} and COCI overlap, as can be
+seen in~Table~\ref{table:cocicmp}.
+
+\begin{table}[]
+  \begin{center}
+    \begin{tabular}{ll}
+      \toprule
+      \bf{Set} & \bf{Count} \\
+      \midrule
+      COCI (C) & 1,094,394,688 \\
+      \emph{refcat-doi} (R) & 1,303,424,212 \\ % zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+      C $\cap$ R & 1,007,539,966 \\
+      C $\setminus$ R & 86,854,309 \\
+      R $\setminus$ C & 295,884,246
+    \end{tabular}
+    \vspace*{2mm}
+    \caption{Comparison between COCI and \emph{refcat-doi}, a subset of
+      \emph{refcat} where entities have a known DOI. At least 50\% of the
+      295,884,246 references only in \emph{refcat-doi} come from links
+      recorded within a specific dataset provider (GBIF, DOI prefix:
+      10.15468).}
+    \label{table:cocicmp}
+  \end{center}
+\end{table}
+
+% zstdcat -T0 /magna/refcat/2021-07-28/BrefDOITable/date-2021-07-28.tsv.zst | pv -l | LC_ALL=C sort -T /sandcrawler-db/tmp-refcat/ -S70% -k3,4 -u | zstd -c -T0 > uniq_34.tsv.zst
+% zstdcat -T0 uniq_34.tsv.zst | pv -l | LC_ALL=C cut -f3,4 | zstd -c -T0 > uniq_34_doi.tsv.zst
+% find . -name "*.csv" | parallel -j 16 "LC_ALL=C grep -v ^oci, {} | LC_ALL=C cut -d, -f2,3" | pv -l | zstd -c -T0 > ../6741422v10_doi_only.csv.zst
+
+
+\section{System Design}
+
+The constraints for the system design are informed by the volume and the
+variety of the data. The capability to run the whole graph derivation on a
+single machine was a minor goal as well. In total, the raw inputs amount to a
+few terabytes of textual content, mostly newline delimited JSON. More
+importantly, while the number of data fields is low, certain schemas are very
+partial, with hundreds of different combinations of available field values
+found in the raw reference data. This is most likely caused by aggregators
+passing on reference data coming from hundreds of sources, which do not
+necessarily agree on a common granularity for citation data, and by artifacts
+of machine learning based structured data extraction tools.
+
+Each combination of fields may require a slightly different processing path.
+For example, references with an arXiv identifier can be processed differently
+from references with only a title. Over 50\% of the raw reference data comes
+from a set of eight field combinations, as listed in
+Table~\ref{table:fields}.
+
+\begin{table}[]
+  \begin{center}
+    \begin{tabular}{ll}
+      \toprule
+      \bf{Fields} & \bf{Percentage} \\
+      \midrule
+      \multicolumn{1}{l}{CN $\cdot$ RN $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 14\% \\
+      \multicolumn{1}{l}{\textbf{DOI}} & 14\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ IS $\cdot$ P $\cdot$ T $\cdot$ U $\cdot$ V $\cdot$ Y} & 5\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ U $\cdot$ V $\cdot$ Y} & 4\% \\
+      \multicolumn{1}{l}{\textbf{PMID} $\cdot$ U} & 4\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ T $\cdot$ V $\cdot$ Y} & 4\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ Y} & 4\% \\
+      \multicolumn{1}{l}{CN $\cdot$ CRN $\cdot$ \textbf{DOI} $\cdot$ V $\cdot$ Y} & 4\% \\
+    \end{tabular}
+    \vspace*{2mm}
+    \caption{Top 8 combinations of available fields in raw reference data,
+      accounting for about 53\% of the total data (CN = container name, CRN =
+      contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+      issue, Y = year, DOI = doi, PMID = pmid). The unstructured field may
+      contain any value. Identifiers are emphasized.}
+    \label{table:fields}
+  \end{center}
+\end{table}
+
+Overall, a map-reduce style\citep{dean2010mapreduce} approach is
+followed\footnote{While the operations are similar, the processing is not
+  distributed but runs on a single machine. For space efficiency,
+  zstd\citep{collet2018zstandard} is used to compress raw data and
+  derivations.}, which allows for some uniformity in the overall processing.
+We extract (key, document) tuples (as
+TSV) from the raw JSON data and sort by key. We then group documents with the
+same key and apply a function on each group in order to generate
+our target schema or to perform
+additional operations such as deduplication or fusion of matched and unmatched references.
+
+The key derivation can be exact (via an identifier like DOI, PMID, etc.) or
+based on a value normalization, like slugifying a title string. For identifier
+based matches we can generate the target schema directly. For fuzzy matching
+candidates, we pass possible match pairs through a verification procedure,
+which is implemented for \emph{release entity}\footnote{\url{https://guide.fatcat.wiki/entity_release.html}.} pairs. This procedure is a
+domain dependent, rule based verification, able to identify different versions
+of a publication, preprint-published pairs and other documents that are
+similar according to various metrics calculated over title and author fields.
+The fuzzy matching approach is applied to all reference documents without an
+identifier (a title is currently required).
+
+With a few schema conversions, fuzzy matching can be applied to Wikipedia
+articles and Open Library (edition) records as well. The aspects of precision
+and recall are addressed by the two stages: we are generous in the match
+candidate generation phase in order to improve recall, but we are strict during
+verification, in order to control precision. Quality assurance for verification is
+implemented through a growing list of test cases, consisting of real examples from the catalog and
+their expected or desired match status\footnote{The list can be found under:
+  \url{https://gitlab.com/internetarchive/cgraph/-/blob/master/skate/testdata/verify.csv}.
+  It is helpful to keep this test suite independent of any specific programming language.}.
+
+
+\section{Limitations and Future Work}
+
+As with other datasets in this field, we expect this dataset to be iterated upon.
+
+\begin{itemize}
+  \item The fatcat catalog updates its metadata
+        continuously\footnote{A changelog can currently be followed here:
+          \url{https://fatcat.wiki/changelog}} and web crawls are conducted
+        regularly. Current processing pipelines cover raw reference snapshot
+        creation and derivation of the graph structure, which allows us to
+        rerun processing as updated data becomes available.
+
+  \item Metadata extraction from PDFs depends on supervised machine learning
+        models, which in turn depend on available training datasets. With additional crawls and
+        metadata available we hope to improve the models used for metadata
+        extraction, improving yield and reducing data extraction artifacts in
+        the process.
+
+  \item As of this version, a number of raw reference
+        docs remain unmatched, which means that neither exact nor fuzzy matching
+        has detected a link to a known entity. In part, this can hint at
+        missing metadata. However, parts of the data
+        will contain a reference to a catalogued entity, but in a specific,
+        dense and harder to recover form. Addressing these cases will also
+        require improvements to the fuzzy matching approach.
+  \item The reference dataset contains millions of URLs and their integration
+        into the graph has been implemented as a prototype. A full implementation
+        requires a few data cleanup and normalization steps.
+\end{itemize}
+
+\section{Acknowledgements}
+
+This work is partially supported by a grant from the \emph{Andrew W. Mellon
+  Foundation}.
+
+
+\section{Appendix A}
+
+
+A note on data quality: while we implement various data quality measures,
+real-world data, especially data coming from many different sources, will
+contain issues. Among other measures, we keep track of match reasons,
+especially for fuzzy matching, to be able to zoom in on systematic errors
+more easily (see~Table~\ref{table:matches}).
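+
+To make the notion of match status and match reason more concrete, the
+following sketch shows the general shape of such a rule cascade. It is a
+minimal, illustrative example for this report only; the function names, the
+threshold and the reason labels are simplified and do not mirror the actual
+implementation.
+
+\begin{verbatim}
+import re
+
+def slug(title):
+    # Lowercase and strip everything except letters and digits.
+    return re.sub(r"[^a-z0-9]", "", (title or "").lower())
+
+def tokens(name):
+    # Lowercased word tokens of an author string, punctuation dropped.
+    return set(re.findall(r"[a-z]+", (name or "").lower()))
+
+def jaccard(a, b):
+    return len(a & b) / len(a | b) if (a or b) else 0.0
+
+def verify(ref, release):
+    # Rules are tested in order; the first rule that fires
+    # determines the (status, reason) pair.
+    if ref.get("doi") and ref.get("doi") == release.get("doi"):
+        return "exact", "doi"
+    t_ref, t_rel = slug(ref.get("title")), slug(release.get("title"))
+    if t_ref and t_ref == t_rel:
+        overlap = jaccard(tokens(ref.get("authors")),
+                          tokens(release.get("authors")))
+        if overlap > 0.5:
+            return "strong", "jaccardauthors"
+        return "weak", "slugtitle"
+    return "none", "mismatch"
+
+# Same slugified title and overlapping author tokens
+# yields ("strong", "jaccardauthors").
+print(verify({"title": "A tipping point", "authors": "B Ian Hutchins"},
+             {"title": "A Tipping Point!", "authors": "Hutchins, B. Ian"}))
+\end{verbatim}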
+ +\begin{table}[] + \footnotesize + \captionsetup{font=normalsize} + \begin{center} + \begin{tabular}{@{}rlll@{}} + \toprule + \textbf{Count} & \textbf{Provenance} & \textbf{Status} & \textbf{Reason} \\ \midrule + 934932865 & crossref & exact & doi \\ + 151366108 & fatcat-datacite & exact & doi \\ + 65345275 & fatcat-pubmed & exact & pmid \\ + 48778607 & fuzzy & strong & jaccardauthors \\ + 42465250 & grobid & exact & doi \\ + 29197902 & fatcat-pubmed & exact & doi \\ + 19996327 & fatcat-crossref & exact & doi \\ + 11996694 & fuzzy & strong & slugtitleauthormatch \\ + 9157498 & fuzzy & strong & tokenizedauthors \\ + 3547594 & grobid & exact & arxiv \\ + 2310025 & fuzzy & exact & titleauthormatch \\ + 1496515 & grobid & exact & pmid \\ + 680722 & crossref & strong & jaccardauthors \\ + 476331 & fuzzy & strong & versioneddoi \\ + 449271 & grobid & exact & isbn \\ + 230645 & fatcat-crossref & strong & jaccardauthors \\ + 190578 & grobid & strong & jaccardauthors \\ + 156657 & crossref & exact & isbn \\ + 123681 & fatcat-pubmed & strong & jaccardauthors \\ + 79328 & crossref & exact & arxiv \\ + 57414 & crossref & strong & tokenizedauthors \\ + 53480 & fuzzy & strong & pmiddoipair \\ + 52453 & fuzzy & strong & dataciterelatedid \\ + 47119 & grobid & strong & slugtitleauthormatch \\ + 36774 & fuzzy & strong & arxivversion \\ + % 35311 & fuzzy & strong & customieeearxiv \\ + % 33863 & grobid & exact & pmcid \\ + % 23504 & crossref & strong & slugtitleauthormatch \\ + % 22753 & fatcat-crossref & strong & tokenizedauthors \\ + % 17720 & grobid & exact & titleauthormatch \\ + % 14656 & crossref & exact & titleauthormatch \\ + % 14438 & grobid & strong & tokenizedauthors \\ + % 7682 & fatcat-crossref & exact & arxiv \\ + % 5972 & fatcat-crossref & exact & isbn \\ + % 5525 & fatcat-pubmed & exact & arxiv \\ + % 4290 & fatcat-pubmed & strong & tokenizedauthors \\ + % 2745 & fatcat-pubmed & exact & isbn \\ + % 2342 & fatcat-pubmed & strong & slugtitleauthormatch \\ + % 2273 & fatcat-crossref & strong & slugtitleauthormatch \\ + % 1960 & fuzzy & exact & workid \\ + % 1150 & fatcat-crossref & exact & titleauthormatch \\ + % 1041 & fatcat-pubmed & exact & titleauthormatch \\ + % 895 & fuzzy & strong & figshareversion \\ + % 317 & fuzzy & strong & titleartifact \\ + % 82 & grobid & strong & titleartifact \\ + % 33 & crossref & strong & titleartifact \\ + % 5 & fuzzy & strong & custombsiundated \\ + % 1 & fuzzy & strong & custombsisubdoc \\ + % 1 & fatcat & exact & doi \\ \bottomrule + \end{tabular} + \vspace*{2mm} + \caption{Table of match counts (top 25), reference provenance, match status and + match reason. 
The match reason identifier encode a specific rule in the domain + dependent verification process and are included for completeness - we do not + include the details of each rule in this report.} + \label{table:matches} + \end{center} +\end{table} + +\bibliographystyle{abbrv} +% \bibliographystyle{plainnat} +\bibliography{refs} +\end{document} diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib new file mode 100644 index 0000000..c61021e --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/refs.bib @@ -0,0 +1,228 @@ +@inproceedings{kour2014real, + title={Real-time segmentation of on-line handwritten arabic script}, + author={Kour, George and Saabne, Raid}, + booktitle={Frontiers in Handwriting Recognition (ICFHR), 2014 14th International Conference on}, + pages={417--422}, + year={2014}, + organization={IEEE} +} + +@inproceedings{kour2014fast, + title={Fast classification of handwritten on-line Arabic characters}, + author={Kour, George and Saabne, Raid}, + booktitle={Soft Computing and Pattern Recognition (SoCPaR), 2014 6th International Conference of}, + pages={312--318}, + year={2014}, + organization={IEEE}, + doi={10.1109/SOCPAR.2014.7008025} +} + +@article{hadash2018estimate, + title={Estimate and Replace: A Novel Approach to Integrating Deep Neural Networks with Existing Applications}, + author={Hadash, Guy and Kermany, Einat and Carmeli, Boaz and Lavi, Ofer and Kour, George and Jacovi, Alon}, + journal={arXiv preprint arXiv:1804.09028}, + year={2018} +} + +@article{garfield1955citation, + title={Citation indexes for science}, + author={Garfield, Eugene}, + journal={Science}, + volume={122}, + number={3159}, + pages={108--111}, + year={1955}, + publisher={JSTOR} +} + +@inproceedings{lopez2009grobid, + title={GROBID: Combining automatic bibliographic data recognition and term extraction for scholarship publications}, + author={Lopez, Patrice}, + booktitle={International conference on theory and practice of digital libraries}, + pages={473--474}, + year={2009}, + organization={Springer} +} + +@article{garfield2007evolution, + title={The evolution of the science citation index}, + author={Garfield, Eugene}, + journal={International microbiology}, + volume={10}, + number={1}, + pages={65}, + year={2007} +} + +@article{shotton2013publishing, + title={Publishing: open citations}, + author={Shotton, David}, + journal={Nature News}, + volume={502}, + number={7471}, + pages={295}, + year={2013} +} + +@misc{CitEc, + title = {Citations in Economics}, + howpublished = {\url{https://citec.repec.org/}}, + note = {Accessed: 2021-07-30} +} + +@inproceedings{wu2019citeseerx, + title={CiteSeerX: 20 years of service to scholarly big data}, + author={Wu, Jian and Kim, Kunho and Giles, C Lee}, + booktitle={Proceedings of the Conference on Artificial Intelligence for Data Discovery and Reuse}, + pages={1--4}, + year={2019} +} + +@inproceedings{li2006citeseerx, + title={CiteSeerx: an architecture and web service design for an academic document search engine}, + author={Li, Huajing and Councill, Isaac and Lee, Wang-Chien and Giles, C Lee}, + booktitle={Proceedings of the 15th international conference on World Wide Web}, + pages={883--884}, + year={2006} +} + + +@inproceedings{sinha2015overview, + title={An overview of microsoft academic service (mas) and applications}, + author={Sinha, Arnab and Shen, Zhihong and Song, Yang and Ma, Hao and Eide, Darrin and Hsu, Bo-June and Wang, Kuansan}, + booktitle={Proceedings of the 24th international conference 
on world wide web}, + pages={243--246}, + year={2015} +} + +@misc{i4oc, + title = {Initiative for Open Citations}, +howpublished = {\url{https://i4oc.org/}}, +note = {Accessed: 2021-07-30} +} + +@misc{fatcatguidereferencegraph, +title = {The Fatcat Guide: Reference Graph (refcat)}, +howpublished = {\url{https://guide.fatcat.wiki/reference_graph.html}}, +note = {Accessed: 2021-08-08} +} + +@misc{crossref, +title = {Crossref}, +howpublished = {\url{https://crossref.org}}, +note = {Accessed: 2021-08-08} +} + +@misc{doaj, +title = {Directory of Open Access Journals}, +howpublished = {\url{https://doaj.org}}, +note = {Accessed: 2021-08-08} +} + +@inproceedings{ley2002dblp, + title={The DBLP computer science bibliography: Evolution, research issues, perspectives}, + author={Ley, Michael}, + booktitle={International symposium on string processing and information retrieval}, + pages={1--10}, + year={2002}, + organization={Springer} +} + + +@inproceedings{brase2009datacite, + title={DataCite-A global registration agency for research data}, + author={Brase, Jan}, + booktitle={2009 fourth international conference on cooperation and promotion of information resources in science and technology}, + pages={257--261}, + year={2009}, + organization={IEEE} +} + +@article{canese2013pubmed, + title={PubMed: the bibliographic database}, + author={Canese, Kathi and Weis, Sarah}, + journal={The NCBI Handbook}, + volume={2}, + pages={1}, + year={2013}, + publisher={National Center for Biotechnology Information (US)} +} + + +@article{shotton2018funders, + title={Funders should mandate open citations.}, + author={Shotton, David}, + journal={Nature}, + volume={553}, + number={7686}, + pages={129--130}, + year={2018}, + publisher={Nature Publishing Group} +} + +@article{hutchins2021tipping, + title={A tipping point for open citation data}, + author={Hutchins, B Ian}, + journal={Quantitative Science Studies}, + pages={1--5}, + year={2021} +} + +@article{silbert1970world, + title={The World's First Computerized Criminal-Justice Information-Sharing System-The New York State Identification and Intelligence System (NYSIIS)}, + author={Silbert, Jeffrey M}, + journal={Criminology}, + volume={8}, + pages={107}, + year={1970}, + publisher={HeinOnline} +} + +@article{peroni2020opencitations, + title={OpenCitations, an infrastructure organization for open scholarship}, + author={Peroni, Silvio and Shotton, David}, + journal={Quantitative Science Studies}, + volume={1}, + number={1}, + pages={428--444}, + year={2020}, + publisher={MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~…} +} + +@article{fricke2018semantic, + title={Semantic scholar}, + author={Fricke, Suzanne}, + journal={Journal of the Medical Library Association: JMLA}, + volume={106}, + number={1}, + pages={145}, + year={2018}, + publisher={Medical Library Association} +} + +@inproceedings{tang2016aminer, + title={AMiner: Toward understanding big scholar data}, + author={Tang, Jie}, + booktitle={Proceedings of the ninth ACM international conference on web search and data mining}, + pages={467--467}, + year={2016} +} + +@article{dean2010mapreduce, + title={MapReduce: a flexible data processing tool}, + author={Dean, Jeffrey and Ghemawat, Sanjay}, + journal={Communications of the ACM}, + volume={53}, + number={1}, + pages={72--77}, + year={2010}, + publisher={ACM New York, NY, USA} +} + +@article{collet2018zstandard, + title={Zstandard Compression and the application/zstd Media Type}, + author={Collet, Yann and Kucherawy, Murray}, + 
journal={RFC 8478}, + year={2018} +} + diff --git a/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty b/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty new file mode 100644 index 0000000..d4d4764 --- /dev/null +++ b/docs/TR-20210808100000-IA-WDS-REFCAT/simpleConference.sty @@ -0,0 +1,136 @@ +% --------------------------------------------------------------- +% Style file for simple, two column conference papers. +% Based on latex8.sty by Paolo.Ienne@di.epfl.ch +% --------------------------------------------------------------- +% Use with LaTeX2e as: +% \documentclass[times,10pt,twocolumn]{article} +% \usepackage{simpleConference} +% \usepackage{times} +% --------------------------------------------------------------- +% specify references as +% \bibliographystyle{simpleConference} +% \bibliography{...your files...} +% +% use Section{} and SubSection{} instead of standard section{} +% and subsection{} to obtain headings in the form +% "1.3. My heading" +% --------------------------------------------------------------- +% ten point helvetica bold required for captions +% in some sites the name of the helvetica bold font may differ, +% change the name here: +\font\tenhv = phvb at 10pt + +% eleven point times bold required for second-order headings +\font\elvbf = ptmb scaled 1100 + +% set dimensions of columns, gap between columns, and paragraph indent +\setlength{\textheight}{8.875in} +\setlength{\textwidth}{6.875in} +\setlength{\columnsep}{0.3125in} +\setlength{\topmargin}{0in} +\setlength{\headheight}{0in} +\setlength{\headsep}{0in} +\setlength{\parindent}{1pc} +\setlength{\oddsidemargin}{-.304in} +\setlength{\evensidemargin}{-.304in} + +% memento from size10.clo +% \normalsize{\@setfontsize\normalsize\@xpt\@xiipt} +% \small{\@setfontsize\small\@ixpt{11}} +% \footnotesize{\@setfontsize\footnotesize\@viiipt{9.5}} +% \scriptsize{\@setfontsize\scriptsize\@viipt\@viiipt} +% \tiny{\@setfontsize\tiny\@vpt\@vipt} +% \large{\@setfontsize\large\@xiipt{14}} +% \Large{\@setfontsize\Large\@xivpt{18}} +% \LARGE{\@setfontsize\LARGE\@xviipt{22}} +% \huge{\@setfontsize\huge\@xxpt{25}} +% \Huge{\@setfontsize\Huge\@xxvpt{30}} + +\def\@maketitle + { + \newpage + \null + \vskip .375in + \begin{center} + {\Large \bf \@title \par} + % additional two empty lines at the end of the title + \vspace*{24pt} + { + \large + \lineskip .5em + \begin{tabular}[t]{c} + \@author + \end{tabular} + \par + } + % additional small space at the end of the author name + \vskip .5em + { + \large + \begin{tabular}[t]{c} + \@affiliation + \end{tabular} + \par + \ifx \@empty \@email + \else + \begin{tabular}{r@{~}l} + E-mail: & {\tt \@email} + \end{tabular} + \par + \fi + } + % additional empty line at the end of the title block + \vspace*{12pt} + \end{center} + } + +\def\abstract + {% + \centerline{\large\bf Abstract}% + \vspace*{12pt}% +% \it% %%%% iroro - commenting out italicized abstract + } + +\def\endabstract + { + % additional empty line at the end of the abstract + \vspace*{12pt} + } + +\def\affiliation#1{\gdef\@affiliation{#1}} \gdef\@affiliation{} + +\def\email#1{\gdef\@email{#1}} +\gdef\@email{} + +\newlength{\@ctmp} +\newlength{\@figindent} +\setlength{\@figindent}{1pc} + +\long\def\@makecaption#1#2{ + \vskip 10pt + \setbox\@tempboxa\hbox{\tenhv\noindent #1.~#2} + \setlength{\@ctmp}{\hsize} + \addtolength{\@ctmp}{-\@figindent}\addtolength{\@ctmp}{-\@figindent} + % IF longer than one indented paragraph line + \ifdim \wd\@tempboxa >\@ctmp + % THEN set as an indented paragraph + 
\begin{list}{}{\leftmargin\@figindent \rightmargin\leftmargin} + \item[]\tenhv #1.~#2\par + \end{list} + \else + % ELSE center + \hbox to\hsize{\hfil\box\@tempboxa\hfil} + \fi} + +% correct heading spacing and type +\def\section{\@startsection {section}{1}{\z@} + {14pt plus 2pt minus 2pt}{14pt plus 2pt minus 2pt} {\large\bf}} +\def\subsection{\@startsection {subsection}{2}{\z@} + {13pt plus 2pt minus 2pt}{13pt plus 2pt minus 2pt} {\elvbf}} + +% add the period after section numbers +\newcommand{\Section}[1]{\section{\hskip -1em.~#1}} +\newcommand{\SubSection}[1]{\subsection{\hskip -1em.~#1}} + +% end of file latex8.sty +% --------------------------------------------------------------- -- cgit v1.2.3