From 613bdec83b2344d57e4727602b72fb9ecd740bb6 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Mon, 2 Aug 2021 20:03:29 +0200
Subject: wip: tr

---
 docs/TR-20210730212057-IA-WDS-CG/main.pdf       | Bin 93022 -> 98138 bytes
 docs/TR-20210730212057-IA-WDS-CG/main.tex       | 192 +++++++++++++++++++-----
 docs/TR-20210730212057-IA-WDS-CG/references.bib |   9 ++
 3 files changed, 167 insertions(+), 34 deletions(-)

diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.pdf b/docs/TR-20210730212057-IA-WDS-CG/main.pdf
index b0d166d..ddff7fe 100644
Binary files a/docs/TR-20210730212057-IA-WDS-CG/main.pdf and b/docs/TR-20210730212057-IA-WDS-CG/main.pdf differ
diff --git a/docs/TR-20210730212057-IA-WDS-CG/main.tex b/docs/TR-20210730212057-IA-WDS-CG/main.tex
index f08cad0..faeab73 100644
--- a/docs/TR-20210730212057-IA-WDS-CG/main.tex
+++ b/docs/TR-20210730212057-IA-WDS-CG/main.tex
@@ -17,8 +17,6 @@
 \usepackage{natbib}
 \usepackage{doi}
 
-
-
 \title{Internet Archive Scholar Citation Graph Dataset}
 \date{August 10, 2021} % Here you can change the date presented in the paper title
@@ -97,7 +95,7 @@ used for derivation under an MIT license.
 
 % keywords can be removed
-\keywords{Citation Graph Dataset \and Scholarly Communications \and Web Archiving}
+\keywords{Citation Graph \and Scholarly Communications \and Web Archiving}
 
 \section{Introduction}
 
@@ -106,7 +104,7 @@ The Internet Archive releases a first version of a citation graph dataset
 derived from a raw corpus of about 2.5B references gathered from metadata and
 from data obtained by PDF extraction tools such as GROBID\citep{lopez2009grobid}.
 The goal of this report is to describe briefly the current contents and the
-derivation of the Internet Archive Scholar Citation Graph Dataset (IASCG). We expect
+derivation of the Archive Scholar Citations Dataset (ASC). We expect
 this dataset to be iterated upon, with changes both in content and processing.
 
 Modern citation indexes can be traced back to the early computing age, when
@@ -122,8 +120,12 @@ Initiative for Open Citations\citep{i4oc}\citep{shotton2018funders}. In 2021,
 according to \citep{hutchins2021tipping} over 1B citations are publicly
 available, marking a tipping point for open citations.
 
+
+
 \section{Citation Graph Contents}
 
+
+
 % * edges
 % * edges exact
 % * edges fuzzy
@@ -155,41 +157,96 @@ available, marking a tipping point for open citations.
 
 \section{System Design}
 
-TODO: describe limitations, single machine, prohibitive external data store
-lookups, and performance advantages of stream processing; “miniature
-map-reduce”, id based matching; fuzzy matching; funnel approach; data quality
-issues; live system design (es, pg, …)
-
-The constraints for the system design are informed by the volume and the
-variety of the data. In total, the raw inputs amount to about X TB uncompressed
-textual data. More importantly, while the number of data fields is low, over Y
-different combinations of fields are found in the raw reference data. Each
-combination of fields may require a slightly different processing path. For
-example, references with an arxiv identifier can be processed differently from
-references with only a title. We identify about X types of manifestations which
-in total amount for Y\% of the reference documents.
-
-Overall, a map-reduce style approach is followed, which e.g. allows for some
-uniformity in the overall processing. We extract key value tuples (as TSV) from
-the raw JSON data and sort by key. Finally we group pairs with the same key
-into groups and apply a function of the elements of the group in order to
-generate our target schema (biblioref, called bref, for short).
-
-The key derivation can be exact (e.g. an id like doi, pmid, etc) or based on a
-normalization procedure, like a slugified title string. For id based matches we
-can generate the bref schema directly. For fuzzy matching candidates, we pass
-possible match pairs through a verification procedure, which is implemented for
-documents of one specific catalog record schema.
+The constraints for the system design are informed by the volume and the
+variety of the data. In total, the raw inputs amount to a few TB of textual
+content, mostly newline delimited JSON. More importantly, while the number of
+data fields is low, certain schemas are very partial, with hundreds of
+different combinations of available field values found in the raw reference
+data. This is most likely caused by aggregators passing on reference data
+coming from hundreds of sources, not all of which agree on a common
+granularity for citation data, and by artifacts of machine learning based
+structured data extraction tools.
+
+Each combination of fields may require a slightly different processing path.
+For example, references with an arXiv identifier can be processed differently
+from references with only a title. Over 50\% of the raw reference data comes
+from a set of eight field manifestations, as listed in
+Table~\ref{table:fields}.
+
+\begin{table}
+  \begin{center}
+    \begin{tabular}{ll}
+\toprule
+      \bf{Fields} & \bf{Share} \\
+\midrule
+      \multicolumn{1}{l}{CN|CRN|P|T|U|V|Y} & 14\% \\
+      \multicolumn{1}{l}{DOI} & 14\% \\
+      \multicolumn{1}{l}{CN|CRN|IS|P|T|U|V|Y} & 5\% \\
+      \multicolumn{1}{l}{CN|CRN|DOI|U|V|Y} & 4\% \\
+      \multicolumn{1}{l}{PMID|U} & 4\% \\
+      \multicolumn{1}{l}{CN|CRN|DOI|T|V|Y} & 4\% \\
+      \multicolumn{1}{l}{CN|CRN|Y} & 4\% \\
+      \multicolumn{1}{l}{CN|CRN|DOI|V|Y} & 4\% \\
+\bottomrule
+    \end{tabular}
+    \vspace*{2mm}
+    \caption{Top 8 combinations of available fields in raw reference data,
+accounting for about 53\% of the total data (CN = container name, CRN =
+contrib raw name, P = pages, T = title, U = unstructured, V = volume, IS =
+issue, Y = year, DOI = doi, PMID = pmid). Unstructured fields may contain any
+value.}
+    \label{table:fields}
+  \end{center}
+\end{table}
+
+Overall, a map-reduce style approach is followed, which allows for some
+uniformity in the overall processing. We extract (key, document) tuples (as
+TSV) from the raw JSON data and sort by key. Then we group documents with the
+same key and apply a function to each group in order to generate our target
+schema (currently named biblioref, or bref for short) or to perform additional
+operations (such as deduplication).
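+
+To make this processing model concrete, the following sketch shows the
+map-sort-reduce flow over newline delimited JSON. The code is illustrative
+only; the function names, field names and key scheme are invented for this
+example and do not correspond to the actual pipeline code.
+
+\begin{verbatim}
+# Sketch: map documents to keys, sort, group, apply a reduce function.
+import json
+import sys
+from itertools import groupby
+
+def extract_key(doc):
+    # Prefer an exact identifier; fall back to a slugified title.
+    if doc.get("doi"):
+        return "doi:" + doc["doi"].lower()
+    title = doc.get("title", "")
+    return "slug:" + "".join(c for c in title.lower() if c.isalnum())
+
+docs = (json.loads(line) for line in sys.stdin)
+keyed = sorted(((extract_key(d), d) for d in docs), key=lambda kv: kv[0])
+for key, group in groupby(keyed, key=lambda kv: kv[0]):
+    cluster = [doc for _, doc in group]
+    # A reduce function would emit biblioref documents per cluster here.
+    print(key, len(cluster), sep="\t")
+\end{verbatim}
+
+Sorting by key first makes the grouping step a streaming operation: only one
+group at a time needs to be held in memory, which suits a single machine
+setup.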
+
+The key derivation can be exact (via an identifier such as DOI, PMID, etc.)
+or based on a normalization procedure, like a slugified title string. For
+identifier based matches we can generate the target biblioref schema directly.
+For fuzzy matching candidates, we pass possible match pairs through a
+verification procedure, which is implemented for release entity schema pairs.
+The current verification procedure is a domain dependent, rule based
+approach, able to identify different versions of a publication,
+preprint-published pairs, or other kinds of similar documents by calculating
+similarity metrics across titles and authors. The fuzzy matching approach is
+applied to all reference documents that have a title but no identifier.
 
 With a few schema conversions, fuzzy matching can be applied to Wikipedia
-articles and Open Library editions as well. The aspect of precision and recall
-are represented by the two stages: we are generous in the match candidate
-generation phase in order to improve recall, but we are strict during
-verification, in order to ensure precision.
+articles and Open Library (edition) records as well. The aspects of precision
+and recall are represented by the two stages: we are generous in the match
+candidate generation phase in order to improve recall, but we are strict
+during verification, in order to control precision.
 
 \section{Fuzzy Matching Approach}
 
-% Take sample of 100 docs, report some precision, recall, F1 on a hand curated small subset.
+% Take sample of 100 docs, report some precision, recall, F1 on a hand curated
+% small subset.
+
+The fuzzy matching approach currently implemented works in two phases: match
+candidate generation and verification. For candidate generation, we map each
+document to a key, under which match candidates are clustered. We implemented
+a number of algorithms to derive these keys, e.g. title normalization
+(including lowercasing, whitespace removal, Unicode normalization and other
+measures) or transformations like NYSIIS\citep{silbert1970world}.
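+
+As an illustration, a title based clustering key could be derived as in the
+following sketch, assuming the normalizations named above; the exact rules
+and the function name are invented for this example.
+
+\begin{verbatim}
+# Sketch: normalize a title into a candidate clustering key.
+import unicodedata
+
+def title_key(title):
+    # NFKD-normalize, strip accents, lowercase, keep alphanumerics only.
+    t = unicodedata.normalize("NFKD", title)
+    t = "".join(c for c in t if not unicodedata.combining(c))
+    return "".join(c for c in t.lower() if c.isalnum())
+
+assert title_key("A  Tale of Two Cities!") == title_key("a tale of two cities")
+\end{verbatim}
+
+All documents sharing a key become match candidates; this keeps candidate
+generation cheap, at the cost of missing pairs whose titles normalize to
+different keys.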
+
+The verification approach is based on a set of rules, which are tested
+sequentially, yielding a match signal ranging from weak to exact. We use a
+suite of over 300 manually curated match examples\footnote{The table can be found here:
+\href{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}{https://gitlab.com/internetarchive/fuzzycat/-/blob/master/tests/data/verify.csv}}
+as part of a unit test suite to allow for a controlled, continuous adjustment
+of the verification procedure. If the verification yields either an exact or
+a strong signal, we consider it a match.
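+
+A condensed sketch of such a sequentially tested rule set follows; the actual
+rule set is larger, and the thresholds and field names here are invented for
+illustration.
+
+\begin{verbatim}
+# Sketch: rules tested in order, yielding a match signal.
+from difflib import SequenceMatcher
+
+def verify(a, b):
+    # Returns one of: "exact", "strong", "weak", "none".
+    if a.get("doi") and a.get("doi") == b.get("doi"):
+        return "exact"
+    title_sim = SequenceMatcher(None, a.get("title", "").lower(),
+                                b.get("title", "").lower()).ratio()
+    authors_a = {name.lower() for name in a.get("authors", [])}
+    authors_b = {name.lower() for name in b.get("authors", [])}
+    union = authors_a | authors_b
+    jaccard = len(authors_a & authors_b) / len(union) if union else 0.0
+    if title_sim > 0.9 and jaccard > 0.5:
+        return "strong"
+    if title_sim > 0.9:
+        return "weak"
+    return "none"
+\end{verbatim}
+
+Keeping the rules in plain code, exercised by the curated example table,
+allows a single rule to be adjusted and the whole suite re-verified quickly.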
+
+We try to keep the individual processing steps performant, in order to limit
+the overall derivation time. Map and reduce operations are parallelized and
+certain processing steps can handle 100K documents per second or more on
+commodity hardware with spinning disks.
+
 
 \section{Discussion}
 
@@ -301,5 +358,72 @@ verification, in order to ensure precision.
 
 % \end{thebibliography}
 
+\section{Appendix}
+
+% Please add the following required packages to your document preamble:
+\begin{table}
+  \begin{center}
+\begin{tabular}{@{}rlll@{}}
+\toprule
+\textbf{Number of matches} & \textbf{Citation Provenance} & \textbf{Match Status} & \textbf{Match Reason} \\ \midrule
+934932865 & crossref & exact & doi \\
+151366108 & fatcat-datacite & exact & doi \\
+65345275 & fatcat-pubmed & exact & pmid \\
+48778607 & fuzzy & strong & jaccardauthors \\
+42465250 & grobid & exact & doi \\
+29197902 & fatcat-pubmed & exact & doi \\
+19996327 & fatcat-crossref & exact & doi \\
+11996694 & fuzzy & strong & slugtitleauthormatch \\
+9157498 & fuzzy & strong & tokenizedauthors \\
+3547594 & grobid & exact & arxiv \\
+2310025 & fuzzy & exact & titleauthormatch \\
+1496515 & grobid & exact & pmid \\
+680722 & crossref & strong & jaccardauthors \\
+476331 & fuzzy & strong & versioneddoi \\
+449271 & grobid & exact & isbn \\
+230645 & fatcat-crossref & strong & jaccardauthors \\
+190578 & grobid & strong & jaccardauthors \\
+156657 & crossref & exact & isbn \\
+123681 & fatcat-pubmed & strong & jaccardauthors \\
+79328 & crossref & exact & arxiv \\
+57414 & crossref & strong & tokenizedauthors \\
+53480 & fuzzy & strong & pmiddoipair \\
+52453 & fuzzy & strong & dataciterelatedid \\
+47119 & grobid & strong & slugtitleauthormatch \\
+36774 & fuzzy & strong & arxivversion \\
+35311 & fuzzy & strong & customieeearxiv \\
+33863 & grobid & exact & pmcid \\
+23504 & crossref & strong & slugtitleauthormatch \\
+22753 & fatcat-crossref & strong & tokenizedauthors \\
+17720 & grobid & exact & titleauthormatch \\
+14656 & crossref & exact & titleauthormatch \\
+14438 & grobid & strong & tokenizedauthors \\
+7682 & fatcat-crossref & exact & arxiv \\
+5972 & fatcat-crossref & exact & isbn \\
+5525 & fatcat-pubmed & exact & arxiv \\
+4290 & fatcat-pubmed & strong & tokenizedauthors \\
+2745 & fatcat-pubmed & exact & isbn \\
+2342 & fatcat-pubmed & strong & slugtitleauthormatch \\
+2273 & fatcat-crossref & strong & slugtitleauthormatch \\
+1960 & fuzzy & exact & workid \\
+1150 & fatcat-crossref & exact & titleauthormatch \\
+1041 & fatcat-pubmed & exact & titleauthormatch \\
+895 & fuzzy & strong & figshareversion \\
+317 & fuzzy & strong & titleartifact \\
+82 & grobid & strong & titleartifact \\
+33 & crossref & strong & titleartifact \\
+5 & fuzzy & strong & custombsiundated \\
+1 & fuzzy & strong & custombsisubdoc \\
+1 & fatcat & exact & doi \\ \bottomrule
+\end{tabular}
+  \vspace*{2mm}
+  \caption{Match counts by citation provenance, match status and match
+reason. Each match reason identifier encodes a specific rule in the domain
+dependent verification process; the identifiers are included for
+completeness, though we do not detail the individual rules in this report.}
+  \label{table:matches}
+\end{center}
+\end{table}
+
 \end{document}
diff --git a/docs/TR-20210730212057-IA-WDS-CG/references.bib b/docs/TR-20210730212057-IA-WDS-CG/references.bib
index cf61980..bcb8a16 100644
--- a/docs/TR-20210730212057-IA-WDS-CG/references.bib
+++ b/docs/TR-20210730212057-IA-WDS-CG/references.bib
@@ -111,4 +111,13 @@ note = {Accessed: 2021-07-30}
 year={2021}
 }
 
+@article{silbert1970world,
+  title={The World's First Computerized Criminal-Justice Information-Sharing
+    System-The New York State Identification and Intelligence System (NYSIIS)},
+  author={Silbert, Jeffrey M},
+  journal={Criminology},
+  volume={8},
+  pages={107},
+  year={1970},
+  publisher={HeinOnline}
+}
-- 
cgit v1.2.3