about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src')
-rw-r--r-- scalding/src/main/resources/slug-blacklist.txt 426
-rw-r--r-- scalding/src/main/scala/sandcrawler/BibjsonScorable.scala 50
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 98
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 30
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala 2
-rw-r--r-- scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala 29
-rw-r--r-- scalding/src/main/scala/sandcrawler/Scorable.scala 1
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 17
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScoreJob.scala 92
-rw-r--r-- scalding/src/main/scala/sandcrawler/StringUtilities.scala 2
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 57
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala 6
-rw-r--r-- scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala 37
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 27
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala 32
15 files changed, 803 insertions, 103 deletions
diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt
index 7dc701f..ad3dc1d 100644
--- a/scalding/src/main/resources/slug-blacklist.txt
+++ b/scalding/src/main/resources/slug-blacklist.txt
@@ -1,34 +1,458 @@
abbreviations
+abbreviationsandacronyms
+aboutauthors
+abouttheauthor
+abouttheauthors
+aboutthecover
+abouttheeditors
+abreviations
abstract
+abstractnotsubmittedforonlinepublication
+abstracts
+abstractsofaapaposterandpodiumpresentations
+abstractsofcommunications
+abstractsofthesesfromthescandinaviancountries
+abstractwithdrawn
+acknowledgement
acknowledgements
+acknowledgementsvii
+acknowledgementtoreferees
+acknowledgment
+acknowledgmentofreferees
+acknowledgments
+addendum
+additionalresources
+address
+advertisersindex
+affect
+affiliation
+agenda
+agradecimientos
+aimsandscope
+annexa
+annualacknowledgementofmanuscriptreviewers
+appendices
+appendix1
+appendixa
+appendixb
+appointmentsandstaffchanges
+approximation
+apresentacao
article
+articles
+articlesofsignificantinterestselectedfromthisissuebytheeditors
+associationnews
+ataglance
+atribute
+authorguidelines
+authorindex
+authorindexforvolume81
authorreply
+authors
authorsreply
+authorsresponse
+avantpropos
+award
+awardsappointmentsannouncements
+backcover
+background
+bibliografia
+bibliography
+bigdata
+blankpage
+blood
+boardoftrustees
+bookofabstracts
bookreview
bookreviews
+bookreviewsandnotices
+bookreviewssection
+booksreceived
+buchbesprechungen
+bulletin
+calendar
+calendarofevents
+calendarofmeetings
+callforarticles
+callforpapers
casereport
+casereports
+chairmansopeningremarks
+changes
+chaos
+chapter1
+chapter10
+chapter1introduction
+chapter7
+chapteri
+chapterone
+chapteroneintroduction
+chaptertwo
+citation
+classes
+classified
+classifieds
+collaborateurs
+comment
+commentaries
commentary
commentaryon
commenton
+comments
commentto
+committee
+communication
+communications
+communicationstotheeditor
+communiquedepresse
+community
+components
+comptesrendus
+computerscience
+conclusion
+conclusions
+conferencereport
+congratulations
+congresscalendar
+conservation
+content
contents
+context
+continuingeducation
+continuingmedicaleducation
+contributors
+copyright
+copyrightform
+copyrightnotice
+correction
+corrections
correspondence
+corrigenda
+corrigendum
+councilminutes
+cover
+coverimage
+currentresearch
+curriculumvitae
+danksagung
+dearreaders
+decisionmaking
dedication
+dedicatoria
+definition
+description
+discussion
+distribution
+documents
+ear
+editorial
editorialadvisoryboard
+editorialboard
+editorialcomment
+editorialcomments
+editorialconsultants
+editoriale
+editorialeditorial
+editorialinformation
+editorialintroduction
+editorialnote
+editorials
+editorialsoftwaresurveysection
+editorialstaff
+editorialstatement
+editorinchief
+editors
+editorschoice
+editorscomment
+editorscorner
+editorscorrespondence
+editorsintroduction
+editorsnote
+editorspicks
+editorspreface
+education
+einfuhrung
+einleitung
+electrophoresis
+employment
+endnotes
+entrevista
+entscheidungsverzeichnis
+epilogue
+equipment
+errata
+erratum
+essay
+essays
+executivesummary
+exercises
+extendedabstracts
+feature
+features
+fichatecnica
+figure3
+finalexam
+finalreport
focus
+foreward
+foreword
+forthcomingarticles
+forthcomingevents
+fortherecord
+forum
+frequentlyaskedquestions
+fromtheeditor
+fromtheeditors
+fromthepresident
+frontmatter
+furtherreadings
+genealogy
+generaldiscussion
+generalinformation
+generalintroduction
+germany
+glosario
+glossary
+glossaryofterms
+guesteditorial
+guideforauthors
+guidelinesforcontributors
+health
+heartfailure
+highlights
+highlightsfromthisissue
+history
+home
+homework
hypothesis
+iii
+importantnotice
+impressum
inbrief
+index
+indexofauthors
+indexofauthorsandtitles
+indice
+indicegeneral
+informationforauthors
+informationtoauthors
+inhalt
+inhaltsverzeichnis
+inmemoriam
+inreply
+insidethisissue
+institutenews
+instructionsforauthors
+instructionstoauthors
+interview
+inthestudy
+inthisissue
+introduccion
introduction
+introductiongenerale
introductiontotheissue
+introductorycomments
+inventions
+invitedcommentary
+issuesandevents
+jobdescription
+keywords
+languageteaching
+lecture
+letter
+letterfromtheeditor
+letters
+letterstotheeditor
lettertotheeditor
+lettertotheeditors
+linearalgebra
+linearregression
+links
+listedestableaux
listofabbreviations
+listofcontributors
+listoffigures
+listofparticipants
+listofpublications
+listofreferees
+listofreviewers
+listoftables
+literacy
+literature
+literaturecited
+literaturrundschau
+litteraturverzeichniss
+livresrecus
+lucina
+lungcancer
+magazin
+maintenance
+materials
+materialsafetydatasheet
+materialsandmethods
+medicinalchemistry
+meetingabstracts
+meetings
+meetingsandconferences
+meetingsofinterest
+membershipapplication
+memoranda
+memorandum
+messagefromgeneralcochairs
+messagefromthechairs
+messagefromtheprogramchairs
+messagefromtheprogramcochairs
+metaanalysis
+missionstatement
+motivation
+mrsnews
+name
+newbooks
+newlyelectedmembersofthecollege
+newproducts
+news
+newsandnotes
+newsandreviews
+newsandviews
+newsviews
+noii
note
+notes
+notesandcomments
+notesandnews
+notesforcontributors
+notesoncontributors
+notice
+noticeboard
+notitle
+notitleavailable
+obituaries
+obituary
+online
+openaccess
+oralabstracts
+oralpresentations
+organizingcommittee
+originalarticle
+originalarticles
+other
+outline
overview
+panorama
+papers
+paperstoappearinforthcomingissues
+partone
+personalandmiscellaneous
+perspective
+perspectives
+place
+positionsavailable
+poster
+posterpresentations
+postscript
preface
+preliminarymaterial
+presentacio
+presentacion
+presentation
+pressrelease
+print
+printing
+proceedings
+profile
+programcommittee
+projectmanagement
+publication
+publichealth
+publishersnote
+question
+radiology
+readersforum
+recensions
+recentpublications
+redaktorensforord
references
+referenciasbibliograficas
+regression
+rehabilitation
+rejoinder
+remerciements
+reply
+replybyauthors
+researchresearchers
+resenas
+resources
+response
+responsetothelettertotheeditor
results
+resume
+resumen
+resumes
+resumo
review
reviewarticle
+revieweracknowledgement
+revieweracknowledgement2013
+reviewers
+reviewessay
+reviews
+reviewsanddescriptionsoftablesandbooks
+rezension
+safety
+section
+security
+selectedbibliography
+shortcommunication
+shorternotices
+socialengineering
+sommaire
+sommario
+specialsection
+specifications
+subjectindex
+subscriptions
+suggestedreadings
+sumario
+summaries
+summariesofkeyjournalarticles
summary
+summaryofproceedings
+summer
+sun
+supplementarymaterial
+symposium
+symptom
+synthese
+tabledesmatieres
+tableofcontents
+tableofcontentsandprologue
+technicalreport
+theauthors
+thebasics
+theeditorsdesk
+thefirstauthorreplies
+thelancet
+theoreticalbackground
+thetimes
+theworldbank
+theyearinreview
+thismonthin
+timemanagement
+titeleiinhaltsverzeichnis
title
-name
+titlepage
+titlepagei
+tocorrespondents
+totheeditor
+unitedkingdom
+unitednations
+unitedstates
+upcomingevents
+vorwort
+website
+welcome
+whatshappening
+whatsnew
+workscited
+yourquestionsanswered
+zusammenfassung
diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
new file mode 100644
index 0000000..cdd598f
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala
@@ -0,0 +1,50 @@
+package sandcrawler
+
+import scala.math
+import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONObject
+
+import cascading.flow.FlowDef
+import cascading.tuple.Fields
+import com.twitter.scalding._
+import com.twitter.scalding.typed.TDsl._
+// XXX: import parallelai.spyglass.hbase.HBasePipeConversions
+
+// XXX: class BibjsonScorable extends Scorable with HBasePipeConversions {
+
+// Scorable whose input is a text file read line by line; each line is handed
+// to BibjsonScorable.bibjsonToMapFeatures.
+class BibjsonScorable extends Scorable {
+
+  // The input file is the one named by the "bibjson-input" argument.
+  def getSource(args : Args) : Source =
+    TextLine(args("bibjson-input"))
+
+  // Reads the "line" field of every input tuple and converts it to MapFeatures.
+  def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
+    val lines = getSource(args).read.toTypedPipe[String](new Fields("line"))
+    lines.map { line => BibjsonScorable.bibjsonToMapFeatures(line) }
+  }
+}
+
+object BibjsonScorable {
+  // Converts one bibjson line into MapFeatures. Unparseable JSON, a missing
+  // "title" key, or a null/empty title all yield Scorable.NoSlug paired with
+  // the raw input line.
+  def bibjsonToMapFeatures(json : String) : MapFeatures = {
+    Scorable.jsonToMap(json) match {
+      case Some(record) if record contains "title" =>
+        val title = Scorable.getString(record, "title")
+        val doi = Scorable.getString(record, "doi")
+        // NOTE(review): the key read here is "sha", not "sha1" -- confirm against the input schema.
+        val sha1 = Scorable.getString(record, "sha")
+        // TODO: year, authors (if available)
+        val usable = title != null && !title.isEmpty
+        if (usable) {
+          val features : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1)
+          new MapFeatures(features.toSlug, features.toString)
+        } else {
+          new MapFeatures(Scorable.NoSlug, json)
+        }
+      case _ => MapFeatures(Scorable.NoSlug, json)
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 5d1eaf5..039fa85 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -2,6 +2,7 @@ package sandcrawler
import scala.math
import scala.util.parsing.json.JSON
+import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject
import cascading.flow.FlowDef
@@ -19,29 +20,100 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = {
getSource(args).read
.toTypedPipe[String](new Fields("line"))
+ .filter { CrossrefScorable.keepRecord(_) }
.map { CrossrefScorable.jsonToMapFeatures(_) }
}
}
object CrossrefScorable {
+
+ val ContentTypeWhitelist: Set[String] = Set( // values of the record's "type" field that jsonToMapFeatures accepts; any other or missing type yields NoSlug
+ "book",
+ "book-chapter",
+ "dataset",
+ "dissertation",
+ "journal-article",
+ "letter",
+ "monograph",
+ "posted-content",
+ "pre-print",
+ "proceedings-article",
+ "report",
+ "working-paper")
+
+ // Pipeline filter: keep a record only when its JSON parses and mapToTitle
+ // yields a title no longer than Scorable.MaxTitleLength.
+ def keepRecord(json : String) : Boolean = {
+   val title : Option[String] = Scorable.jsonToMap(json).flatMap(record => mapToTitle(record))
+   title.exists(t => t.length <= Scorable.MaxTitleLength)
+ }
+
+ // Returns the first entry of the record's "title" list. Returns None when
+ // the key is absent, the value is null or not a list, the list is empty, or
+ // the first entry is null, not a string, empty, or longer than
+ // Scorable.MaxTitleLength.
+ def mapToTitle(map : Map[String, Any]) : Option[String] = {
+   map.get("title") match {
+     // Type patterns never match null, so both `"title": null` and
+     // `"title": [null]` fall through to None instead of throwing.
+     case Some(titles : List[_]) =>
+       titles.headOption.collect {
+         case title : String if !title.isEmpty && title.length <= Scorable.MaxTitleLength => title
+       }
+     case _ => None
+   }
+ }
+
+ // Collects the "family" name of every entry in the record's "author" list.
+ // A missing, null or non-list "author" value gives an empty list; entries
+ // that are not JSON objects, or whose "family" is absent, null or not a
+ // string, are skipped rather than raising ClassCastException or
+ // NullPointerException.
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+   map.get("author") match {
+     case Some(entries : List[_]) =>
+       // TODO(bnewbold): combine given and family names?
+       entries
+         .collect { case author : Map[_, _] => author.asInstanceOf[Map[String, Any]].get("family") }
+         .collect { case Some(family : String) => family }
+     case _ => List()
+   }
+ }
+
+ // Extracts the year from created."date-parts"[0][0], truncated to an Int.
+ // Returns None when "created" is absent or when any step along that path is
+ // missing, null, empty or of an unexpected type, instead of throwing.
+ def mapToYear(map : Map[String, Any]) : Option[Int] = {
+   for {
+     created <- map.get("created").collect { case m : Map[_, _] => m.asInstanceOf[Map[String, Any]] }
+     dateParts <- created.get("date-parts").collect { case parts : List[_] => parts }
+     firstDate <- dateParts.headOption.collect { case parts : List[_] => parts }
+     // The original code cast this value to Double, so only Double is accepted here.
+     year <- firstDate.headOption.collect { case y : Double => y.toInt }
+   } yield year
+ }
+
+ // Builds MapFeatures (slug plus serialized ScorableFeatures) for one Crossref
+ // record. Yields Scorable.NoSlug with the raw JSON when the JSON does not
+ // parse, mapToTitle finds no usable title, the DOI is null or empty, there
+ // are no authors, or the content type is not in ContentTypeWhitelist.
def jsonToMapFeatures(json : String) : MapFeatures = {
Scorable.jsonToMap(json) match {
case None => MapFeatures(Scorable.NoSlug, json)
- case Some(map) => {
- if ((map contains "title") && (map contains "DOI")) {
- val titles = map("title").asInstanceOf[List[String]]
- val doi = Scorable.getString(map, "DOI")
- if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) {
- new MapFeatures(Scorable.NoSlug, json)
- } else {
- // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
- val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
- new MapFeatures(sf.toSlug, sf.toString)
+ case Some(map) =>
+ mapToTitle(map) match {
+ case None => MapFeatures(Scorable.NoSlug, json)
+ case Some(title) => {
+ val doi = Scorable.getString(map, "DOI")
+ val authors: List[String] = mapToAuthorList(map)
+ val year: Int = mapToYear(map).getOrElse(0)
+ val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")
+ // The null check must come first: a null DOI is rejected here rather than dereferenced.
+ if (doi == null || doi.isEmpty || authors.isEmpty || !(ContentTypeWhitelist contains contentType)) {
+ MapFeatures(Scorable.NoSlug, json)
+ } else {
+ val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
+ MapFeatures(sf.toSlug, sf.toString)
+ }
}
- } else {
- new MapFeatures(Scorable.NoSlug, json)
}
- }
}
}
}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index e510f75..c55cb40 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -31,11 +31,37 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
}
// TODO: Should I combine next two stages for efficiency?
.collect { case (key, json, StatusOK) => (key, json) }
+ .filter { case (key, json) => GrobidScorable.keepRecord(json) }
.map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) }
}
}
object GrobidScorable {
+ // Pipeline filter: true only when the JSON parses, has a "title" key, and
+ // that title is non-null and at most Scorable.MaxTitleLength characters.
+ def keepRecord(json : String) : Boolean = {
+   Scorable.jsonToMap(json).exists { record =>
+     (record contains "title") && {
+       val title = Scorable.getString(record, "title")
+       title != null && title.length <= Scorable.MaxTitleLength
+     }
+   }
+ }
+
+ // Collects the "name" of every entry in the record's "authors" list.
+ // A missing, null or non-list "authors" value gives an empty list; entries
+ // that are not JSON objects, or whose "name" is absent, null or not a
+ // string, are skipped rather than raising ClassCastException or
+ // NullPointerException.
+ def mapToAuthorList(map : Map[String, Any]) : List[String] = {
+   map.get("authors") match {
+     case Some(entries : List[_]) =>
+       entries
+         .collect { case author : Map[_, _] => author.asInstanceOf[Map[String, Any]].get("name") }
+         .collect { case Some(name : String) => name }
+     case _ => List()
+   }
+ }
+
def getHBaseSource(table : String, host : String) : HBaseSource = {
HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL)
}
@@ -45,7 +71,9 @@ object GrobidScorable {
case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures
+ val authors: List[String] = mapToAuthorList(map)
+ val title = Scorable.getString(map, "title")
+ ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures
} else {
MapFeatures(Scorable.NoSlug, json)
}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
index 468b68e..f4e84fe 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala
@@ -5,8 +5,6 @@ import cascading.flow.FlowDef
import cascading.pipe.Pipe
import cascading.tuple.Fields
import com.twitter.scalding._
-import com.twitter.scalding._
-import com.twitter.scalding.typed.TDsl._
import com.twitter.scalding.typed.TDsl._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
new file mode 100644
index 0000000..1578258
--- /dev/null
+++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala
@@ -0,0 +1,29 @@
+package sandcrawler
+
+import cascading.pipe.Pipe
+import com.twitter.scalding.Args
+import com.twitter.scalding.TypedPipe
+import com.twitter.scalding.TypedTsv
+import parallelai.spyglass.base.JobBase
+
+// Joins two bibjson inputs (arguments "left-bibjson" and "right-bibjson") on
+// slug and writes (slug, score, json1, json2) rows as TSV to "output".
+class MatchBenchmarkJob(args: Args) extends JobBase(args) {
+  // TODO: Instantiate any subclass of Scorable specified in args.
+  val sc1 : Scorable = new BibjsonScorable()
+  val sc2 : Scorable = new BibjsonScorable()
+  // BibjsonScorable reads its path from "bibjson-input", so each side gets
+  // its own copy of args with that key pointed at the matching file.
+  val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson")))
+  val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson")))
+  val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs)
+  val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs)
+
+  pipe1.join(pipe2)
+    .map { case (slug, (left, right)) =>
+      new ReduceOutput(slug, Scorable.computeSimilarity(left, right), left.json, right.json)
+    }
+    //TypedTsv doesn't work over case classes.
+    .map { scored => (scored.slug, scored.score, scored.json1, scored.json2) }
+    .write(TypedTsv[(String, Int, String, String)](args("output")))
+}
diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala
index 9b9c633..5aac032 100644
--- a/scalding/src/main/scala/sandcrawler/Scorable.scala
+++ b/scalding/src/main/scala/sandcrawler/Scorable.scala
@@ -30,6 +30,7 @@ abstract class Scorable {
}
object Scorable {
+ val MaxTitleLength = 1023
val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable
def isValidSlug(slug : String) : Boolean = {
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 0b9868a..241db79 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -3,6 +3,7 @@ package sandcrawler
import java.io.InputStream
import scala.io.Source
+import scala.util.parsing.json.JSONArray
import scala.util.parsing.json.JSONObject
object ScorableFeatures {
@@ -10,11 +11,13 @@ object ScorableFeatures {
val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
fileStream.close
+ val MinSlugLength = 8
// Static factory method
- def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+ def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
new ScorableFeatures(
title=if (title == null) "" else title,
+ authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a),
year=year,
doi=if (doi == null) "" else doi,
sha1=if (sha1 == null) "" else sha1)
@@ -23,13 +26,14 @@ object ScorableFeatures {
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures). Create with above static factory method.
-class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") {
def toMap() : Map[String, Any] =
- Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
+ Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1)
- override def toString() : String =
+ override def toString() : String = {
JSONObject(toMap).toString
+ }
def toSlug() : String = {
if (title == null) {
@@ -38,7 +42,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
+ if (slug.isEmpty
+ || slug == null
+ || (ScorableFeatures.SlugBlacklist contains slug)
+ || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug
}
}
diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
index 28e9132..107f504 100644
--- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala
@@ -2,59 +2,63 @@ package sandcrawler
import cascading.pipe.Pipe
import com.twitter.scalding.Args
+import com.twitter.scalding.Stat
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.TypedTsv
import parallelai.spyglass.base.JobBase
class ScoreJob(args: Args) extends JobBase(args) {
- // TODO: Instantiate any subclass of Scorable specified in args.
- val sc1 : Scorable = new GrobidScorable()
- val sc2 : Scorable = new CrossrefScorable()
- val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args)
- val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args)
- pipe1
- .addTrap(TypedTsv(args("output") + ".trapped"))
- .join(pipe2)
- .map { entry =>
- val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry
- new ReduceOutput(
- slug,
- Scorable.computeSimilarity(features1, features2),
- features1.json,
- features2.json)
- }
- //TypedTsv doesn't work over case classes.
- .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) }
- .write(TypedTsv[(String, Int, String, String)](args("output")))
-}
-
-/*
-// Ugly hack to get non-String information into ScoreJob above.
-object ScoreJob {
- var scorable1 : Option[Scorable] = None
- var scorable2 : Option[Scorable] = None
+ val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler") // incremented once per row coming out of the Grobid input pipe
+ val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler") // incremented once per row coming out of the Crossref input pipe
+ val joinedRowCount = Stat("joined-rows", "sandcrawler") // incremented once per joined (Grobid, Crossref) pair that gets scored
+ /* TODO:
+ val uniqueDoiCount = Stat("unique-doi-count", "sandcrawler")
+ val uniqueSha1Count = Stat("unique-sha1-count", "sandcrawler")
+ */
- def setScorable1(s : Scorable) {
- scorable1 = Some(s)
- }
-
- def getScorable1() : Scorable = {
- scorable1 match {
- case Some(s) => s
- case None => null
+ val grobidScorable : Scorable = new GrobidScorable()
+ val crossrefScorable : Scorable = new CrossrefScorable()
+ val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable
+ .getInputPipe(args)
+ .map { r =>
+ grobidRowCount.inc
+ r
+ }
+ val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable
+ .getInputPipe(args)
+ .map { r =>
+ crossrefRowCount.inc
+ r
}
- }
- def setScorable2(s: Scorable) {
- scorable2 = Some(s)
- }
+ val joinedPipe = grobidPipe
+ .addTrap(TypedTsv(args("output") + ".trapped")) // trap sink path is the output path with ".trapped" appended
+ .join(crossrefPipe)
+
+ /* TODO:
+ // Reduces to count unique SHA1 and DOI
+ joinedPipe
+ .map { case (_, (grobidFeatures, _)) => grobidFeatures.sha }
+ .distinct
+ .map { _ => uniqueSha1Count.inc }
+ joinedPipe
+ .map { case (_, (_, crossrefFeatures)) => crossrefFeatures.doi }
+ .distinct
+ .map { _ => uniqueDoiCount.inc }
+ */
- def getScorable2() : Scorable = {
- scorable2 match {
- case Some(s) => s
- case None => null
+ // TypedTsv doesn't work over case classes.
+ joinedPipe
+ .map { case (slug, (grobidFeatures, crossrefFeatures)) =>
+ joinedRowCount.inc
+ //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry
+ new ReduceOutput(
+ slug,
+ Scorable.computeSimilarity(grobidFeatures, crossrefFeatures),
+ grobidFeatures.json,
+ crossrefFeatures.json)
}
- }
+ .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } // flatten ReduceOutput into a plain tuple for TypedTsv
+ .write(TypedTsv[(String, Int, String, String)](args("output")))
}
- */
diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
index 2745875..e03b60d 100644
--- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala
+++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala
@@ -36,7 +36,7 @@ object StringUtilities {
// Source: https://stackoverflow.com/a/30076541/631051
def removePunctuation(s: String) : String = {
- s.replaceAll("""[\p{Punct}]""", "")
+ s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "") // besides \p{Punct}, also strips curly quotes, middle dot, guillemets and CJK corner brackets
}
// Adapted from: https://stackoverflow.com/a/16018452/631051
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index 1789d1a..f598cae 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -64,12 +64,18 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
"issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
"subject" : [ "Pediatrics, Perinatology, and Child Health" ]
}
-"""
+""".replace("<<DOI>>", "10.123/aBc")
// scalastyle:on
- val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithGoodTitle = CrossrefString.replace("<<TITLE>>", "Some Title")
+ val CrossrefStringWithMaximumTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val CrossrefStringWithExcessiveTitle = CrossrefString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val CrossrefStringWithNullTitle = CrossrefString.replace("\"<<TITLE>>\"", "null")
val CrossrefStringWithEmptyTitle = CrossrefString.replace("<<TITLE>>", "")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
+ val CrossrefStringWithNoAuthors = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("author", "no-author")
+ val CrossrefStringWrongType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("journal-article", "other")
+ val CrossrefStringNoType = CrossrefString.replace("<<TITLE>>", "Some Valid Title").replace("type", "not-type")
// Unit tests
"CrossrefScorable.jsonToMapFeatures()" should "handle invalid JSON" in {
@@ -82,19 +88,64 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle null title" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNullTitle)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle empty title" in {
val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithEmptyTitle)
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle missing authors" in {
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle valid input" in {
- val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithTitle)
+ val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithGoodTitle)
result.slug shouldBe "sometitle"
Scorable.jsonToMap(result.json) match {
case None => fail()
case Some(map) => {
map("title").asInstanceOf[String] shouldBe "Some Title"
+ map("doi").asInstanceOf[String] shouldBe "10.123/abc"
+ // TODO: full name? not just a string?
+ map("authors").asInstanceOf[List[String]] shouldBe List("Gaier")
+ map("year").asInstanceOf[Double].toInt shouldBe 2002
}
}
}
+
+ "CrossrefScorable.keepRecord()" should "return true for valid JSON with title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ CrossrefScorable.keepRecord(CrossrefStringWithoutTitle) shouldBe false
+ }
+
+ // Uses the malformed fixture (all closing braces stripped); the title-less
+ // fixture is already covered by the "no title" case.
+ it should "return false for invalid JSON" in {
+   CrossrefScorable.keepRecord(MalformedCrossrefString) shouldBe false
+ }
+
+ it should "handle content types" in {
+ val resultWrong = CrossrefScorable.jsonToMapFeatures(CrossrefStringWrongType)
+ resultWrong.slug shouldBe Scorable.NoSlug
+ val resultMissing = CrossrefScorable.jsonToMapFeatures(CrossrefStringNoType)
+ resultMissing.slug shouldBe Scorable.NoSlug
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
index 12e13dc..bf9343b 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableDumpJobTest.scala
@@ -78,14 +78,14 @@ class GrobidScorableDumpJobTest extends FlatSpec with Matchers {
"sha1:024937534094897039547e9824382943") // bad status
val JsonStrings : List[String] = List(
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
JsonString.replace("<<TITLE>>", "Title 2: TNG"),
JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Classic"),
MalformedJsonString,
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 2")
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG")
)
// bnewbold: status codes aren't strings, they are uint64
diff --git a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
index 661824b..119cf90 100644
--- a/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/GrobidScorableTest.scala
@@ -57,7 +57,10 @@ class GrobidScorableTest extends FlatSpec with Matchers {
"annex": null
}
"""
- val GrobidStringWithTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+ val GrobidStringWithGoodTitle = GrobidString.replace("<<TITLE>>", "Dummy Example File")
+ val GrobidStringWithMaximumTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength)
+ val GrobidStringWithExcessiveTitle = GrobidString.replace("<<TITLE>>", "T" * Scorable.MaxTitleLength + "0")
+ val GrobidStringWithNullTitle = GrobidString.replace("\"<<TITLE>>\"", "null")
val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
val MalformedGrobidString = GrobidString.replace("}", "")
val Key = "Dummy Key"
@@ -69,20 +72,50 @@ class GrobidScorableTest extends FlatSpec with Matchers {
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle null title" in {
+ val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithNullTitle)
+ result.slug shouldBe Scorable.NoSlug
+ }
+
it should "handle missing title" in {
val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithoutTitle)
result.slug shouldBe Scorable.NoSlug
}
it should "handle valid input" in {
- val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithTitle)
+ val result = GrobidScorable.jsonToMapFeatures(Key, GrobidStringWithGoodTitle)
result.slug shouldBe "dummyexamplefile"
Scorable.jsonToMap(result.json) match {
case None => fail()
case Some(map) => {
map should contain key "title"
map("title").asInstanceOf[String] shouldBe "Dummy Example File"
+ map("authors").asInstanceOf[List[String]] shouldBe List("Brewster Kahle", "J Doe")
}
}
}
+
+ "GrobidScorable.keepRecord()" should "return true for valid JSON with title" in {
+ GrobidScorable.keepRecord(GrobidStringWithGoodTitle) shouldBe true
+ }
+
+ it should "return true for valid JSON with a title of maximum permitted length" in {
+ GrobidScorable.keepRecord(GrobidStringWithMaximumTitle) shouldBe true
+ }
+
+ it should "return false for valid JSON with excessively long title" in {
+ GrobidScorable.keepRecord(GrobidStringWithExcessiveTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with null title" in {
+ GrobidScorable.keepRecord(GrobidStringWithNullTitle) shouldBe false
+ }
+
+ it should "return false for valid JSON with no title" in {
+ GrobidScorable.keepRecord(GrobidStringWithoutTitle) shouldBe false
+ }
+
+ it should "return false for invalid JSON" in {
+ GrobidScorable.keepRecord(MalformedGrobidString) shouldBe false // unparseable JSON, not merely a title-less record
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 5a22ef8..474f69a 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -9,22 +9,6 @@ import org.scalatest._
// scalastyle:off null
class ScorableFeaturesTest extends FlatSpec with Matchers {
- // TODO: Remove this when we're convinced that our file-reading code
- // works. (I'm already convinced. --Ellen)
- "read slugs" should "work" in {
- val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
- "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
- "casereport", "commentary", "commentaryon", "commenton", "commentto",
- "contents", "correspondence", "dedication", "editorialadvisoryboard",
- "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
- "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
- "references", "results", "review", "reviewarticle", "summary", "title",
- "name")
-
- ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size
- for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s)
- }
-
private def titleToSlug(s : String) : String = {
ScorableFeatures.create(title = s).toSlug
}
@@ -52,7 +36,7 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
it should "strip punctuation" in {
titleToSlug("HELLO!:the:re") shouldBe "hellothere"
- titleToSlug("a:b:c") shouldBe "abc"
+ titleToSlug("a:b:cdefgh") shouldBe "abcdefgh"
titleToSlug(
"If you're happy and you know it, clap your hands!") shouldBe "ifyourehappyandyouknowitclapyourhands"
titleToSlug(":;\"\'") shouldBe Scorable.NoSlug
@@ -65,14 +49,19 @@ class ScorableFeaturesTest extends FlatSpec with Matchers {
}
it should "strip special characters" in {
- titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_") shouldBe Scorable.NoSlug
+ titleToSlug(":;!',|\"\'`.#?!-@*/\\=+~%$^{}()[]<>-_’·“”‘’“”«»「」") shouldBe Scorable.NoSlug
// TODO: titleToSlug("©™₨№…") shouldBe Scorable.NoSlug
// TODO: titleToSlug("πµΣσ") shouldBe Scorable.NoSlug
}
it should "remove whitespace" in {
titleToSlug("foo bar : baz ::") shouldBe "foobarbaz"
- titleToSlug("\na\t:b:c") shouldBe "abc"
+ titleToSlug("\na\t:b:cdefghi") shouldBe "abcdefghi"
titleToSlug("\n \t \r ") shouldBe Scorable.NoSlug
}
+
+ it should "skip very short slugs" in {
+ titleToSlug("short") shouldBe Scorable.NoSlug
+ titleToSlug("a longer, more in depth title") shouldBe "alongermoreindepthtitle"
+ }
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 35c31e5..32fb16c 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -117,6 +117,8 @@ class ScoreJobTest extends FlatSpec with Matchers {
}
"""
// scalastyle:on
+ val TooLongOfTitle = "X" * Scorable.MaxTitleLength + "Y" // one character longer than Scorable.MaxTitleLength
+ val TooShortOfTitle = "X" * (ScorableFeatures.MinSlugLength - 1) // one character shorter than ScorableFeatures.MinSlugLength
val CrossrefStringWithTitle = CrossrefString.replace("<<TITLE>>", "SomeTitle")
val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
val MalformedCrossrefString = CrossrefString.replace("}", "")
@@ -124,7 +126,9 @@ class ScoreJobTest extends FlatSpec with Matchers {
CrossrefString.replace("<<TITLE>>", "Title 2: TNG").replace("<<DOI>>", "DOI-0"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 2A").replace("<<DOI>>", "DOI-0.5"),
CrossrefString.replace("<<TITLE>>", "Title 1: TNG 3").replace("<<DOI>>", "DOI-0.75"),
- CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"))
+ CrossrefString.replace("<<TITLE>>", "Title 2: Rebooted").replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooLongOfTitle).replace("<<DOI>>", "DOI-1"),
+ CrossrefString.replace("<<TITLE>>", TooShortOfTitle).replace("<<DOI>>", "DOI-1"))
// Pipeline tests
val output = "/tmp/testOutput"
@@ -137,23 +141,28 @@ class ScoreJobTest extends FlatSpec with Matchers {
"sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT",
"sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56",
"sha1:93187A85273589347598473894839443",
- "sha1:024937534094897039547e9824382943")
+ "sha1:024937534094897039547e9824382943",
+ "sha1:93229759932857982837892347893892",
+ "sha1:83229759932857982837892347893892")
val JsonStrings : List[String] = List(
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
JsonString.replace("<<TITLE>>", "Title 2: TNG"),
JsonString.replace("<<TITLE>>", "Title 3: The Sequel"),
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 1"),
+ JsonString.replace("<<TITLE>>", "Title 1: The Original"),
MalformedJsonString,
// This will have bad status.
- JsonString.replace("<<TITLE>>", "Title 2")
+ JsonString.replace("<<TITLE>>", "Title 2: Not TNG"),
+ // These are in both sources but have bad titles
+ JsonString.replace("<<TITLE>>", TooLongOfTitle),
+ JsonString.replace("<<TITLE>>", TooShortOfTitle)
)
// bnewbold: status codes aren't strings, they are uint64
val Ok : Long = 200
val Bad : Long = 400
- val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad)
+ val StatusCodes = List(Ok, Ok, Ok, Bad, Ok, Bad, Ok, Ok)
val SampleDataHead : List[Tuple] = (Sha1Strings, JsonStrings, StatusCodes)
.zipped
@@ -181,19 +190,24 @@ class ScoreJobTest extends FlatSpec with Matchers {
0 -> CrossrefStrings(0),
1 -> CrossrefStrings(1),
2 -> CrossrefStrings(2),
- 3 -> CrossrefStrings(3)))
- .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) {
- _ => () }
+ 3 -> CrossrefStrings(3),
+ 4 -> CrossrefStrings(4),
+ 5 -> CrossrefStrings(5)))
+ .sink[(String, ReduceFeatures)](TypedTsv[(String, ReduceFeatures)](output + ".trapped")) { _ => () }
.sink[(String, Int, String, String)](TypedTsv[(String, Int, String, String)](output)) {
// Grobid titles and slugs (in parentheses):
// Title 1 (title1)
// Title 2: TNG (title2tng)
// Title 3: The Sequel (title3thesequel)
+ // <too long of a title>
+ // <too short of a title>
// crossref titles and slugs (in parentheses):
// Title 2: TNG (title2tng)
// Title 1: TNG 2A (title1tng2a)
// Title 1: TNG 3 (title1tng3)
// Title 2: Rebooted (title2rebooted)
+ // <too long of a title>
+ // <too short of a title>
// XXX: Join should have 3 "title1" slugs and 1 "title2tng" slug
outputBuffer =>
"The pipeline" should "return a 1-element list" in {