diff options
Diffstat (limited to 'scalding/src/main')
10 files changed, 680 insertions, 67 deletions
diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt index 7dc701f..ad3dc1d 100644 --- a/scalding/src/main/resources/slug-blacklist.txt +++ b/scalding/src/main/resources/slug-blacklist.txt @@ -1,34 +1,458 @@ abbreviations +abbreviationsandacronyms +aboutauthors +abouttheauthor +abouttheauthors +aboutthecover +abouttheeditors +abreviations abstract +abstractnotsubmittedforonlinepublication +abstracts +abstractsofaapaposterandpodiumpresentations +abstractsofcommunications +abstractsofthesesfromthescandinaviancountries +abstractwithdrawn +acknowledgement acknowledgements +acknowledgementsvii +acknowledgementtoreferees +acknowledgment +acknowledgmentofreferees +acknowledgments +addendum +additionalresources +address +advertisersindex +affect +affiliation +agenda +agradecimientos +aimsandscope +annexa +annualacknowledgementofmanuscriptreviewers +appendices +appendix1 +appendixa +appendixb +appointmentsandstaffchanges +approximation +apresentacao article +articles +articlesofsignificantinterestselectedfromthisissuebytheeditors +associationnews +ataglance +atribute +authorguidelines +authorindex +authorindexforvolume81 authorreply +authors authorsreply +authorsresponse +avantpropos +award +awardsappointmentsannouncements +backcover +background +bibliografia +bibliography +bigdata +blankpage +blood +boardoftrustees +bookofabstracts bookreview bookreviews +bookreviewsandnotices +bookreviewssection +booksreceived +buchbesprechungen +bulletin +calendar +calendarofevents +calendarofmeetings +callforarticles +callforpapers casereport +casereports +chairmansopeningremarks +changes +chaos +chapter1 +chapter10 +chapter1introduction +chapter7 +chapteri +chapterone +chapteroneintroduction +chaptertwo +citation +classes +classified +classifieds +collaborateurs +comment +commentaries commentary commentaryon commenton +comments commentto +committee +communication +communications +communicationstotheeditor +communiquedepresse +community +components +comptesrendus +computerscience +conclusion +conclusions +conferencereport +congratulations +congresscalendar +conservation +content contents +context +continuingeducation +continuingmedicaleducation +contributors +copyright +copyrightform +copyrightnotice +correction +corrections correspondence +corrigenda +corrigendum +councilminutes +cover +coverimage +currentresearch +curriculumvitae +danksagung +dearreaders +decisionmaking dedication +dedicatoria +definition +description +discussion +distribution +documents +ear +editorial editorialadvisoryboard +editorialboard +editorialcomment +editorialcomments +editorialconsultants +editoriale +editorialeditorial +editorialinformation +editorialintroduction +editorialnote +editorials +editorialsoftwaresurveysection +editorialstaff +editorialstatement +editorinchief +editors +editorschoice +editorscomment +editorscorner +editorscorrespondence +editorsintroduction +editorsnote +editorspicks +editorspreface +education +einfuhrung +einleitung +electrophoresis +employment +endnotes +entrevista +entscheidungsverzeichnis +epilogue +equipment +errata +erratum +essay +essays +executivesummary +exercises +extendedabstracts +feature +features +fichatecnica +figure3 +finalexam +finalreport focus +foreward +foreword +forthcomingarticles +forthcomingevents +fortherecord +forum +frequentlyaskedquestions +fromtheeditor +fromtheeditors +fromthepresident +frontmatter +furtherreadings +genealogy +generaldiscussion +generalinformation +generalintroduction +germany +glosario +glossary +glossaryofterms +guesteditorial +guideforauthors +guidelinesforcontributors +health +heartfailure +highlights +highlightsfromthisissue +history +home +homework hypothesis +iii +importantnotice +impressum inbrief +index +indexofauthors +indexofauthorsandtitles +indice +indicegeneral +informationforauthors +informationtoauthors +inhalt +inhaltsverzeichnis +inmemoriam +inreply +insidethisissue +institutenews +instructionsforauthors +instructionstoauthors +interview +inthestudy +inthisissue +introduccion introduction +introductiongenerale introductiontotheissue +introductorycomments +inventions +invitedcommentary +issuesandevents +jobdescription +keywords +languageteaching +lecture +letter +letterfromtheeditor +letters +letterstotheeditor lettertotheeditor +lettertotheeditors +linearalgebra +linearregression +links +listedestableaux listofabbreviations +listofcontributors +listoffigures +listofparticipants +listofpublications +listofreferees +listofreviewers +listoftables +literacy +literature +literaturecited +literaturrundschau +litteraturverzeichniss +livresrecus +lucina +lungcancer +magazin +maintenance +materials +materialsafetydatasheet +materialsandmethods +medicinalchemistry +meetingabstracts +meetings +meetingsandconferences +meetingsofinterest +membershipapplication +memoranda +memorandum +messagefromgeneralcochairs +messagefromthechairs +messagefromtheprogramchairs +messagefromtheprogramcochairs +metaanalysis +missionstatement +motivation +mrsnews +name +newbooks +newlyelectedmembersofthecollege +newproducts +news +newsandnotes +newsandreviews +newsandviews +newsviews +noii note +notes +notesandcomments +notesandnews +notesforcontributors +notesoncontributors +notice +noticeboard +notitle +notitleavailable +obituaries +obituary +online +openaccess +oralabstracts +oralpresentations +organizingcommittee +originalarticle +originalarticles +other +outline overview +panorama +papers +paperstoappearinforthcomingissues +partone +personalandmiscellaneous +perspective +perspectives +place +positionsavailable +poster +posterpresentations +postscript preface +preliminarymaterial +presentacio +presentacion +presentation +pressrelease +print +printing +proceedings +profile +programcommittee +projectmanagement +publication +publichealth +publishersnote +question +radiology +readersforum +recensions +recentpublications +redaktorensforord references +referenciasbibliograficas +regression +rehabilitation +rejoinder +remerciements +reply +replybyauthors +researchresearchers +resenas +resources +response +responsetothelettertotheeditor results +resume +resumen +resumes +resumo review reviewarticle +revieweracknowledgement +revieweracknowledgement2013 +reviewers +reviewessay +reviews +reviewsanddescriptionsoftablesandbooks +rezension +safety +section +security +selectedbibliography +shortcommunication +shorternotices +socialengineering +sommaire +sommario +specialsection +specifications +subjectindex +subscriptions +suggestedreadings +sumario +summaries +summariesofkeyjournalarticles summary +summaryofproceedings +summer +sun +supplementarymaterial +symposium +symptom +synthese +tabledesmatieres +tableofcontents +tableofcontentsandprologue +technicalreport +theauthors +thebasics +theeditorsdesk +thefirstauthorreplies +thelancet +theoreticalbackground +thetimes +theworldbank +theyearinreview +thismonthin +timemanagement +titeleiinhaltsverzeichnis title -name +titlepage +titlepagei +tocorrespondents +totheeditor +unitedkingdom +unitednations +unitedstates +upcomingevents +vorwort +website +welcome +whatshappening +whatsnew +workscited +yourquestionsanswered +zusammenfassung diff --git a/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala new file mode 100644 index 0000000..cdd598f --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/BibjsonScorable.scala @@ -0,0 +1,50 @@ +package sandcrawler + +import scala.math +import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONObject + +import cascading.flow.FlowDef +import cascading.tuple.Fields +import com.twitter.scalding._ +import com.twitter.scalding.typed.TDsl._ +// XXX: import parallelai.spyglass.hbase.HBasePipeConversions + +// XXX: class BibjsonScorable extends Scorable with HBasePipeConversions { + +class BibjsonScorable extends Scorable { + + def getSource(args : Args) : Source = { + TextLine(args("bibjson-input")) + } + + def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { + getSource(args).read + .toTypedPipe[String](new Fields("line")) + .map { BibjsonScorable.bibjsonToMapFeatures(_) } + } +} + +object BibjsonScorable { + def bibjsonToMapFeatures(json : String) : MapFeatures = { + Scorable.jsonToMap(json) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(map) => { + if (map contains "title") { + val title = Scorable.getString(map, "title") + val doi = Scorable.getString(map, "doi") + val sha1 = Scorable.getString(map, "sha") + // TODO: year, authors (if available) + if (title == null || title.isEmpty) { + new MapFeatures(Scorable.NoSlug, json) + } else { + val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi, sha1=sha1) + new MapFeatures(sf.toSlug, sf.toString) + } + } else { + new MapFeatures(Scorable.NoSlug, json) + } + } + } + } +} diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 5d1eaf5..039fa85 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -2,6 +2,7 @@ package sandcrawler import scala.math import scala.util.parsing.json.JSON +import scala.util.parsing.json.JSONArray import scala.util.parsing.json.JSONObject import cascading.flow.FlowDef @@ -19,29 +20,100 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) + .filter { CrossrefScorable.keepRecord(_) } .map { CrossrefScorable.jsonToMapFeatures(_) } } } object CrossrefScorable { + + val ContentTypeWhitelist: Set[String] = Set( + "book", + "book-chapter", + "dataset", + "dissertation", + "journal-article", + "letter", + "monograph", + "posted-content", + "pre-print", + "proceedings-article", + "report", + "working-paper") + + def keepRecord(json : String) : Boolean = { + Scorable.jsonToMap(json) match { + case None => false + case Some(map) => { + mapToTitle(map) match { + case None => false + case Some(title) => title.length <= Scorable.MaxTitleLength + } + } + } + } + + // Returns None if title is null, empty, or too long. + def mapToTitle(map : Map[String, Any]) : Option[String] = { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty || titles == null) { + None + } else { + val title = titles(0) + if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) + } + } else { + None + } + } + + def mapToAuthorList(map : Map[String, Any]) : List[String] = { + if (map contains "author") { + val objArray = map("author").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) + // TODO(bnewbold): combine given and family names? + objArray + .filter(e => e contains "family") + .map(e => e.get("family").get.asInstanceOf[String]) + } else { + List() + } + } + + def mapToYear(map : Map[String, Any]) : Option[Int] = { + map.get("created") match { + case None => None + case Some(created) => { + Some(created.asInstanceOf[Map[String,Any]] + .get("date-parts") + .get + .asInstanceOf[List[Any]](0) + .asInstanceOf[List[Any]](0) + .asInstanceOf[Double] + .toInt) + } + } + } + def jsonToMapFeatures(json : String) : MapFeatures = { Scorable.jsonToMap(json) match { case None => MapFeatures(Scorable.NoSlug, json) - case Some(map) => { - if ((map contains "title") && (map contains "DOI")) { - val titles = map("title").asInstanceOf[List[String]] - val doi = Scorable.getString(map, "DOI") - if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { - new MapFeatures(Scorable.NoSlug, json) - } else { - // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) - new MapFeatures(sf.toSlug, sf.toString) + case Some(map) => + mapToTitle(map) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(title) => { + val doi = Scorable.getString(map, "DOI") + val authors: List[String] = mapToAuthorList(map) + val year: Int = mapToYear(map).getOrElse(0) + val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE") + if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) { + MapFeatures(Scorable.NoSlug, json) + } else { + val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year) + MapFeatures(sf.toSlug, sf.toString) + } } - } else { - new MapFeatures(Scorable.NoSlug, json) } - } } } } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index e510f75..c55cb40 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -31,11 +31,37 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } // TODO: Should I combine next two stages for efficiency? .collect { case (key, json, StatusOK) => (key, json) } + .filter { case (key, json) => GrobidScorable.keepRecord(json) } .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } } } object GrobidScorable { + def keepRecord(json : String) : Boolean = { + Scorable.jsonToMap(json) match { + case None => false + case Some(map) => { + if (map contains "title") { + val title = Scorable.getString(map, "title") + title != null && title.length <= Scorable.MaxTitleLength + } else { + false + } + } + } + } + + def mapToAuthorList(map : Map[String, Any]) : List[String] = { + if (map contains "authors") { + val objArray = map("authors").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) + objArray + .filter(e => e contains "name") + .map(e => e.get("name").get.asInstanceOf[String]) + } else { + List() + } + } + def getHBaseSource(table : String, host : String) : HBaseSource = { HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL) } @@ -45,7 +71,9 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures + val authors: List[String] = mapToAuthorList(map) + val title = Scorable.getString(map, "title") + ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala index 468b68e..f4e84fe 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorableDumpJob.scala @@ -5,8 +5,6 @@ import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields import com.twitter.scalding._ -import com.twitter.scalding._ -import com.twitter.scalding.typed.TDsl._ import com.twitter.scalding.typed.TDsl._ import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes diff --git a/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala new file mode 100644 index 0000000..1578258 --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/MatchBenchmarkJob.scala @@ -0,0 +1,29 @@ +package sandcrawler + +import cascading.pipe.Pipe +import com.twitter.scalding.Args +import com.twitter.scalding.TypedPipe +import com.twitter.scalding.TypedTsv +import parallelai.spyglass.base.JobBase + +class MatchBenchmarkJob(args: Args) extends JobBase(args) { + // TODO: Instantiate any subclass of Scorable specified in args. + val sc1 : Scorable = new BibjsonScorable() + val sc2 : Scorable = new BibjsonScorable() + val leftArgs = args + ("bibjson-input" -> List(args("left-bibjson"))) + val rightArgs = args + ("bibjson-input" -> List(args("right-bibjson"))) + val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(leftArgs) + val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(rightArgs) + + pipe1.join(pipe2).map { entry => + val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry + new ReduceOutput( + slug, + Scorable.computeSimilarity(features1, features2), + features1.json, + features2.json) + } + //TypedTsv doesn't work over case classes. + .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } + .write(TypedTsv[(String, Int, String, String)](args("output"))) +} diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 9b9c633..5aac032 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -30,6 +30,7 @@ abstract class Scorable { } object Scorable { + val MaxTitleLength = 1023 val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable def isValidSlug(slug : String) : Boolean = { diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 0b9868a..241db79 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -3,6 +3,7 @@ package sandcrawler import java.io.InputStream import scala.io.Source +import scala.util.parsing.json.JSONArray import scala.util.parsing.json.JSONObject object ScorableFeatures { @@ -10,11 +11,13 @@ object ScorableFeatures { val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet fileStream.close + val MinSlugLength = 8 // Static factory method - def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { new ScorableFeatures( title=if (title == null) "" else title, + authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a), year=year, doi=if (doi == null) "" else doi, sha1=if (sha1 == null) "" else sha1) @@ -23,13 +26,14 @@ object ScorableFeatures { // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). Create with above static factory method. -class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { +class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", sha1: String = "") { def toMap() : Map[String, Any] = - Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) + Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "sha1" -> sha1) - override def toString() : String = + override def toString() : String = { JSONObject(toMap).toString + } def toSlug() : String = { if (title == null) { @@ -38,7 +42,10 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty + || slug == null + || (ScorableFeatures.SlugBlacklist contains slug) + || (slug.length < ScorableFeatures.MinSlugLength)) Scorable.NoSlug else slug } } diff --git a/scalding/src/main/scala/sandcrawler/ScoreJob.scala b/scalding/src/main/scala/sandcrawler/ScoreJob.scala index 28e9132..107f504 100644 --- a/scalding/src/main/scala/sandcrawler/ScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/ScoreJob.scala @@ -2,59 +2,63 @@ package sandcrawler import cascading.pipe.Pipe import com.twitter.scalding.Args +import com.twitter.scalding.Stat import com.twitter.scalding.TypedPipe import com.twitter.scalding.TypedTsv import parallelai.spyglass.base.JobBase class ScoreJob(args: Args) extends JobBase(args) { - // TODO: Instantiate any subclass of Scorable specified in args. - val sc1 : Scorable = new GrobidScorable() - val sc2 : Scorable = new CrossrefScorable() - val pipe1 : TypedPipe[(String, ReduceFeatures)] = sc1.getInputPipe(args) - val pipe2 : TypedPipe[(String, ReduceFeatures)] = sc2.getInputPipe(args) - pipe1 - .addTrap(TypedTsv(args("output") + ".trapped")) - .join(pipe2) - .map { entry => - val (slug : String, (features1 : ReduceFeatures, features2 : ReduceFeatures)) = entry - new ReduceOutput( - slug, - Scorable.computeSimilarity(features1, features2), - features1.json, - features2.json) - } - //TypedTsv doesn't work over case classes. - .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } - .write(TypedTsv[(String, Int, String, String)](args("output"))) -} - -/* -// Ugly hack to get non-String information into ScoreJob above. -object ScoreJob { - var scorable1 : Option[Scorable] = None - var scorable2 : Option[Scorable] = None + val grobidRowCount = Stat("grobid-rows-filtered", "sandcrawler") + val crossrefRowCount = Stat("crossref-rows-filtered", "sandcrawler") + val joinedRowCount = Stat("joined-rows", "sandcrawler") + /* TODO: + val uniqueDoiCount = Stat("unique-doi-count", "sandcrawler") + val uniqueSha1Count = Stat("unique-sha1-count", "sandcrawler") + */ - def setScorable1(s : Scorable) { - scorable1 = Some(s) - } - - def getScorable1() : Scorable = { - scorable1 match { - case Some(s) => s - case None => null + val grobidScorable : Scorable = new GrobidScorable() + val crossrefScorable : Scorable = new CrossrefScorable() + val grobidPipe : TypedPipe[(String, ReduceFeatures)] = grobidScorable + .getInputPipe(args) + .map { r => + grobidRowCount.inc + r + } + val crossrefPipe : TypedPipe[(String, ReduceFeatures)] = crossrefScorable + .getInputPipe(args) + .map { r => + crossrefRowCount.inc + r } - } - def setScorable2(s: Scorable) { - scorable2 = Some(s) - } + val joinedPipe = grobidPipe + .addTrap(TypedTsv(args("output") + ".trapped")) + .join(crossrefPipe) + + /* TODO: + // Reduces to count unique SHA1 and DOI + joinedPipe + .map { case (_, (grobidFeatures, _)) => grobidFeatures.sha } + .distinct + .map { _ => uniqueSha1Count.inc } + joinedPipe + .map { case (_, (_, crossrefFeatures)) => crossrefFeatures.doi } + .distinct + .map { _ => uniqueDoiCount.inc } + */ - def getScorable2() : Scorable = { - scorable2 match { - case Some(s) => s - case None => null + // TypedTsv doesn't work over case classes. + joinedPipe + .map { case (slug, (grobidFeatures, crossrefFeatures)) => + joinedRowCount.inc + //val (slug : String, (grobidFeatures: ReduceFeatures, crossrefFeatures: ReduceFeatures)) = entry + new ReduceOutput( + slug, + Scorable.computeSimilarity(grobidFeatures, crossrefFeatures), + grobidFeatures.json, + crossrefFeatures.json) } - } + .map { entry => (entry.slug, entry.score, entry.json1, entry.json2) } + .write(TypedTsv[(String, Int, String, String)](args("output"))) } - */ diff --git a/scalding/src/main/scala/sandcrawler/StringUtilities.scala b/scalding/src/main/scala/sandcrawler/StringUtilities.scala index 2745875..e03b60d 100644 --- a/scalding/src/main/scala/sandcrawler/StringUtilities.scala +++ b/scalding/src/main/scala/sandcrawler/StringUtilities.scala @@ -36,7 +36,7 @@ object StringUtilities { // Source: https://stackoverflow.com/a/30076541/631051 def removePunctuation(s: String) : String = { - s.replaceAll("""[\p{Punct}]""", "") + s.replaceAll("""[\p{Punct}’·“”‘’“”«»「」]""", "") } // Adapted from: https://stackoverflow.com/a/16018452/631051 |