diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-13 10:27:48 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-13 10:27:48 -0700 |
commit | b4f1acce5eccbb56291f82906d9c01534c7f1506 (patch) | |
tree | 96ff33ed95a4eb9304280b1d5f1ccb269c0d0424 /scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | |
parent | 1c6e1234974d8b6e4480a13ff5c4ff861c6d1deb (diff) | |
download | sandcrawler-b4f1acce5eccbb56291f82906d9c01534c7f1506.tar.gz sandcrawler-b4f1acce5eccbb56291f82906d9c01534c7f1506.zip |
Factored out ScorableFeatures.
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 30 |
1 files changed, 30 insertions, 0 deletions
// scalding/src/main/scala/sandcrawler/ScorableFeatures.scala (new file in this commit)
package sandcrawler

import scala.util.parsing.json.JSONObject

/**
 * Contains features needed to make slug and to score (in combination
 * with a second ScorableFeatures).
 *
 * @param title title of the work; the only field used to build the slug.
 *              May be null, in which case [[toSlug]] yields `Scorable.NoSlug`.
 * @param year  value stored under the "year" key of [[toMap]] (defaults to 0)
 * @param doi   value stored under the "doi" key of [[toMap]] (defaults to "")
 * @param sha1  value stored under the "sha1" key of [[toMap]] (defaults to "")
 */
class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
  /** Returns the four constructor arguments keyed by field name. */
  def toMap() : Map[String, Any] = {
    Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
  }

  /**
   * Renders [[toMap]] through `JSONObject`.
   *
   * NOTE(review): a null title is tolerated by [[toSlug]] but may not be
   * tolerated by the JSON formatter -- confirm before relying on
   * [[toMapFeatures]] for records without a title.
   */
  override def toString() : String = {
    JSONObject(toMap()).toString
  }

  /**
   * Builds the slug for this record from its title.
   *
   * The title is stripped of accents, cut at the first colon, lower-cased,
   * stripped of punctuation, and finally stripped of all whitespace.
   *
   * @return the slug, or `Scorable.NoSlug` when the title is null or nothing
   *         is left after normalization
   */
  def toSlug() : String = {
    val candidate = for {
      rawTitle <- Option(title)
      unaccented <- Option(StringUtilities.removeAccents(rawTitle))
      // Keep only the part before the first colon. takeWhile is used instead
      // of split(":")(0) because String.split drops trailing empty strings,
      // so a title made only of colons produced an empty array and an
      // ArrayIndexOutOfBoundsException.
      // Locale.ROOT keeps the slug independent of the JVM's default locale.
      mainTitle = unaccented.takeWhile(_ != ':').toLowerCase(java.util.Locale.ROOT)
      // Remove punctuation after cutting at the colon.
      stripped <- Option(StringUtilities.removePunctuation(mainTitle))
    } yield stripped.replaceAll("\\s", "")

    candidate.filter(_.nonEmpty).getOrElse(Scorable.NoSlug)
  }

  /** Pairs the slug with the JSON rendering of this record in a `MapFeatures`. */
  def toMapFeatures = {
    MapFeatures(toSlug, toString)
  }
}