diff options
Diffstat (limited to 'scalding/src/main/scala/sandcrawler/ScorableFeatures.scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 63 |
1 files changed, 63 insertions, 0 deletions
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala new file mode 100644 index 0000000..93cd78d --- /dev/null +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -0,0 +1,63 @@ +package sandcrawler + +import java.io.InputStream + +import scala.io.Source +import scala.util.parsing.json.JSONArray +import scala.util.parsing.json.JSONObject + +object ScorableFeatures { + // TODO: Add exception handling. + val fileStream : InputStream = getClass.getResourceAsStream("/slug-denylist.txt") + val SlugDenylist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet + fileStream.close + val MinSlugLength = 8 + + // Static factory method + def create(title : String, authors : List[Any] = List(), year : Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1 : String = "") : ScorableFeatures = { + new ScorableFeatures( + title=if (title == null) "" else title, + authors=if (authors == null) List() else authors.map(a => if (a == null) "" else a), + year=year, + doi=if (doi == null) "" else doi, + fatcat_release=if (fatcat_release == null) "" else fatcat_release, + fatcat_work=if (fatcat_work == null) "" else fatcat_work, + sha1=if (sha1 == null) "" else sha1) + } +} + +// Contains features needed to make slug and to score (in combination +// with a second ScorableFeatures). Create with above static factory method. +class ScorableFeatures private(title : String, authors : List[Any] = List(), year: Int = 0, doi : String = "", fatcat_release : String = "", fatcat_work : String = "", sha1: String = "") { + + def toMap() : Map[String, Any] = + Map("title" -> title, "authors" -> JSONArray(authors), "year" -> year, "doi" -> doi, "fatcat_release" -> fatcat_release, "fatcat_work" -> fatcat_work, "sha1" -> sha1) + + override def toString() : String = { + JSONObject(toMap).toString + } + + def toSlug() : Option[String] = { + if (title == null) { + None + } else { + val unaccented = StringUtilities.removeAccents(title) + // Remove punctuation + val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") + if (slug.isEmpty + || slug == null + || (ScorableFeatures.SlugDenylist contains slug) + || (slug.length < ScorableFeatures.MinSlugLength)) { + None + } else { + Some(slug) + } + } + } + + def toMapFeatures : Option[MapFeatures] = + toSlug match { + case None => None + case Some(slug) => Some(MapFeatures(slug, toString)) + } +} |