diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-20 15:16:43 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-20 15:16:43 -0700 |
commit | f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20 (patch) | |
tree | 454c1becaba40ec9ea6d5c8a349dee050d3ed03d /scalding | |
parent | af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (diff) | |
download | sandcrawler-f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20.tar.gz sandcrawler-f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20.zip |
Reads blacklist from file.
Diffstat (limited to 'scalding')
4 files changed, 77 insertions, 18 deletions
diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt new file mode 100644 index 0000000..7dc701f --- /dev/null +++ b/scalding/src/main/resources/slug-blacklist.txt @@ -0,0 +1,34 @@ +abbreviations +abstract +acknowledgements +article +authorreply +authorsreply +bookreview +bookreviews +casereport +commentary +commentaryon +commenton +commentto +contents +correspondence +dedication +editorialadvisoryboard +focus +hypothesis +inbrief +introduction +introductiontotheissue +lettertotheeditor +listofabbreviations +note +overview +preface +references +results +review +reviewarticle +summary +title +name diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 8ed3369..610f1a4 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -1,26 +1,18 @@ package sandcrawler -import scala.util.parsing.json.JSONObject +import java.io.InputStream +import scala.io.Source +import scala.util.parsing.json.JSONObject // Contains features needed to make slug and to score (in combination // with a second ScorableFeatures). class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - - val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", - "article", "authorreply", "authorsreply", "bookreview", "bookreviews", - "casereport", "commentary", "commentaryon", "commenton", "commentto", - "contents", "correspondence", "dedication", "editorialadvisoryboard", - "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", - "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", - "references", "results", "review", "reviewarticle", "summary", "title", - "name") - - def toMap() : Map[String, Any] = { - Map("title" -> (if (title == null) "" else title), - "year" -> year, - "doi" -> (if (doi == null) "" else doi), - "sha1" -> (if (sha1 == null) "" else sha1)) + def toMap() : Map[String, Any] = { Map( + "title" -> (if (title == null) "" else title), + "year" -> year, + "doi" -> (if (doi == null) "" else doi), + "sha1" -> (if (sha1 == null) "" else sha1)) } override def toString() : String = { @@ -34,11 +26,22 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug + // scalastyle:off if.brace + if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) + Scorable.NoSlug + else + slug } } - def toMapFeatures = { + def toMapFeatures : MapFeatures = { MapFeatures(toSlug, toString) } } + +object ScorableFeatures { + // TODO: Add exception handling. + val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") + val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet + fileStream.close +} diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 80d92aa..a9a90ec 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -1,9 +1,30 @@ package sandcrawler +import java.io.InputStream + +import scala.io.Source + import org.scalatest._ // scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { + + // TODO: Remove this when we're convinced that our file-reading code + // works. (I'm already convinced. --Ellen) + "read slugs" should "work" in { + val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", + "article", "authorreply", "authorsreply", "bookreview", "bookreviews", + "casereport", "commentary", "commentaryon", "commenton", "commentto", + "contents", "correspondence", "dedication", "editorialadvisoryboard", + "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", + "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", + "references", "results", "review", "reviewarticle", "summary", "title", + "name") + + ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size + for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s) + } + private def titleToSlug(s : String) : String = { new ScorableFeatures(title = s).toSlug } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index f92ba31..5516869 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -162,6 +162,7 @@ class ScoreJobTest extends FlatSpec with Matchers { .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) } // Add example of lines without GROBID data + // scalastyle:off null val SampleData = SampleDataHead :+ new Tuple( new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null) |