about summary refs log tree commit diff stats
path: root/scalding
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-21 11:14:22 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-21 11:14:22 -0700
commit 96f45740599a9d9ed06cf22eba8cd0a0e0927a42 (patch)
tree f9064ccd26c02a813b128e36827418a1f5840ba3 /scalding
parent 34fa226b27a8597ae1da788a41be2880b1cbf4fc (diff)
parent f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20 (diff)
download sandcrawler-96f45740599a9d9ed06cf22eba8cd0a0e0927a42.tar.gz
sandcrawler-96f45740599a9d9ed06cf22eba8cd0a0e0927a42.zip
Merge branch 'strings'
Resolved conflicts in: scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
Diffstat (limited to 'scalding')
-rw-r--r-- scalding/src/main/resources/slug-blacklist.txt 34
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala 18
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala 21
-rw-r--r-- scalding/src/test/scala/sandcrawler/ScoreJobTest.scala 1
4 files changed, 65 insertions, 9 deletions
diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt
new file mode 100644
index 0000000..7dc701f
--- /dev/null
+++ b/scalding/src/main/resources/slug-blacklist.txt
@@ -0,0 +1,34 @@
+abbreviations
+abstract
+acknowledgements
+article
+authorreply
+authorsreply
+bookreview
+bookreviews
+casereport
+commentary
+commentaryon
+commenton
+commentto
+contents
+correspondence
+dedication
+editorialadvisoryboard
+focus
+hypothesis
+inbrief
+introduction
+introductiontotheissue
+lettertotheeditor
+listofabbreviations
+note
+overview
+preface
+references
+results
+review
+reviewarticle
+summary
+title
+name
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index e71abfa..0b9868a 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -1,8 +1,16 @@
package sandcrawler
+import java.io.InputStream
+
+import scala.io.Source
import scala.util.parsing.json.JSONObject
object ScorableFeatures {
+ // TODO: Add exception handling.
+ val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt")
+ val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet
+ fileStream.close
+
// Static factory method
def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
new ScorableFeatures(
@@ -16,14 +24,6 @@ object ScorableFeatures {
// Contains features needed to make slug and to score (in combination
// with a second ScorableFeatures). Create with above static factory method.
class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
- val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
- "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
- "casereport", "commentary", "commentaryon", "commenton", "commentto",
- "contents", "correspondence", "dedication", "editorialadvisoryboard",
- "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
- "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
- "references", "results", "review", "reviewarticle", "summary", "title",
- "name")
def toMap() : Map[String, Any] =
Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
@@ -38,7 +38,7 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug
+ if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug
}
}
diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
index 8a293fe..5a22ef8 100644
--- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala
@@ -1,9 +1,30 @@
package sandcrawler
+import java.io.InputStream
+
+import scala.io.Source
+
import org.scalatest._
// scalastyle:off null
class ScorableFeaturesTest extends FlatSpec with Matchers {
+
+ // TODO: Remove this when we're convinced that our file-reading code
+ // works. (I'm already convinced. --Ellen)
+ "read slugs" should "work" in {
+ val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
+ "article", "authorreply", "authorsreply", "bookreview", "bookreviews",
+ "casereport", "commentary", "commentaryon", "commenton", "commentto",
+ "contents", "correspondence", "dedication", "editorialadvisoryboard",
+ "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue",
+ "lettertotheeditor", "listofabbreviations", "note", "overview", "preface",
+ "references", "results", "review", "reviewarticle", "summary", "title",
+ "name")
+
+ ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size
+ for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s)
+ }
+
private def titleToSlug(s : String) : String = {
ScorableFeatures.create(title = s).toSlug
}
diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
index 55ae614..00e4659 100644
--- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
+++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala
@@ -163,6 +163,7 @@ class ScoreJobTest extends FlatSpec with Matchers {
// scalastyle:off null
// Add example of lines without GROBID data
+ // scalastyle:off null
val SampleData = SampleDataHead :+ new Tuple(
new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)
// scalastyle:on null