diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 11:14:22 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-21 11:14:22 -0700 | 
| commit | 96f45740599a9d9ed06cf22eba8cd0a0e0927a42 (patch) | |
| tree | f9064ccd26c02a813b128e36827418a1f5840ba3 /scalding | |
| parent | 34fa226b27a8597ae1da788a41be2880b1cbf4fc (diff) | |
| parent | f73796fe22d96e1d5ad559ffcb8dfe8fc10b3c20 (diff) | |
| download | sandcrawler-96f45740599a9d9ed06cf22eba8cd0a0e0927a42.tar.gz sandcrawler-96f45740599a9d9ed06cf22eba8cd0a0e0927a42.zip | |
Merge branch 'strings'
Resolved conflicts in:
    scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
Diffstat (limited to 'scalding')
4 files changed, 65 insertions, 9 deletions
| diff --git a/scalding/src/main/resources/slug-blacklist.txt b/scalding/src/main/resources/slug-blacklist.txt new file mode 100644 index 0000000..7dc701f --- /dev/null +++ b/scalding/src/main/resources/slug-blacklist.txt @@ -0,0 +1,34 @@ +abbreviations +abstract +acknowledgements +article +authorreply +authorsreply +bookreview +bookreviews +casereport +commentary +commentaryon +commenton +commentto +contents +correspondence +dedication +editorialadvisoryboard +focus +hypothesis +inbrief +introduction +introductiontotheissue +lettertotheeditor +listofabbreviations +note +overview +preface +references +results +review +reviewarticle +summary +title +name diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index e71abfa..0b9868a 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -1,8 +1,16 @@  package sandcrawler +import java.io.InputStream + +import scala.io.Source  import scala.util.parsing.json.JSONObject  object ScorableFeatures { +  // TODO: Add exception handling. +  val fileStream : InputStream = getClass.getResourceAsStream("/slug-blacklist.txt") +  val SlugBlacklist : Set[String] = Source.fromInputStream(fileStream).getLines.toSet +  fileStream.close +    // Static factory method    def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {      new ScorableFeatures( @@ -16,14 +24,6 @@ object ScorableFeatures {  // Contains features needed to make slug and to score (in combination  // with a second ScorableFeatures). Create with above static factory method.  class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { -  val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", -    "article", "authorreply", "authorsreply", "bookreview", "bookreviews", -    "casereport", "commentary", "commentaryon", "commenton", "commentto", -    "contents", "correspondence", "dedication", "editorialadvisoryboard", -    "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", -    "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", -    "references", "results", "review", "reviewarticle", "summary", "title", -    "name")    def toMap() : Map[String, Any] =      Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) @@ -38,7 +38,7 @@ class ScorableFeatures private(title : String, year: Int = 0, doi : String = "",        val unaccented = StringUtilities.removeAccents(title)        // Remove punctuation        val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") -      if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug +      if (slug.isEmpty || slug == null || (ScorableFeatures.SlugBlacklist contains slug)) Scorable.NoSlug else slug      }    } diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 8a293fe..5a22ef8 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -1,9 +1,30 @@  package sandcrawler +import java.io.InputStream + +import scala.io.Source +  import org.scalatest._  // scalastyle:off null  class ScorableFeaturesTest extends FlatSpec with Matchers { + +  // TODO: Remove this when we're convinced that our file-reading code +  // works. (I'm already convinced. --Ellen) +  "read slugs" should "work" in { +    val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", +      "article", "authorreply", "authorsreply", "bookreview", "bookreviews", +      "casereport", "commentary", "commentaryon", "commenton", "commentto", +      "contents", "correspondence", "dedication", "editorialadvisoryboard", +      "focus", "hypothesis", "inbrief", "introduction", "introductiontotheissue", +      "lettertotheeditor", "listofabbreviations", "note", "overview", "preface", +      "references", "results", "review", "reviewarticle", "summary", "title", +      "name") + +    ScorableFeatures.SlugBlacklist.size shouldBe SlugBlacklist.size +    for (s <- ScorableFeatures.SlugBlacklist) SlugBlacklist should contain (s) +  } +    private def titleToSlug(s : String) : String = {      ScorableFeatures.create(title = s).toSlug    } diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index 55ae614..00e4659 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -163,6 +163,7 @@ class ScoreJobTest extends FlatSpec with Matchers {    // scalastyle:off null    // Add example of lines without GROBID data +  // scalastyle:off null    val SampleData = SampleDataHead :+ new Tuple(      new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null)    // scalastyle:on null | 
