diff options
author | bnewbold <bnewbold@archive.org> | 2018-08-20 22:02:16 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2018-08-20 22:02:16 +0000 |
commit | 34fa226b27a8597ae1da788a41be2880b1cbf4fc (patch) | |
tree | 9aaa8365fb3facf5d88dabafdd61e70d7484f0ac /scalding | |
parent | af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (diff) | |
parent | 4f4571bbc1717c5ad9740377fdb8297da6632639 (diff) | |
download | sandcrawler-34fa226b27a8597ae1da788a41be2880b1cbf4fc.tar.gz sandcrawler-34fa226b27a8597ae1da788a41be2880b1cbf4fc.zip |
Merge branch 'little-things' into 'master'
Small clean-up
See merge request webgroup/sandcrawler!16
Diffstat (limited to 'scalding')
6 files changed, 37 insertions, 32 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ff8201a..5d1eaf5 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -35,7 +35,7 @@ object CrossrefScorable { new MapFeatures(Scorable.NoSlug, json) } else { // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) + val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) new MapFeatures(sf.toSlug, sf.toString) } } else { diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 9a09e05..d7a1eea 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -45,7 +45,7 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures + ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 8ed3369..e71abfa 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -2,12 +2,21 @@ package sandcrawler import scala.util.parsing.json.JSONObject +object ScorableFeatures { + // Static factory method + def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + new ScorableFeatures( + title=if (title == null) "" else title, + year=year, + doi=if (doi == null) "" else doi, + sha1=if (sha1 == null) "" else sha1) + } +} // Contains features needed to make slug and to score (in combination -// with a second ScorableFeatures). -class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - - val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", +// with a second ScorableFeatures). Create with above static factory method. +class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", "article", "authorreply", "authorsreply", "bookreview", "bookreviews", "casereport", "commentary", "commentaryon", "commenton", "commentto", "contents", "correspondence", "dedication", "editorialadvisoryboard", @@ -16,16 +25,11 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S "references", "results", "review", "reviewarticle", "summary", "title", "name") - def toMap() : Map[String, Any] = { - Map("title" -> (if (title == null) "" else title), - "year" -> year, - "doi" -> (if (doi == null) "" else doi), - "sha1" -> (if (sha1 == null) "" else sha1)) - } + def toMap() : Map[String, Any] = + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) - override def toString() : String = { - JSONObject(toMap()).toString - } + override def toString() : String = + JSONObject(toMap).toString def toSlug() : String = { if (title == null) { @@ -34,11 +38,10 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug } } - def toMapFeatures = { + def toMapFeatures : MapFeatures = MapFeatures(toSlug, toString) - } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala index 0da0b9c..3291670 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseStatusCountTest.scala @@ -29,17 +29,17 @@ class HBaseStatusCountTest extends FunSpec with TupleConversions { val statusType1Bytes = Bytes.toBytes(statusType1) val statusType2Bytes = Bytes.toBytes(statusType2) - val sampleData : List[List[Array[Byte]]] = List( - // TODO(bnewbold): now to express a null (empty value) in this list? - List(Bytes.toBytes("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q"), statusType1Bytes), - List(Bytes.toBytes("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU"), statusType1Bytes), - List(Bytes.toBytes("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT"), statusType2Bytes), - List(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56"), statusType2Bytes), - List(Bytes.toBytes("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ"), statusType2Bytes), - List(Bytes.toBytes("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6"), statusType2Bytes), - List(Bytes.toBytes("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ"), statusType1Bytes), - List(Bytes.toBytes("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT"), statusType2Bytes) - ) + // TODO(bnewbold): now to express a null (empty value) in this list? + val sampleData : List[List[Array[Byte]]] = List( + ("sha1:K2DKSSVTXWPRMFDTWSTCQW3RVWRIOV3Q", statusType1Bytes), + ("sha1:C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ3WU", statusType1Bytes), + ("sha1:SDKUVHC3YNNEGH5WAG5ZAAXWAEBNX4WT", statusType2Bytes), + ("sha1:35985C3YNNEGH5WAG5ZAAXWAEBNXJW56", statusType2Bytes), + ("sha1:885C3YNNEGH5WAG5ZAAXWA8BNXJWT6CZ", statusType2Bytes), + ("sha1:00904C3YNNEGH5WAG5ZA9XWAEBNXJWT6", statusType2Bytes), + ("sha1:249C3YNNEGH5WAG5ZAAXWAEBNXJWT6CZ", statusType1Bytes), + ("sha1:095893C3YNNEGH5WAG5ZAAXWAEBNXJWT", statusType2Bytes)) + .map(pair => List(Bytes.toBytes(pair._1), pair._2)) val statusType1Count = sampleData.count(lst => lst(1) == statusType1Bytes) val statusType2Count = sampleData.count(lst => lst(1) == statusType2Bytes) diff --git a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala index 80d92aa..8a293fe 100644 --- a/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScorableFeaturesTest.scala @@ -5,12 +5,12 @@ import org.scalatest._ // scalastyle:off null class ScorableFeaturesTest extends FlatSpec with Matchers { private def titleToSlug(s : String) : String = { - new ScorableFeatures(title = s).toSlug + ScorableFeatures.create(title = s).toSlug } "toMapFeatures()" should "work with gnarly inputs" in { - new ScorableFeatures(title = null).toMapFeatures - new ScorableFeatures(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures + ScorableFeatures.create(title = null).toMapFeatures + ScorableFeatures.create(title = "something", doi = null, sha1 = null, year = 123).toMapFeatures } "mapToSlug()" should "extract the parts of titles before a colon" in { diff --git a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala index f92ba31..55ae614 100644 --- a/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala +++ b/scalding/src/test/scala/sandcrawler/ScoreJobTest.scala @@ -161,9 +161,11 @@ class ScoreJobTest extends FlatSpec with Matchers { .map { case (sha, json, status) => List(Bytes.toBytes(sha), Bytes.toBytes(json), Bytes.toBytes(status)) } .map { l => new Tuple(l.map(s => {new ImmutableBytesWritable(s)}):_*) } + // scalastyle:off null // Add example of lines without GROBID data val SampleData = SampleDataHead :+ new Tuple( new ImmutableBytesWritable(Bytes.toBytes("sha1:35985C3YNNEGH5WAG5ZAA88888888888")), null, null) + // scalastyle:on null JobTest("sandcrawler.ScoreJob") .arg("test", "") |