about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main
diff options
context:
space:
mode:
author: bnewbold <bnewbold@archive.org> 2018-08-20 22:02:16 +0000
committer: bnewbold <bnewbold@archive.org> 2018-08-20 22:02:16 +0000
commit: 34fa226b27a8597ae1da788a41be2880b1cbf4fc (patch)
tree: 9aaa8365fb3facf5d88dabafdd61e70d7484f0ac /scalding/src/main
parent: af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (diff)
parent: 4f4571bbc1717c5ad9740377fdb8297da6632639 (diff)
download: sandcrawler-34fa226b27a8597ae1da788a41be2880b1cbf4fc.tar.gz
download: sandcrawler-34fa226b27a8597ae1da788a41be2880b1cbf4fc.zip
Merge branch 'little-things' into 'master'
Small clean-up

See merge request webgroup/sandcrawler!16
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 2
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 2
-rw-r--r-- scalding/src/main/scala/sandcrawler/ScorableFeatures.scala | 35
3 files changed, 21 insertions, 18 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index ff8201a..5d1eaf5 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -35,7 +35,7 @@ object CrossrefScorable {
new MapFeatures(Scorable.NoSlug, json)
} else {
// bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ]
- val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi)
+ val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi)
new MapFeatures(sf.toSlug, sf.toString)
}
} else {
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 9a09e05..d7a1eea 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -45,7 +45,7 @@ object GrobidScorable {
case None => MapFeatures(Scorable.NoSlug, json)
case Some(map) => {
if (map contains "title") {
- new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures
+ ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures
} else {
MapFeatures(Scorable.NoSlug, json)
}
diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
index 8ed3369..e71abfa 100644
--- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
+++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala
@@ -2,12 +2,21 @@ package sandcrawler
import scala.util.parsing.json.JSONObject
+object ScorableFeatures {
+ // Static factory method
+ def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = {
+ new ScorableFeatures(
+ title=if (title == null) "" else title,
+ year=year,
+ doi=if (doi == null) "" else doi,
+ sha1=if (sha1 == null) "" else sha1)
+ }
+}
// Contains features needed to make slug and to score (in combination
-// with a second ScorableFeatures).
-class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
-
- val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
+// with a second ScorableFeatures). Create with above static factory method.
+class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") {
+ val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements",
"article", "authorreply", "authorsreply", "bookreview", "bookreviews",
"casereport", "commentary", "commentaryon", "commenton", "commentto",
"contents", "correspondence", "dedication", "editorialadvisoryboard",
@@ -16,16 +25,11 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
"references", "results", "review", "reviewarticle", "summary", "title",
"name")
- def toMap() : Map[String, Any] = {
- Map("title" -> (if (title == null) "" else title),
- "year" -> year,
- "doi" -> (if (doi == null) "" else doi),
- "sha1" -> (if (sha1 == null) "" else sha1))
- }
+ def toMap() : Map[String, Any] =
+ Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1)
- override def toString() : String = {
- JSONObject(toMap()).toString
- }
+ override def toString() : String =
+ JSONObject(toMap).toString
def toSlug() : String = {
if (title == null) {
@@ -34,11 +38,10 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S
val unaccented = StringUtilities.removeAccents(title)
// Remove punctuation
val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "")
- if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug
+ if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug
}
}
- def toMapFeatures = {
+ def toMapFeatures : MapFeatures =
MapFeatures(toSlug, toString)
- }
}