diff options
author | bnewbold <bnewbold@archive.org> | 2018-08-20 22:02:16 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2018-08-20 22:02:16 +0000 |
commit | 34fa226b27a8597ae1da788a41be2880b1cbf4fc (patch) | |
tree | 9aaa8365fb3facf5d88dabafdd61e70d7484f0ac /scalding/src/main | |
parent | af0fa6edf3c21ac38a8ab4e0fb425e5471e6c3b6 (diff) | |
parent | 4f4571bbc1717c5ad9740377fdb8297da6632639 (diff) | |
download | sandcrawler-34fa226b27a8597ae1da788a41be2880b1cbf4fc.tar.gz sandcrawler-34fa226b27a8597ae1da788a41be2880b1cbf4fc.zip |
Merge branch 'little-things' into 'master'
Small clean-up
See merge request webgroup/sandcrawler!16
Diffstat (limited to 'scalding/src/main')
3 files changed, 21 insertions, 18 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index ff8201a..5d1eaf5 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -35,7 +35,7 @@ object CrossrefScorable { new MapFeatures(Scorable.NoSlug, json) } else { // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = new ScorableFeatures(title=titles(0), doi=doi) + val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) new MapFeatures(sf.toSlug, sf.toString) } } else { diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 9a09e05..d7a1eea 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -45,7 +45,7 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - new ScorableFeatures(Scorable.getString(map, "title"), sha1=key).toMapFeatures + ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } diff --git a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala index 8ed3369..e71abfa 100644 --- a/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala +++ b/scalding/src/main/scala/sandcrawler/ScorableFeatures.scala @@ -2,12 +2,21 @@ package sandcrawler import scala.util.parsing.json.JSONObject +object ScorableFeatures { + // Static factory method + def create(title : String, year : Int = 0, doi : String = "", sha1 : String = "") : ScorableFeatures = { + new ScorableFeatures( + title=if (title == null) "" else title, + year=year, + doi=if (doi == null) "" else doi, + sha1=if (sha1 == null) "" else sha1) + } +} // Contains features needed to make slug and to score (in combination -// with a second ScorableFeatures). -class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: String = "") { - - val slugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", +// with a second ScorableFeatures). Create with above static factory method. +class ScorableFeatures private(title : String, year: Int = 0, doi : String = "", sha1: String = "") { + val SlugBlacklist = Set( "abbreviations", "abstract", "acknowledgements", "article", "authorreply", "authorsreply", "bookreview", "bookreviews", "casereport", "commentary", "commentaryon", "commenton", "commentto", "contents", "correspondence", "dedication", "editorialadvisoryboard", @@ -16,16 +25,11 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S "references", "results", "review", "reviewarticle", "summary", "title", "name") - def toMap() : Map[String, Any] = { - Map("title" -> (if (title == null) "" else title), - "year" -> year, - "doi" -> (if (doi == null) "" else doi), - "sha1" -> (if (sha1 == null) "" else sha1)) - } + def toMap() : Map[String, Any] = + Map("title" -> title, "year" -> year, "doi" -> doi, "sha1" -> sha1) - override def toString() : String = { - JSONObject(toMap()).toString - } + override def toString() : String = + JSONObject(toMap).toString def toSlug() : String = { if (title == null) { @@ -34,11 +38,10 @@ class ScorableFeatures(title : String, year: Int = 0, doi : String = "", sha1: S val unaccented = StringUtilities.removeAccents(title) // Remove punctuation val slug = StringUtilities.removePunctuation((unaccented.toLowerCase())).replaceAll("\\s", "") - if (slug.isEmpty || slug == null || (slugBlacklist contains slug)) Scorable.NoSlug else slug + if (slug.isEmpty || slug == null || (SlugBlacklist contains slug)) Scorable.NoSlug else slug } } - def toMapFeatures = { + def toMapFeatures : MapFeatures = MapFeatures(toSlug, toString) - } } |