aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-27 16:35:52 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-27 16:35:52 -0700
commit 309f40b66d474f12c0cfe60c449d43ae4bacb912 (patch)
tree 510aa40e89d1563f98abd435d07f376487c626ac
parent f8a0c99b270ebcd6e239c6e26190cf7200389ced (diff)
download sandcrawler-309f40b66d474f12c0cfe60c449d43ae4bacb912.tar.gz
sandcrawler-309f40b66d474f12c0cfe60c449d43ae4bacb912.zip
basic crossref subtitle concatenation support
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 23
-rw-r--r-- scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala 18
2 files changed, 40 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 039fa85..f51c210 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -60,7 +60,28 @@ object CrossrefScorable {
if (titles.isEmpty || titles == null) {
None
} else {
- val title = titles(0)
+ val baseTitle: String = titles(0)
+ // TODO(bnewbold): this code block is horrible
+ val baseSubtitle: String = if (map contains "subtitle") {
+ val subtitles = map("subtitle").asInstanceOf[List[String]]
+ if (!subtitles.isEmpty && subtitles != null) {
+ val sub = subtitles(0)
+ if (sub != null && !sub.isEmpty && baseTitle != null) {
+ sub
+ } else {
+ ""
+ }
+ } else {
+ ""
+ }
+ } else {
+ ""
+ }
+ val title = if (baseSubtitle.isEmpty) {
+ baseTitle
+ } else {
+ baseTitle.concat(": ".concat(baseSubtitle))
+ }
if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title)
}
} else {
diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
index f598cae..c0d1cb0 100644
--- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
+++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala
@@ -98,6 +98,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers {
result.slug shouldBe Scorable.NoSlug
}
+ it should "handle subtitle" in {
+ val result = CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
+ result.slug shouldBe "shortbutnottooshortjustright"
+ }
+
+ it should "handle empty subtitle" in {
+ val result = CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
+ result.slug shouldBe "shortbutnottooshort"
+ }
+
+ it should "handle null subtitle" in {
+ val result = CrossrefScorable.jsonToMapFeatures(
+ """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""")
+ result.slug shouldBe "shortbutnottooshort"
+ }
+
it should "handle missing authors" in {
val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors)
result.slug shouldBe Scorable.NoSlug