From 309f40b66d474f12c0cfe60c449d43ae4bacb912 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 27 Aug 2018 16:35:52 -0700 Subject: basic crossref subtitle concatenation support --- .../main/scala/sandcrawler/CrossrefScorable.scala | 23 +++++++++++++++++++++- .../scala/sandcrawler/CrossrefScorableTest.scala | 18 +++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'scalding') diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 039fa85..f51c210 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -60,7 +60,28 @@ object CrossrefScorable { if (titles.isEmpty || titles == null) { None } else { - val title = titles(0) + val baseTitle: String = titles(0) + // TODO(bnewbold): this code block is horrible + val baseSubtitle: String = if (map contains "subtitle") { + val subtitles = map("subtitle").asInstanceOf[List[String]] + if (!subtitles.isEmpty && subtitles != null) { + val sub = subtitles(0) + if (sub != null && !sub.isEmpty && baseTitle != null) { + sub + } else { + "" + } + } else { + "" + } + } else { + "" + } + val title = if (baseSubtitle.isEmpty) { + baseTitle + } else { + baseTitle.concat(": ".concat(baseSubtitle)) + } if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) } } else { diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index f598cae..c0d1cb0 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -98,6 +98,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers { result.slug shouldBe Scorable.NoSlug } + it should "handle subtitle" in { + val result = CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but 
not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") + result.slug shouldBe "shortbutnottooshortjustright" + } + + it should "handle empty subtitle" in { + val result = CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") + result.slug shouldBe "shortbutnottooshort" + } + + it should "handle null subtitle" in { + val result = CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") + result.slug shouldBe "shortbutnottooshort" + } + it should "handle missing authors" in { val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) result.slug shouldBe Scorable.NoSlug -- cgit v1.2.3