diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-27 16:35:52 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-27 16:35:52 -0700 |
commit | 309f40b66d474f12c0cfe60c449d43ae4bacb912 (patch) | |
tree | 510aa40e89d1563f98abd435d07f376487c626ac | |
parent | f8a0c99b270ebcd6e239c6e26190cf7200389ced (diff) | |
download | sandcrawler-309f40b66d474f12c0cfe60c449d43ae4bacb912.tar.gz sandcrawler-309f40b66d474f12c0cfe60c449d43ae4bacb912.zip |
basic crossref subtitle concatenation support
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 23 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala | 18 |
2 files changed, 40 insertions, 1 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 039fa85..f51c210 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -60,7 +60,28 @@ object CrossrefScorable { if (titles.isEmpty || titles == null) { None } else { - val title = titles(0) + val baseTitle: String = titles(0) + // TODO(bnewbold): this code block is horrible + val baseSubtitle: String = if (map contains "subtitle") { + val subtitles = map("subtitle").asInstanceOf[List[String]] + if (!subtitles.isEmpty && subtitles != null) { + val sub = subtitles(0) + if (sub != null && !sub.isEmpty && baseTitle != null) { + sub + } else { + "" + } + } else { + "" + } + } else { + "" + } + val title = if (baseSubtitle.isEmpty) { + baseTitle + } else { + baseTitle.concat(": ".concat(baseSubtitle)) + } if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) } } else { diff --git a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala index f598cae..c0d1cb0 100644 --- a/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala +++ b/scalding/src/test/scala/sandcrawler/CrossrefScorableTest.scala @@ -98,6 +98,24 @@ class CrossrefScorableTest extends FlatSpec with Matchers { result.slug shouldBe Scorable.NoSlug } + it should "handle subtitle" in { + val result = CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": ["just right!"], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") + result.slug shouldBe "shortbutnottooshortjustright" + } + + it should "handle empty subtitle" in { + val result = CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [""], "DOI": "10.123/asdf", "type":"journal-article", 
"author":[{ "given" : "W", "family" : "Gaier"}]}""") + result.slug shouldBe "shortbutnottooshort" + } + + it should "handle null subtitle" in { + val result = CrossrefScorable.jsonToMapFeatures( + """{"title": ["short but not too short"], "subtitle": [null], "DOI": "10.123/asdf", "type":"journal-article", "author":[{ "given" : "W", "family" : "Gaier"}]}""") + result.slug shouldBe "shortbutnottooshort" + } + it should "handle missing authors" in { val result = CrossrefScorable.jsonToMapFeatures(CrossrefStringWithNoAuthors) result.slug shouldBe Scorable.NoSlug |