diff options
Diffstat (limited to 'scalding/src')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala | 20 | ||||
-rw-r--r-- | scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala | 80 |
2 files changed, 91 insertions, 9 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala index d3e78fe..30f76a0 100644 --- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala +++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala @@ -26,14 +26,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv .map('tei_json -> 'slug) { json : String => HBaseCrossrefScore.grobidToSlug(json)} - /* val crossrefSource = TextLine(args("input")) val crossrefPipe = crossrefSource .read .map('line -> 'slug) { - json : String => crossrefToSlug(json)} - + json : String => HBaseCrossrefScore.crossrefToSlug(json)} +/* statusPipe.groupBy { identity } .size .debug @@ -56,7 +55,20 @@ object HBaseCrossrefScore { } } + /** Slug of the first entry of a Crossref record's "title" array. Returns None for malformed JSON, + * a document that is not a JSON object, or a missing, empty or non-string title; never throws. */ + def crossrefToSlug(json : String) : Option[String] = { + JSON.parseFull(json) match { + case Some(record : Map[_, _]) => + record.asInstanceOf[Map[String, Any]].get("title") match { /* JSON object keys are always strings */ + case Some((first : String) :: _) => Some(titleToSlug(first)) + case _ => None + } + case _ => None + } + } + def titleToSlug(title : String) : String = { - title.split(":")(0) + title.takeWhile(_ != ':').toLowerCase() /* same prefix as split(":")(0), but cannot throw on titles such as ":" */ } } diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala index ab6a798..8bdc7a8 100644 --- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala +++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala @@ -53,27 +53,97 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers { val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle") val MalformedGrobidString = GrobidString.replace("}", "") + val CrossrefString = +""" +{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" }, + "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ], + "date-time" : "2017-10-23T17:19:16Z", + "timestamp" : {
"$numberLong" : "1508779156477" } }, + "reference-count" : 0, + "publisher" : "Elsevier BV", + "issue" : "3", + "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/", + "start" : { "date-parts" : [ [ 1996, 1, 1 ] ], + "date-time" : "1996-01-01T00:00:00Z", + "timestamp" : { "$numberLong" : "820454400000" } }, + "delay-in-days" : 0, "content-version" : "tdm" }], + "content-domain" : { "domain" : [], "crossmark-restriction" : false }, + "published-print" : { "date-parts" : [ [ 1996 ] ] }, + "DOI" : "10.1016/0987-7983(96)87729-2", + "type" : "journal-article", + "created" : { "date-parts" : [ [ 2002, 7, 25 ] ], + "date-time" : "2002-07-25T15:09:41Z", + "timestamp" : { "$numberLong" : "1027609781000" } }, + "page" : "186-187", + "source" : "Crossref", + "is-referenced-by-count" : 0, + "title" : [ "les ferments lactiques: classification, propriétés, utilisations agroalimentaires" ], + "prefix" : "10.1016", + "volume" : "9", + "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ], + "member" : "78", + "container-title" : [ "Journal de Pédiatrie et de Puériculture" ], + "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml", + "content-type" : "text/xml", + "content-version" : "vor", + "intended-application" : "text-mining" }, + { "URL" : + "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain", + "content-type" : "text/plain", + "content-version" : "vor", + "intended-application" : "text-mining" } ], + "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ], + "date-time" : "2015-09-03T10:03:43Z", + "timestamp" : { "$numberLong" : "1441274623000" } }, + "score" : 1, + "issued" : { "date-parts" : [ [ 1996 ] ] }, + "references-count" : 0, + "alternative-id" : [ "0987-7983(96)87729-2" ], + "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2", + "ISSN" : [ "0987-7983" ], + "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ], + "subject" : [ 
"Pediatrics, Perinatology, and Child Health" ] +} +""" + val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle") + val MalformedCrossrefString = CrossrefString.replace("}", "") + "titleToSlug()" should "extract the parts of titles before a colon" in { - val slug = HBaseCrossrefScore.titleToSlug("hello:there") + val slug = HBaseCrossrefScore.titleToSlug("HELLO:there") slug shouldBe "hello" } it should "extract an entire colon-less string" in { - val slug = HBaseCrossrefScore.titleToSlug("hello there") + val slug = HBaseCrossrefScore.titleToSlug("hello THERE") slug shouldBe "hello there" } "grobidToSlug()" should "get the right slug for a grobid json string" in { val slug = HBaseCrossrefScore.grobidToSlug(GrobidString) - slug should contain ("Dummy Example File") + slug should contain ("dummy example file") } - "grobidToSlug()" should "return None if given json string without title" in { + it should "return None if given json string without title" in { val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle) slug shouldBe None } - "grobidToSlug()" should "return None if given a malformed json string" in { + it should "return None if given a malformed json string" in { val slug = HBaseCrossrefScore.grobidToSlug(MalformedGrobidString) slug shouldBe None } + + "crossrefToSlug()" should "get the right slug for a crossref json string" in { + val slug = HBaseCrossrefScore.crossrefToSlug(CrossrefString) + slug should contain ("les ferments lactiques") + } + + it should "return None if given json string without title" in { + val slug = HBaseCrossrefScore.crossrefToSlug(CrossrefStringWithoutTitle) /* fixture has every "title" renamed to "nottitle" */ + slug shouldBe None + } + + it should "return None if given a malformed json string" in { + val slug = HBaseCrossrefScore.crossrefToSlug(MalformedCrossrefString) /* fixture has every "}" stripped */ + slug shouldBe None + } } |