aboutsummaryrefslogtreecommitdiffstats
path: root/scalding
diff options
context:
space:
mode:
Diffstat (limited to 'scalding')
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala20
-rw-r--r--scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala80
2 files changed, 91 insertions, 9 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index d3e78fe..30f76a0 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -26,14 +26,13 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
.map('tei_json -> 'slug) {
json : String => HBaseCrossrefScore.grobidToSlug(json)}
- /*
val crossrefSource = TextLine(args("input"))
val crossrefPipe = crossrefSource
.read
.map('line -> 'slug) {
- json : String => crossrefToSlug(json)}
-
+ json : String => HBaseCrossrefScore.crossrefToSlug(json)}
+/*
statusPipe.groupBy { identity }
.size
.debug
@@ -56,7 +55,20 @@ object HBaseCrossrefScore {
}
}
+ def crossrefToSlug(json : String) : Option[String] = {
+ val jsonObject = JSON.parseFull(json)
+ if (jsonObject == None) {
+ None
+ } else {
+ val globalMap = jsonObject.get.asInstanceOf[Map[String, Any]]
+ globalMap.get("title") match {
+ case Some(title) => Some(titleToSlug(title.asInstanceOf[List[String]](0)))
+ case None => None
+ }
+ }
+ }
+
def titleToSlug(title : String) : String = {
- title.split(":")(0)
+ title.split(":")(0).toLowerCase()
}
}
diff --git a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
index ab6a798..8bdc7a8 100644
--- a/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
+++ b/scalding/src/test/scala/sandcrawler/HBaseCrossrefScoreTest.scala
@@ -53,27 +53,97 @@ class HBaseCrossrefScoreTest extends FlatSpec with Matchers {
val GrobidStringWithoutTitle = GrobidString.replace("title", "nottitle")
val MalformedGrobidString = GrobidString.replace("}", "")
+ val CrossrefString =
+"""
+{ "_id" : { "$oid" : "5a553d5988a035a45bf50ed3" },
+ "indexed" : { "date-parts" : [ [ 2017, 10, 23 ] ],
+ "date-time" : "2017-10-23T17:19:16Z",
+ "timestamp" : { "$numberLong" : "1508779156477" } },
+ "reference-count" : 0,
+ "publisher" : "Elsevier BV",
+ "issue" : "3",
+ "license" : [ { "URL" : "http://www.elsevier.com/tdm/userlicense/1.0/",
+ "start" : { "date-parts" : [ [ 1996, 1, 1 ] ],
+ "date-time" : "1996-01-01T00:00:00Z",
+ "timestamp" : { "$numberLong" : "820454400000" } },
+ "delay-in-days" : 0, "content-version" : "tdm" }],
+ "content-domain" : { "domain" : [], "crossmark-restriction" : false },
+ "published-print" : { "date-parts" : [ [ 1996 ] ] },
+ "DOI" : "10.1016/0987-7983(96)87729-2",
+ "type" : "journal-article",
+ "created" : { "date-parts" : [ [ 2002, 7, 25 ] ],
+ "date-time" : "2002-07-25T15:09:41Z",
+ "timestamp" : { "$numberLong" : "1027609781000" } },
+ "page" : "186-187",
+ "source" : "Crossref",
+ "is-referenced-by-count" : 0,
+ "title" : [ "les ferments lactiques: classification, propriétés, utilisations agroalimentaires" ],
+ "prefix" : "10.1016",
+ "volume" : "9",
+ "author" : [ { "given" : "W", "family" : "Gaier", "affiliation" : [] } ],
+ "member" : "78",
+ "container-title" : [ "Journal de Pédiatrie et de Puériculture" ],
+ "link" : [ { "URL" : "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/xml",
+ "content-type" : "text/xml",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" },
+ { "URL" :
+ "http://api.elsevier.com/content/article/PII:0987-7983(96)87729-2?httpAccept=text/plain",
+ "content-type" : "text/plain",
+ "content-version" : "vor",
+ "intended-application" : "text-mining" } ],
+ "deposited" : { "date-parts" : [ [ 2015, 9, 3 ] ],
+ "date-time" : "2015-09-03T10:03:43Z",
+ "timestamp" : { "$numberLong" : "1441274623000" } },
+ "score" : 1,
+ "issued" : { "date-parts" : [ [ 1996 ] ] },
+ "references-count" : 0,
+ "alternative-id" : [ "0987-7983(96)87729-2" ],
+ "URL" : "http://dx.doi.org/10.1016/0987-7983(96)87729-2",
+ "ISSN" : [ "0987-7983" ],
+ "issn-type" : [ { "value" : "0987-7983", "type" : "print" } ],
+ "subject" : [ "Pediatrics, Perinatology, and Child Health" ]
+}
+"""
+ val CrossrefStringWithoutTitle = CrossrefString.replace("title", "nottitle")
+ val MalformedCrossrefString = CrossrefString.replace("}", "")
+
"titleToSlug()" should "extract the parts of titles before a colon" in {
- val slug = HBaseCrossrefScore.titleToSlug("hello:there")
+ val slug = HBaseCrossrefScore.titleToSlug("HELLO:there")
slug shouldBe "hello"
}
it should "extract an entire colon-less string" in {
- val slug = HBaseCrossrefScore.titleToSlug("hello there")
+ val slug = HBaseCrossrefScore.titleToSlug("hello THERE")
slug shouldBe "hello there"
}
"grobidToSlug()" should "get the right slug for a grobid json string" in {
val slug = HBaseCrossrefScore.grobidToSlug(GrobidString)
- slug should contain ("Dummy Example File")
+ slug should contain ("dummy example file")
}
- "grobidToSlug()" should "return None if given json string without title" in {
+ it should "return None if given json string without title" in {
val slug = HBaseCrossrefScore.grobidToSlug(GrobidStringWithoutTitle)
slug shouldBe None
}
- "grobidToSlug()" should "return None if given a malformed json string" in {
+ it should "return None if given a malformed json string" in {
val slug = HBaseCrossrefScore.grobidToSlug(MalformedGrobidString)
slug shouldBe None
}
+
+ "crossrefToSlug()" should "get the right slug for a crossref json string" in {
+ val slug = HBaseCrossrefScore.crossrefToSlug(CrossrefString)
+ slug should contain ("les ferments lactiques")
+ }
+
+ it should "return None if given json string without title" in {
+ val slug = HBaseCrossrefScore.grobidToSlug(CrossrefStringWithoutTitle)
+ slug shouldBe None
+ }
+
+ it should "return None if given a malformed json string" in {
+ val slug = HBaseCrossrefScore.grobidToSlug(MalformedCrossrefString)
+ slug shouldBe None
+ }
}