about | summary | refs | log | tree | commit | diff | stats
path: root/scalding/src/main/scala
diff options
context:
space:
mode:
author Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
committer Ellen Spertus <ellen.spertus@gmail.com> 2018-08-07 11:05:23 -0700
commit 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch)
tree f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src/main/scala
parent 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff)
download sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz
sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip
Added GrobidScorableTest, minor improvements.
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- scalding/src/main/scala/sandcrawler/CrossrefScorable.scala 19
-rw-r--r-- scalding/src/main/scala/sandcrawler/GrobidScorable.scala 24
2 files changed, 33 insertions, 10 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 0849aff..cf5849c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable {
.read
.toTypedPipe[String](new Fields("line"))
.map{ json : String =>
- HBaseCrossrefScore.crossrefToSlug(json) match {
+ CrossrefScorable.crossrefToSlug(json) match {
case Some(slug) => new MapFeatures(slug, json)
case None => new MapFeatures(Scorable.NoSlug, json)
}
}
}
}
+
+object CrossrefScorable {
+ def crossrefToSlug(json : String) : Option[String] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ // TODO: Don't ignore titles after the first.
+ val title = map("title").asInstanceOf[List[String]](0)
+ Some(Scorable.titleToSlug(title))
+ } else {
+ None
+ }
+ }
+ }
+ }
+}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
}
}
}
-/*
- def fromBytesWritableLocal(f: Fields): Pipe = {
- asList(f)
- .foldLeft(pipe) { (p, fld) => {
- p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
- Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
- }
- }}
+}
+
+object GrobidScorable {
+ def grobidToSlug(json : String) : Option[String] = {
+ Scorable.jsonToMap(json) match {
+ case None => None
+ case Some(map) => {
+ if (map contains "title") {
+ Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+ } else {
+ None
+ }
+ }
+ }
}
- */
}
+