diff options
| author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:05:23 -0700 | 
|---|---|---|
| committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-07 11:05:23 -0700 | 
| commit | 8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64 (patch) | |
| tree | f515c25882aebeb5edb8d8a13e06e457e19a4fb4 /scalding/src/main | |
| parent | 408123177b9e8afd145ea0f0fa1d6bb449f1bd20 (diff) | |
| download | sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.tar.gz sandcrawler-8dc3bf5c6f68d1fffa9f940ba1024ed95e76ed64.zip | |
Added GrobidScorableTest, minor improvements.
Diffstat (limited to 'scalding/src/main')
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 19 | ||||
| -rw-r--r-- | scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 24 | 
2 files changed, 33 insertions, 10 deletions
```diff
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index 0849aff..cf5849c 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -15,10 +15,27 @@ class CrossrefScorable extends Scorable {
       .read
       .toTypedPipe[String](new Fields("line"))
       .map{ json : String =>
-        HBaseCrossrefScore.crossrefToSlug(json) match {
+        CrossrefScorable.crossrefToSlug(json) match {
           case Some(slug) => new MapFeatures(slug, json)
           case None => new MapFeatures(Scorable.NoSlug, json)
         }
       }
   }
 }
+
+object CrossrefScorable {
+  def crossrefToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          // TODO: Don't ignore titles after the first.
+          val title = map("title").asInstanceOf[List[String]](0)
+          Some(Scorable.titleToSlug(title))
+        } else {
+          None
+        }
+      }
+    }
+  }
+}
diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
index 8da7708..25e5985 100644
--- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala
@@ -32,14 +32,20 @@ class GrobidScorable extends Scorable with HBasePipeConversions {
       }
     }
   }
-/*
-  def fromBytesWritableLocal(f: Fields): Pipe = {
-	asList(f)
-	  .foldLeft(pipe) { (p, fld) => {
-	    p.map(fld.toString -> fld.toString) { from: org.apache.hadoop.hbase.io.ImmutableBytesWritable =>
-            Option(from).map(x => Bytes.toString(x.get)).getOrElse(null)
-          }
-      }}
+}
+
+object GrobidScorable {
+  def grobidToSlug(json : String) : Option[String] = {
+    Scorable.jsonToMap(json) match {
+      case None => None
+      case Some(map) => {
+        if (map contains "title") {
+          Some(Scorable.titleToSlug(map("title").asInstanceOf[String]))
+        } else {
+          None
+        }
+      }
+    }
   }
- */
 }
+
```
