diff options
author | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 13:17:49 -0700 |
---|---|---|
committer | Ellen Spertus <ellen.spertus@gmail.com> | 2018-08-22 13:17:49 -0700 |
commit | 9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33 (patch) | |
tree | 16680eb1bed750fac2980bda70ced41df28d98c0 /scalding/src/main/scala | |
parent | 9cc24a40509f62b789ff1fa97913bef32589a288 (diff) | |
download | sandcrawler-9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33.tar.gz sandcrawler-9fb9a35b15ed9b553ad4f938dc4e636e5d91ac33.zip |
Added title-length filtering to CrossrefScorable.
Diffstat (limited to 'scalding/src/main/scala')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 50 |
1 files changed, 37 insertions, 13 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 5d1eaf5..0431c63 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -19,29 +19,53 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) + .filter { CrossrefScorable.keepRecord(_) } .map { CrossrefScorable.jsonToMapFeatures(_) } } } object CrossrefScorable { + def keepRecord(json : String) : Boolean = { + Scorable.jsonToMap(json) match { + case None => false + case Some(map) => { + mapToTitle(map) match { + case None => false + case Some(title) => title.length <= Scorable.MaxTitleLength + } + } + } + } + + // Returns None if title is null, empty, or too long. + def mapToTitle(map : Map[String, Any]) : Option[String] = { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty || titles == null) { + None + } else { + val title = titles(0) + if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) + } + } else None + } + def jsonToMapFeatures(json : String) : MapFeatures = { Scorable.jsonToMap(json) match { case None => MapFeatures(Scorable.NoSlug, json) - case Some(map) => { - if ((map contains "title") && (map contains "DOI")) { - val titles = map("title").asInstanceOf[List[String]] - val doi = Scorable.getString(map, "DOI") - if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { - new MapFeatures(Scorable.NoSlug, json) - } else { - // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) - new MapFeatures(sf.toSlug, sf.toString) + case Some(map) => + mapToTitle(map) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(title) => { + val doi = Scorable.getString(map, "DOI") + if (doi.isEmpty || doi == null) { + MapFeatures(Scorable.NoSlug, json) + } else { + val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi) + MapFeatures(sf.toSlug, sf.toString) + } } - } else { - new MapFeatures(Scorable.NoSlug, json) } - } } } } |