diff options
author | bnewbold <bnewbold@archive.org> | 2018-08-23 22:55:39 +0000 |
---|---|---|
committer | bnewbold <bnewbold@archive.org> | 2018-08-23 22:55:39 +0000 |
commit | c6e9aa4226aa8ed02c80e829ddb1d3fd40103017 (patch) | |
tree | 7cadfce40b8e1873d95609bfeff41181ef5ac308 /scalding/src/main | |
parent | 03968da99d24d81e0224712056d1dea38cb8c70e (diff) | |
parent | 6b401b34f189475efb84e72dafa2124ac50b5ee8 (diff) | |
download | sandcrawler-c6e9aa4226aa8ed02c80e829ddb1d3fd40103017.tar.gz sandcrawler-c6e9aa4226aa8ed02c80e829ddb1d3fd40103017.zip |
Merge branch 'ellen-length-filtering' into 'master'
Filtering titles by length
See merge request webgroup/sandcrawler!21
Diffstat (limited to 'scalding/src/main')
3 files changed, 56 insertions, 13 deletions
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala index 5d1eaf5..ab33d03 100644 --- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala +++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala @@ -19,29 +19,55 @@ class CrossrefScorable extends Scorable with HBasePipeConversions { def getFeaturesPipe(args : Args)(implicit mode : Mode, flowDef : FlowDef) : TypedPipe[MapFeatures] = { getSource(args).read .toTypedPipe[String](new Fields("line")) + .filter { CrossrefScorable.keepRecord(_) } .map { CrossrefScorable.jsonToMapFeatures(_) } } } object CrossrefScorable { + def keepRecord(json : String) : Boolean = { + Scorable.jsonToMap(json) match { + case None => false + case Some(map) => { + mapToTitle(map) match { + case None => false + case Some(title) => title.length <= Scorable.MaxTitleLength + } + } + } + } + + // Returns None if title is null, empty, or too long. + def mapToTitle(map : Map[String, Any]) : Option[String] = { + if (map contains "title") { + val titles = map("title").asInstanceOf[List[String]] + if (titles.isEmpty || titles == null) { + None + } else { + val title = titles(0) + if (title == null || title.isEmpty || title.length > Scorable.MaxTitleLength) None else Some(title) + } + } else { + None + } + } + def jsonToMapFeatures(json : String) : MapFeatures = { Scorable.jsonToMap(json) match { case None => MapFeatures(Scorable.NoSlug, json) - case Some(map) => { - if ((map contains "title") && (map contains "DOI")) { - val titles = map("title").asInstanceOf[List[String]] - val doi = Scorable.getString(map, "DOI") - if (titles.isEmpty || titles == null || doi.isEmpty || doi == null) { - new MapFeatures(Scorable.NoSlug, json) - } else { - // bnewbold: not checking that titles(0) is non-null/non-empty; case would be, in JSON, "title": [ null ] - val sf : ScorableFeatures = ScorableFeatures.create(title=titles(0), doi=doi) - new MapFeatures(sf.toSlug, sf.toString) + case Some(map) => + mapToTitle(map) match { + case None => MapFeatures(Scorable.NoSlug, json) + case Some(title) => { + val doi = Scorable.getString(map, "DOI") + if (doi.isEmpty || doi == null) { + MapFeatures(Scorable.NoSlug, json) + } else { + val sf : ScorableFeatures = ScorableFeatures.create(title=title, doi=doi) + MapFeatures(sf.toSlug, sf.toString) + } } - } else { - new MapFeatures(Scorable.NoSlug, json) } - } } } } diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index e510f75..76f4f22 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -31,11 +31,27 @@ class GrobidScorable extends Scorable with HBasePipeConversions { } // TODO: Should I combine next two stages for efficiency? .collect { case (key, json, StatusOK) => (key, json) } + .filter { case (key, json) => GrobidScorable.keepRecord(json) } .map { entry : (String, String) => GrobidScorable.jsonToMapFeatures(entry._1, entry._2) } } } object GrobidScorable { + def keepRecord(json : String) : Boolean = { + Scorable.jsonToMap(json) match { + case None => false + case Some(map) => { + if (map contains "title") { + val title = Scorable.getString(map, "title") + title != null && title.length <= Scorable.MaxTitleLength + } else { + false + } + } + } + } + + def getHBaseSource(table : String, host : String) : HBaseSource = { HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL) } diff --git a/scalding/src/main/scala/sandcrawler/Scorable.scala b/scalding/src/main/scala/sandcrawler/Scorable.scala index 9b9c633..c704ed9 100644 --- a/scalding/src/main/scala/sandcrawler/Scorable.scala +++ b/scalding/src/main/scala/sandcrawler/Scorable.scala @@ -30,6 +30,7 @@ abstract class Scorable { } object Scorable { + val MaxTitleLength = 255 val NoSlug = "NO SLUG" // Used for slug if title is empty or unparseable def isValidSlug(slug : String) : Boolean = { |