From 6ea7b7fdb9330e69afbbe2d2afe3e6b8c83fb4fb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 23 Aug 2018 17:50:43 -0700 Subject: author parsing (and year, for crossref) --- scalding/src/main/scala/sandcrawler/GrobidScorable.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'scalding/src/main/scala/sandcrawler/GrobidScorable.scala') diff --git a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala index 76f4f22..c55cb40 100644 --- a/scalding/src/main/scala/sandcrawler/GrobidScorable.scala +++ b/scalding/src/main/scala/sandcrawler/GrobidScorable.scala @@ -51,6 +51,16 @@ object GrobidScorable { } } + def mapToAuthorList(map : Map[String, Any]) : List[String] = { + if (map contains "authors") { + val objArray = map("authors").asInstanceOf[List[Any]].map(e => e.asInstanceOf[Map[String,Any]]) + objArray + .filter(e => e contains "name") + .map(e => e.get("name").get.asInstanceOf[String]) + } else { + List() + } + } def getHBaseSource(table : String, host : String) : HBaseSource = { HBaseBuilder.build(table, host, List("grobid0:metadata", "grobid0:status_code"), SourceMode.SCAN_ALL) @@ -61,7 +71,9 @@ object GrobidScorable { case None => MapFeatures(Scorable.NoSlug, json) case Some(map) => { if (map contains "title") { - ScorableFeatures.create(title=Scorable.getString(map, "title"), sha1=key).toMapFeatures + val authors: List[String] = mapToAuthorList(map) + val title = Scorable.getString(map, "title") + ScorableFeatures.create(title=title, authors=authors, sha1=key).toMapFeatures } else { MapFeatures(Scorable.NoSlug, json) } -- cgit v1.2.3