aboutsummaryrefslogtreecommitdiffstats
path: root/scalding/src/main
diff options
context:
space:
mode:
Diffstat (limited to 'scalding/src/main')
-rw-r--r--scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala57
1 files changed, 36 insertions, 21 deletions
diff --git a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
index 2a569a1..01d852e 100644
--- a/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
+++ b/scalding/src/main/scala/sandcrawler/HBaseCrossrefScoreJob.scala
@@ -76,7 +76,7 @@ class HBaseCrossrefScoreJob(args: Args) extends JobBase(args) with HBasePipeConv
((slug0: String, sha1 : String, grobidJson : String),
(slug1 : String, crossrefJson : String))) = entry
HBaseCrossrefScore.computeOutput(sha1, grobidJson, crossrefJson)}
- // Output: score, sha1, doi, grobid title, crossref title
+ // Output: score, sha1, doi, grobid title, crossref title
.write(TypedTsv[(Int, String, String, String, String)](args("output")))
}
@@ -134,22 +134,7 @@ object HBaseCrossrefScore {
}
}
- val FullTitleMatch = 100
- val TitleLeftMatchBase = 50
- val MaxTitleLeftMatch = 80
- val SlugMatch = 25
-
- def computeSimilarity(gTitle : String, cTitle : String) : Int = {
- assert(titleToSlug(gTitle) == titleToSlug(cTitle))
- if (gTitle == cTitle) {
- FullTitleMatch
- } else if (gTitle.startsWith(cTitle) || cTitle.startsWith(gTitle)) {
- math.min(TitleLeftMatchBase + math.abs(gTitle.length - cTitle.length),
- MaxTitleLeftMatch)
- } else {
- SlugMatch
- }
- }
+ val MaxScore = 1000
def computeOutput(sha1 : String, grobidJson : String, crossrefJson : String) :
// (score, sha1, doi, grobidTitle, crossrefTitle)
@@ -164,7 +149,7 @@ object HBaseCrossrefScore {
case Some(crossref) => {
val crossrefTitle = crossref("title").asInstanceOf[List[String]](0).toLowerCase()
- (computeSimilarity(grobidTitle, crossrefTitle),
+ (similarity(removeAccents(grobidTitle), removeAccents(crossrefTitle)),
sha1,
crossref("DOI").asInstanceOf[String],
"'" + grobidTitle + "'",
@@ -175,9 +160,7 @@ object HBaseCrossrefScore {
}
}
- // scalastyle:off
// Adapted from https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=blob;f=src/main/java/org/apache/commons/lang3/StringUtils.java;h=1d7b9b99335865a88c509339f700ce71ce2c71f2;hb=HEAD#l934
- // scalastyle:on
def removeAccents(s : String) : String = {
val replacements = Map(
'\u0141' -> 'L',
@@ -195,7 +178,39 @@ object HBaseCrossrefScore {
}
}
val pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+")
- pattern.matcher(sb).replaceAll("").toString
+ pattern.matcher(sb).replaceAll("")
+ }
+
+ // Adapted from: https://stackoverflow.com/a/16018452/631051
+ def similarity(s1 : String, s2 : String) : Int = {
+ val longer : String = if (s1.length > s2.length) s1 else s2
+ val shorter : String = if (s1.length > s2.length) s2 else s1
+ if (longer.length == 0) {
+ // Both strings are empty.
+ MaxScore
+ } else {
+ (longer.length - stringDistance(longer, shorter)) * MaxScore / longer.length
+ }
+ }
+
+ // Source: // https://oldfashionedsoftware.com/2009/11/19/string-distance-and-refactoring-in-scala/
+ def stringDistance(s1: String, s2: String): Int = {
+ val memo = scala.collection.mutable.Map[(List[Char],List[Char]),Int]()
+ def min(a:Int, b:Int, c:Int) = Math.min( Math.min( a, b ), c)
+ def sd(s1: List[Char], s2: List[Char]): Int = {
+ if (!memo.contains((s1, s2))) {
+ memo((s1,s2)) = (s1, s2) match {
+ case (_, Nil) => s1.length
+ case (Nil, _) => s2.length
+ case (c1::t1, c2::t2) =>
+ min( sd(t1,s2) + 1, sd(s1,t2) + 1,
+ sd(t1,t2) + (if (c1==c2) 0 else 1) )
+ }
+ }
+ memo((s1,s2))
+ }
+
+ sd( s1.toList, s2.toList )
}
}