You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2015/03/13 23:48:45 UTC
mahout git commit: Fixed incorrect MLlibTFIDF IDF calculation
Repository: mahout
Updated Branches:
refs/heads/master d5ea1f1be -> 9b169e7e7
Fixed incorrect MLlibTFIDF IDF calculation
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/9b169e7e
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/9b169e7e
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/9b169e7e
Branch: refs/heads/master
Commit: 9b169e7e710f1a4883a15b652ad8ece86a88960f
Parents: d5ea1f1
Author: Andrew Palumbo <ap...@apache.org>
Authored: Fri Mar 13 18:47:31 2015 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Fri Mar 13 18:47:31 2015 -0400
----------------------------------------------------------------------
.../src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala | 8 ++++----
.../scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala | 6 +++---
2 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/9b169e7e/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
index 5b78e18..c75ff20 100644
--- a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
@@ -36,7 +36,7 @@ class TFIDF extends TermWeight {
*
* Lucene 4.6's DefaultSimilarity TF-IDF calculation uses the formula:
*
- * sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0)
+ * sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0)
*
* Note: this is consistent with the MapReduce seq2sparse implementation of TF-IDF weights
* and is slightly different from Spark MLlib's TF-IDF calculation which is implemented as:
@@ -52,7 +52,7 @@ class TFIDF extends TermWeight {
def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = {
// Lucene 4.6 DefaultSimilarity's TF-IDF is implemented as:
- // sqrt(tf) * (log(numDocs / (df+1)) + 1)
+ // sqrt(tf) * (log(numDocs / (df + 1)) + 1)
math.sqrt(tf) * (math.log(numDocs / (df + 1).toDouble) + 1.0)
}
}
@@ -69,7 +69,7 @@ class MLlibTFIDF extends TermWeight {
* Note: this is not consistent with the MapReduce seq2sparse implementation of TF-IDF weights
* which is implemented using Lucene DefaultSimilarity's TF-IDF calculation:
*
- * sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0)
+ * sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0)
*
* @param tf term freq
* @param df doc freq
@@ -81,7 +81,7 @@ class MLlibTFIDF extends TermWeight {
// Spark MLlib's TF-IDF weight is implemented as:
// termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
- tf * (math.log((numDocs + 1.0) / (df + 1).toDouble) + 1.0)
+ tf * math.log((numDocs + 1.0) / (df + 1).toDouble)
}
}
http://git-wip-us.apache.org/repos/asf/mahout/blob/9b169e7e/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
index a0dec26..3ec5ec1 100644
--- a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
@@ -176,9 +176,9 @@ trait TFIDFtestBase extends DistributedMahoutSuite with Matchers {
// 11 -> 2, 8 -> 1, 4 -> 1)
abs(vectorizedDocuments(0, 0) - 0.0) should be < epsilon
- abs(vectorizedDocuments(0, 13) - 2.609437) should be < epsilon
- abs(vectorizedDocuments(1, 3) - 4.197224) should be < epsilon
- abs(vectorizedDocuments(3, 3) - 6.295836) should be < epsilon
+ abs(vectorizedDocuments(0, 13) - 1.609437) should be < epsilon
+ abs(vectorizedDocuments(1, 3) - 2.197224) should be < epsilon
+ abs(vectorizedDocuments(3, 3) - 3.295836) should be < epsilon
}
}
\ No newline at end of file