You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2015/03/13 23:48:45 UTC

mahout git commit: Fixed incorrect MLlibTFIDF IDF calculation

Repository: mahout
Updated Branches:
  refs/heads/master d5ea1f1be -> 9b169e7e7


Fixed incorrect MLlibTFIDF IDF calculation


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/9b169e7e
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/9b169e7e
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/9b169e7e

Branch: refs/heads/master
Commit: 9b169e7e710f1a4883a15b652ad8ece86a88960f
Parents: d5ea1f1
Author: Andrew Palumbo <ap...@apache.org>
Authored: Fri Mar 13 18:47:31 2015 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Fri Mar 13 18:47:31 2015 -0400

----------------------------------------------------------------------
 .../src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala   | 8 ++++----
 .../scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala    | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/9b169e7e/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
index 5b78e18..c75ff20 100644
--- a/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
+++ b/math-scala/src/main/scala/org/apache/mahout/nlp/tfidf/TFIDF.scala
@@ -36,7 +36,7 @@ class TFIDF extends TermWeight {
    *
    * Lucene 4.6's DefaultSimilarity TF-IDF calculation uses the formula:
    *
-   *   sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0)
+   *   sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0)
    *
    * Note: this is consistent with the MapReduce seq2sparse implementation of TF-IDF weights
    * and is slightly different from Spark MLlib's TF-IDF calculation which is implemented as:
@@ -52,7 +52,7 @@ class TFIDF extends TermWeight {
   def calculate(tf: Int, df: Int, length: Int, numDocs: Int): Double = {
 
     // Lucene 4.6 DefaultSimilarity's TF-IDF is implemented as:
-    // sqrt(tf) * (log(numDocs / (df+1)) + 1)
+    // sqrt(tf) * (log(numDocs / (df + 1)) + 1)
     math.sqrt(tf) * (math.log(numDocs / (df + 1).toDouble) + 1.0)
   }
 }
@@ -69,7 +69,7 @@ class MLlibTFIDF extends TermWeight {
    * Note: this is not consistent with the MapReduce seq2sparse implementation of TF-IDF weights
    * which is implemented using Lucene DefaultSimilarity's TF-IDF calculation:
    *
-   *   sqrt(termFreq) * log((numDocs / (docFreq + 1)) + 1.0)
+   *   sqrt(termFreq) * (log(numDocs / (docFreq + 1)) + 1.0)
    *
    * @param tf term freq
    * @param df doc freq
@@ -81,7 +81,7 @@ class MLlibTFIDF extends TermWeight {
 
     // Spark MLlib's TF-IDF weight is implemented as:
     // termFreq * log((numDocs + 1.0) / (docFreq + 1.0))
-    tf * (math.log((numDocs + 1.0) / (df + 1).toDouble) + 1.0)
+    tf *  math.log((numDocs + 1.0) / (df + 1).toDouble)
   }
 }
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/9b169e7e/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
----------------------------------------------------------------------
diff --git a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
index a0dec26..3ec5ec1 100644
--- a/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
+++ b/math-scala/src/test/scala/org/apache/mahout/nlp/tfidf/TFIDFtestBase.scala
@@ -176,9 +176,9 @@ trait TFIDFtestBase extends DistributedMahoutSuite with Matchers {
     //   11 -> 2, 8 -> 1, 4 -> 1)
 
     abs(vectorizedDocuments(0, 0) -  0.0) should be < epsilon
-    abs(vectorizedDocuments(0, 13) - 2.609437) should be < epsilon
-    abs(vectorizedDocuments(1, 3) - 4.197224) should be < epsilon
-    abs(vectorizedDocuments(3, 3) - 6.295836) should be < epsilon
+    abs(vectorizedDocuments(0, 13) - 1.609437) should be < epsilon
+    abs(vectorizedDocuments(1, 3) - 2.197224) should be < epsilon
+    abs(vectorizedDocuments(3, 3) - 3.295836) should be < epsilon
   }
 
 }
\ No newline at end of file