You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/18 00:16:56 UTC

svn commit: r785836 - in /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors: Driver.java TF.java

Author: gsingers
Date: Wed Jun 17 22:16:56 2009
New Revision: 785836

URL: http://svn.apache.org/viewvc?rev=785836&view=rev
Log:
MAHOUT-126: Add TF weight and Driver support

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java
Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java?rev=785836&r1=785835&r2=785836&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java Wed Jun 17 22:16:56 2009
@@ -80,6 +80,10 @@
             abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
             withDescription("The output of the dictionary").withShortName("t").create();
 
+    Option weightOpt = obuilder.withLongName("weight").withRequired(true).withArgument(
+            abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).
+            withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
+
     Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
             abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
             withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
@@ -93,7 +97,8 @@
     Option helpOpt = obuilder.withLongName("help").
             withDescription("Print out help").withShortName("h").create();
     Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
-            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).create();
+            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt)
+            .withOption(weightOpt).create();
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
@@ -117,7 +122,17 @@
           }
           Directory dir = FSDirectory.open(file);
           IndexReader reader = IndexReader.open(dir, true);
-          Weight weight = new TFIDF();
+          Weight weight = null;
+          if(cmdLine.hasOption(weightOpt)) {
+            String wString = cmdLine.getValue(weightOpt).toString();
+            if(wString.equalsIgnoreCase("tf")) {
+              weight = new TF();
+            } else if (wString.equalsIgnoreCase("tfidf")) {
+              weight = new TFIDF();
+            } else {
+              throw new OptionException(weightOpt);
+            }
+          }
           String field = cmdLine.getValue(fieldOpt).toString();
           TermInfo termInfo = new CachedTermInfo(reader, field, 1, 99);
           VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java?rev=785836&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java Wed Jun 17 22:16:56 2009
@@ -0,0 +1,35 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
+
+
+/**
+ * {@link org.apache.mahout.utils.vectors.Weight} based on term frequency only:
+ * {@link #calculate(int, int, int, int)} returns the raw term frequency, unscaled.
+ */
+public class TF implements Weight {
+
+  @Override
+  public double calculate(int tf, int df, int length, int numDocs) {
+    // Only tf contributes to the weight; df, length and numDocs are unused here.
+    return tf;
+  }
+}