You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/18 00:16:56 UTC
svn commit: r785836 - in
/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors:
Driver.java TF.java
Author: gsingers
Date: Wed Jun 17 22:16:56 2009
New Revision: 785836
URL: http://svn.apache.org/viewvc?rev=785836&view=rev
Log:
MAHOUT-126: Add TF weight and Driver support
Added:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java?rev=785836&r1=785835&r2=785836&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java Wed Jun 17 22:16:56 2009
@@ -80,6 +80,10 @@
abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
withDescription("The output of the dictionary").withShortName("t").create();
+ Option weightOpt = obuilder.withLongName("weight").withRequired(true).withArgument(
+ abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).
+ withDescription("The kind of weight to use. Currently TF or TFIDF").withShortName("w").create();
+
Option delimiterOpt = obuilder.withLongName("delimiter").withRequired(false).withArgument(
abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).
withDescription("The delimiter for outputing the dictionary").withShortName("l").create();
@@ -93,7 +97,8 @@
Option helpOpt = obuilder.withLongName("help").
withDescription("Print out help").withShortName("h").create();
Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
- .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).create();
+ .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt)
+ .withOption(weightOpt).create();
try {
Parser parser = new Parser();
parser.setGroup(group);
@@ -117,7 +122,17 @@
}
Directory dir = FSDirectory.open(file);
IndexReader reader = IndexReader.open(dir, true);
- Weight weight = new TFIDF();
+ Weight weight = null;
+ if(cmdLine.hasOption(weightOpt)) {
+ String wString = cmdLine.getValue(weightOpt).toString();
+ if(wString.equalsIgnoreCase("tf")) {
+ weight = new TF();
+ } else if (wString.equalsIgnoreCase("tfidf")) {
+ weight = new TFIDF();
+ } else {
+ throw new OptionException(weightOpt);
+ }
+ }
String field = cmdLine.getValue(fieldOpt).toString();
TermInfo termInfo = new CachedTermInfo(reader, field, 1, 99);
VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java?rev=785836&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/TF.java Wed Jun 17 22:16:56 2009
@@ -0,0 +1,35 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
+
+
+/**
+ * {@link org.apache.mahout.utils.vectors.Weight} based on term frequency only:
+ * {@link #calculate} returns tf unchanged (as a double) and does not use df, length or numDocs.
+ **/
+public class TF implements Weight {
+
+ @Override
+ public double calculate(int tf, int df, int length, int numDocs) {
+ // only tf contributes to the weight; df, length and numDocs are all ignored
+ return tf;
+ }
+}