You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2010/01/02 23:45:49 UTC
svn commit: r895306 - in
/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils:
clustering/ClusterDumper.java vectors/VectorHelper.java
Author: gsingers
Date: Sat Jan 2 22:45:49 2010
New Revision: 895306
URL: http://svn.apache.org/viewvc?rev=895306&view=rev
Log:
ClusterDumper cleanups plus some printing help
Added:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (with props)
Modified:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=895306&r1=895305&r2=895306&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Jan 2 22:45:49 2010
@@ -36,7 +36,7 @@
import org.apache.mahout.clustering.ClusterBase;
import org.apache.mahout.math.Vector;
import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.FileLineIterator;
+import org.apache.mahout.utils.vectors.VectorHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -49,20 +49,18 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
-import java.util.regex.Pattern;
public final class ClusterDumper {
private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
private static final String LINE_SEP = System.getProperty("line.separator");
- private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
String seqFileDir;
String pointsDir;
@@ -70,7 +68,8 @@
String outputFile;
int subString = Integer.MAX_VALUE;
Map<String, List<String>> clusterIdToPoints = null;
-
+ private boolean useJSON = false;
+
public ClusterDumper(String seqFileDir, String pointsDir) throws IOException {
this.seqFileDir = seqFileDir;
this.pointsDir = pointsDir;
@@ -92,9 +91,9 @@
JobConf conf = new JobConf(Job.class);
client.setConf(conf);
- ArrayList<String> dictionary = null;
+ String[] dictionary = null;
if (this.termDictionary != null) {
- dictionary = getTermDict(this.termDictionary);
+ dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
}
Writer writer = null;
@@ -122,7 +121,7 @@
ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
while (reader.next(key, value)){
Vector center = value.getCenter();
- String fmtStr = center.asFormatString();
+ String fmtStr = useJSON == false ? VectorHelper.vectorToString(center, dictionary) : center.asFormatString();
writer.append("Id: ").append(String.valueOf(value.getId())).append(":").append("name:")
.append(center.getName()).append(":").append(fmtStr.substring(0, Math.min(subString, fmtStr.length()))).append(LINE_SEP);
@@ -192,6 +191,8 @@
Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
+ Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).
+ withDescription("Output the centroid as JSON. Otherwise it substitues in the terms for vector cell entries").withShortName("j").create();
Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).
withDescription("The directory containing points sequence files mapping input vectors to their cluster. " +
@@ -202,7 +203,7 @@
Option helpOpt = obuilder.withLongName("help").
withDescription("Print out help").withShortName("h").create();
- Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsOpt).withOption(dictOpt).create();
+ Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt).create();
try {
@@ -235,9 +236,11 @@
if (cmdLine.hasOption(substringOpt)) {
sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
}
-
ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
-
+ if (cmdLine.hasOption(centroidJSonOpt)) {
+ clusterDumper.setUseJSON(true);
+ }
+
if (outputFile != null) {
clusterDumper.setOutputFile(outputFile);
}
@@ -254,6 +257,10 @@
}
}
+ private void setUseJSON(boolean json) {
+ this.useJSON = json; // when true the centroid is printed via asFormatString(); otherwise dictionary terms are substituted
+ }
+
private static Map<String, List<String>> readPoints(String pointsPathDir, JobConf conf) throws IOException {
SortedMap<String, List<String>> result = new TreeMap<String, List<String>>();
@@ -295,31 +302,7 @@
return result;
}
- private static ArrayList<String> getTermDict(String dictFile) throws IOException {
- FileLineIterator it = new FileLineIterator(new File(dictFile));
- int numEntries = Integer.parseInt(it.next());
- System.out.println(numEntries);
- ArrayList<String> result = new ArrayList<String>();
-
- for (int i = 0; i < numEntries; i++) {
- result.add("dummyentry");
- }
-
- while (it.hasNext()) {
- String line = it.next();
- if (line.startsWith("#")) {
- continue;
- }
- String[] tokens = TAB_PATTERN.split(line);
- if (tokens.length < 3) {
- continue;
- }
- int index = Integer.parseInt(tokens[2]);
- result.set(index, tokens[0]);
- }
- return result;
- }
static class TermIndexWeight {
public int index = -1;
@@ -331,7 +314,7 @@
}
}
- private static String getTopFeatures(Vector vector, ArrayList<String> dictionary, int numTerms) {
+ private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();
@@ -353,7 +336,7 @@
for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
int index = vectorTerms.get(i).index;
- String dictTerm = dictionary.get(index);
+ String dictTerm = dictionary[index];
if (dictTerm == null) {
log.error("Dictionary entry missing for "+ index);
continue;
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=895306&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sat Jan 2 22:45:49 2010
@@ -0,0 +1,104 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.common.FileLineIterator;
+import org.apache.mahout.math.Vector;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+
+/**
+ *
+ *
+ **/
+public class VectorHelper {
+  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
+  /**
+   * Create a String from a vector that fills in each non-zero cell with the matching dictionary
+   * term, where the ith dictionary entry is the term for the ith vector cell. The result looks like
+   * <pre>Name: name elts: {index:term, index:term}</pre> ("Name: name " is left out if unnamed).
+   * @param vector The vector to print
+   * @param dictionary The dictionary, see {@link #loadTermDictionary(File)}. May be null or shorter
+   *        than the vector; a cell with no term prints as "null"
+   * @return The String
+   */
+  public static String vectorToString(Vector vector, String [] dictionary){
+    StringBuilder bldr = new StringBuilder(2048);
+    String name = vector.getName();
+    if (name != null && name.length() > 0) {
+      bldr.append("Name: ").append(name).append(' ');
+    }
+    bldr.append("elts: {");
+    Iterator<Vector.Element> iter = vector.iterateNonZero();
+    boolean first = true;
+    while (iter.hasNext()) {
+      if (first) {
+        first = false;
+      } else {
+        bldr.append(", ");
+      }
+      Vector.Element elt = iter.next();
+      // ClusterDumper passes a null dictionary when no dictionary file is given; print "null"
+      // (what an unfilled dictionary slot already prints) rather than throwing.
+      boolean hasTerm = dictionary != null && elt.index() < dictionary.length;
+      bldr.append(elt.index()).append(':').append(hasTerm ? dictionary[elt.index()] : null);
+    }
+    return bldr.append('}').toString(); // close the brace opened by "elts: {"
+  }
+
+  /**
+   * Read in a dictionary file. Format is:
+   * First line is the number of entries, followed by one tab-separated line per term:
+   * <pre>term DocFreq Index</pre>
+   * @param dictFile The dictionary file
+   * @return The terms, with each term stored at the position given by its Index column
+   * @throws IOException if the file cannot be opened or read
+   */
+  public static String [] loadTermDictionary(File dictFile) throws IOException {
+    InputStream is = new FileInputStream(dictFile);
+    try {
+      return loadTermDictionary(is);
+    } finally {
+      is.close(); // opened here, so released here even when parsing fails
+    }
+  }
+
+  /**
+   * Read in a dictionary from a stream, in the format described on {@link #loadTermDictionary(File)}.
+   * Lines starting with '#' or with fewer than three tab-separated columns are skipped.
+   */
+  public static String [] loadTermDictionary(InputStream is) throws IOException {
+    FileLineIterator it = new FileLineIterator(is);
+    String [] result = new String[Integer.parseInt(it.next())];
+    while (it.hasNext()) {
+      String line = it.next();
+      String [] tokens = TAB_PATTERN.split(line);
+      if (line.startsWith("#") || tokens.length < 3) {
+        continue;
+      }
+      result[Integer.parseInt(tokens[2])] = tokens[0]; // tokens[1] is the doc freq
+    }
+    return result;
+  }
+}
Propchange: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
------------------------------------------------------------------------------
svn:eol-style = native