You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2010/01/02 23:45:49 UTC

svn commit: r895306 - in /lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils: clustering/ClusterDumper.java vectors/VectorHelper.java

Author: gsingers
Date: Sat Jan  2 22:45:49 2010
New Revision: 895306

URL: http://svn.apache.org/viewvc?rev=895306&view=rev
Log:
ClusterDumper cleanups plus some printing help

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java   (with props)
Modified:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=895306&r1=895305&r2=895306&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Jan  2 22:45:49 2010
@@ -36,7 +36,7 @@
 import org.apache.mahout.clustering.ClusterBase;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.common.CommandLineUtil;
-import org.apache.mahout.common.FileLineIterator;
+import org.apache.mahout.utils.vectors.VectorHelper;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -49,20 +49,18 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.SortedMap;
 import java.util.TreeMap;
-import java.util.regex.Pattern;
 
 public final class ClusterDumper {
 
   private static final Logger log = LoggerFactory.getLogger(ClusterDumper.class);
   private static final String LINE_SEP = System.getProperty("line.separator");
-  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
 
   String seqFileDir;
   String pointsDir;
@@ -70,7 +68,8 @@
   String outputFile;
   int subString = Integer.MAX_VALUE;
   Map<String, List<String>> clusterIdToPoints = null;
-  
+  private boolean useJSON = false;
+
   public ClusterDumper(String seqFileDir, String pointsDir) throws IOException {
     this.seqFileDir = seqFileDir;
     this.pointsDir = pointsDir;
@@ -92,9 +91,9 @@
     JobConf conf = new JobConf(Job.class);
     client.setConf(conf);
     
-    ArrayList<String> dictionary = null;
+    String[] dictionary = null;
     if (this.termDictionary != null) {
-      dictionary = getTermDict(this.termDictionary);
+      dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
     }
     
     Writer writer = null;
@@ -122,7 +121,7 @@
       ClusterBase value = (ClusterBase) reader.getValueClass().newInstance();
       while (reader.next(key, value)){
         Vector center = value.getCenter();
-        String fmtStr = center.asFormatString();
+        String fmtStr = useJSON == false ? VectorHelper.vectorToString(center, dictionary) : center.asFormatString();
         writer.append("Id: ").append(String.valueOf(value.getId())).append(":").append("name:")
                 .append(center.getName()).append(":").append(fmtStr.substring(0, Math.min(subString, fmtStr.length()))).append(LINE_SEP);
         
@@ -192,6 +191,8 @@
     Option substringOpt = obuilder.withLongName("substring").withRequired(false).withArgument(
             abuilder.withName("substring").withMinimum(1).withMaximum(1).create()).
             withDescription("The number of chars of the asFormatString() to print").withShortName("b").create();
+    Option centroidJSonOpt = obuilder.withLongName("json").withRequired(false).
+            withDescription("Output the centroid as JSON.  Otherwise it substitues in the terms for vector cell entries").withShortName("j").create();
     Option pointsOpt = obuilder.withLongName("pointsDir").withRequired(false).withArgument(
             abuilder.withName("pointsDir").withMinimum(1).withMaximum(1).create()).
             withDescription("The directory containing points sequence files mapping input vectors to their cluster.  " +
@@ -202,7 +203,7 @@
     Option helpOpt = obuilder.withLongName("help").
             withDescription("Print out help").withShortName("h").create();
 
-    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsOpt).withOption(dictOpt).create();
+    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt).withOption(substringOpt).withOption(pointsOpt).withOption(centroidJSonOpt).withOption(dictOpt).create();
 
     
     try {
@@ -235,9 +236,11 @@
       if (cmdLine.hasOption(substringOpt)) {
         sub = Integer.parseInt(cmdLine.getValue(substringOpt).toString());
       }
-      
       ClusterDumper clusterDumper = new ClusterDumper(seqFileDir, pointsDir);
-      
+      if (cmdLine.hasOption(centroidJSonOpt)) {
+        clusterDumper.setUseJSON(true);
+      }
+
       if (outputFile != null) {
         clusterDumper.setOutputFile(outputFile);
       }
@@ -254,6 +257,10 @@
     }
   }
 
+  private void setUseJSON(boolean json) {
+    this.useJSON = json; // true: centroid printed via asFormatString(); false (default): via VectorHelper.vectorToString()
+  }
+
   private static Map<String, List<String>> readPoints(String pointsPathDir, JobConf conf) throws IOException {
     SortedMap<String, List<String>> result = new TreeMap<String, List<String>>();
     
@@ -295,31 +302,7 @@
     return result;
   }
 
-  private static ArrayList<String> getTermDict(String dictFile) throws IOException {
-    FileLineIterator it = new FileLineIterator(new File(dictFile));
 
-    int numEntries = Integer.parseInt(it.next());
-    System.out.println(numEntries);
-    ArrayList<String> result = new ArrayList<String>();
-    
-    for (int i = 0; i < numEntries; i++) {
-      result.add("dummyentry");
-    }
-    
-    while (it.hasNext()) {
-      String line = it.next();
-      if (line.startsWith("#")) {
-        continue;
-      }
-      String[] tokens = TAB_PATTERN.split(line);
-      if (tokens.length < 3) {
-        continue;
-      }
-      int index = Integer.parseInt(tokens[2]);
-      result.set(index, tokens[0]);
-    }    
-    return result;
-  }
 
   static class TermIndexWeight {
     public int index = -1;
@@ -331,7 +314,7 @@
     }    
   }
 
-  private static String getTopFeatures(Vector vector, ArrayList<String> dictionary, int numTerms) {   
+  private static String getTopFeatures(Vector vector, String[] dictionary, int numTerms) {
 
     List<TermIndexWeight> vectorTerms = new ArrayList<TermIndexWeight>();
     
@@ -353,7 +336,7 @@
       
       for (int i = 0; i < vectorTerms.size() && i < numTerms; i++) {
         int index = vectorTerms.get(i).index;
-        String dictTerm = dictionary.get(index);
+        String dictTerm = dictionary[index];
         if (dictTerm == null) {
           log.error("Dictionary entry missing for "+ index);
           continue;

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=895306&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sat Jan  2 22:45:49 2010
@@ -0,0 +1,124 @@
+package org.apache.mahout.utils.vectors;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.mahout.common.FileLineIterator;
+import org.apache.mahout.math.Vector;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+
+/**
+ * Static helpers for printing vectors with dictionary terms and for loading the term dictionary.
+ **/
+public class VectorHelper {
+  private static final Pattern TAB_PATTERN = Pattern.compile("\t");
+
+  /**
+   * Create a String from a vector, labelling each non-zero cell with the term found at that cell's
+   * index in a dictionary, i.e. the ith dictionary entry is the term for the ith vector cell.
+   * The output has the form <code>Name: name elts: {index:term, index:term}</code>; the name part
+   * is omitted when the vector has no (or an empty) name.
+   *
+   * @param vector The vector to print
+   * @param dictionary The dictionary, e.g. from {@link #loadTermDictionary(File)}.  May be null, in
+   *        which case only the cell indexes are printed
+   * @return The String
+   */
+  public static String vectorToString(Vector vector, String [] dictionary){
+    StringBuilder bldr = new StringBuilder(2048);
+    String name = vector.getName();
+    if (name != null && name.length() > 0) {
+      bldr.append("Name: ").append(name).append(' ');
+    }
+    bldr.append("elts: {");
+    Iterator<Vector.Element> iter = vector.iterateNonZero();
+    boolean first = true;
+    while (iter.hasNext()) {
+      if (first) {
+        first = false;
+      } else {
+        bldr.append(", ");
+      }
+      Vector.Element elt = iter.next();
+      bldr.append(elt.index());
+      // The dictionary is optional for callers, so fall back to the bare index rather than NPE.
+      if (dictionary != null) {
+        bldr.append(':').append(dictionary[elt.index()]);
+      }
+    }
+    // Balance the brace opened by "elts: {".
+    bldr.append('}');
+    return bldr.toString();
+  }
+
+  /**
+   * Read in a dictionary file.  The first line is the number of entries; each following line is
+   * tab separated in the form
+   * <pre>term DocFreq Index</pre>
+   *
+   * @param dictFile The dictionary file
+   * @return An array holding each term at the position given by its Index column
+   * @throws IOException If the file cannot be opened or read
+   */
+  public static String [] loadTermDictionary(File dictFile) throws IOException {
+    InputStream is = new FileInputStream(dictFile);
+    try {
+      return loadTermDictionary(is);
+    } finally {
+      // This method opened the stream, so release it even if parsing fails part way through.
+      is.close();
+    }
+  }
+
+  /**
+   * Read in a dictionary.  The first line is the number of entries; each following line is
+   * tab separated in the form
+   * <pre>term DocFreq Index</pre>
+   * Lines starting with '#' or having fewer than three fields are skipped.
+   *
+   * @param is The stream to read the dictionary from
+   * @return An array holding each term at the position given by its Index column; positions with
+   *         no term in the input are null
+   * @throws IOException If the dictionary cannot be read
+   */
+  public static String [] loadTermDictionary(InputStream is) throws IOException {
+    FileLineIterator it = new FileLineIterator(is);
+
+    int numEntries = Integer.parseInt(it.next());
+    String [] result = new String[numEntries];
+
+    while (it.hasNext()) {
+      String line = it.next();
+      if (line.startsWith("#")) {
+        continue;
+      }
+      String[] tokens = TAB_PATTERN.split(line);
+      if (tokens.length < 3) {
+        continue;
+      }
+      int index = Integer.parseInt(tokens[2]);//tokens[1] is the doc freq
+      result[index] = tokens[0];
+    }
+    return result;
+  }
+}

Propchange: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
------------------------------------------------------------------------------
    svn:eol-style = native