You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2015/04/23 05:03:46 UTC

mahout git commit: MAHOUT-1690: CLONE - Some vector dumper flags are expecting arguments. This closes apache/mahout#122

Repository: mahout
Updated Branches:
  refs/heads/master 6a3f93e6f -> a3f78bde9


MAHOUT-1690: CLONE - Some vector dumper flags are expecting arguments. This closes apache/mahout#122


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/a3f78bde
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/a3f78bde
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/a3f78bde

Branch: refs/heads/master
Commit: a3f78bde9bf87d3d37931f878015b490761e75ce
Parents: 6a3f93e
Author: Suneel Marthi <su...@gmail.com>
Authored: Wed Apr 22 23:04:56 2015 -0400
Committer: Suneel Marthi <su...@gmail.com>
Committed: Wed Apr 22 23:04:56 2015 -0400

----------------------------------------------------------------------
 CHANGELOG                                       |   4 +-
 .../mahout/utils/vectors/VectorDumper.java      | 114 ++++++++++---------
 2 files changed, 61 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/a3f78bde/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 5611588..52799ba 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,9 +2,11 @@ Mahout Change Log
 
 Release 0.11.0 - unreleased
 
+  MAHOUT-1690: CLONE - Some vector dumper flags are expecting arguments. (smarthi)
+
   MAHOUT-1693: FunctionalMatrixView materializes row vectors in scala shell (apalumbo)
 
-  MAHOUT-1680: Renamed mahout-distribution to apache-mahout-distribution
+  MAHOUT-1680: Renamed mahout-distribution to apache-mahout-distribution (sslavic)
 
 Release 0.10.0 - 2015-04-11
 

http://git-wip-us.apache.org/repos/asf/mahout/blob/a3f78bde/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
index 93ad0d5..e1c3fbc 100644
--- a/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
+++ b/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
@@ -5,9 +5,9 @@
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,14 +17,7 @@
 
 package org.apache.mahout.utils.vectors;
 
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
+import com.google.common.collect.Sets;
 import com.google.common.io.Closeables;
 import com.google.common.io.Files;
 import org.apache.commons.io.Charsets;
@@ -46,6 +39,13 @@ import org.apache.mahout.math.VectorWritable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.Iterator;
+import java.util.Set;
+
 /**
  * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump
  * out the results using {@link Vector#asFormatString()} to either the console or to a
@@ -55,7 +55,8 @@ public final class VectorDumper extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
 
-  private VectorDumper() {}
+  private VectorDumper() {
+  }
 
   @Override
   public int run(String[] args) throws Exception {
@@ -84,9 +85,9 @@ public final class VectorDumper extends AbstractJob {
     addOption("sizeOnly", "sz", "Dump only the size of the vector");
     addOption("numItems", "ni", "Output at most <n> vecors", false);
     addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
-            + " conjunction with -sort", false);
-    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." 
-            + "  Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
+        + " conjunction with -sort", false);
+    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
+        + "  Multiple items may be specified by repeating the argument.", true, 1, Integer.MAX_VALUE, false, null));
 
     if (parseArguments(args, false, true) == null) {
       return -1;
@@ -120,19 +121,22 @@ public final class VectorDumper extends AbstractJob {
     String[] dictionary = null;
     if (hasOption("dictionary")) {
       String dictFile = getOption("dictionary");
-      if ("text".equals(dictionaryType)) {
-        dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
-      } else if ("sequencefile".equals(dictionaryType)) {
-        dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
-      } else {
-        //TODO: support Lucene's FST as a dictionary type
-        throw new IOException("Invalid dictionary type: " + dictionaryType);
+      switch (dictionaryType) {
+        case "text":
+          dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
+          break;
+        case "sequencefile":
+          dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
+          break;
+        default:
+          //TODO: support Lucene's FST as a dictionary type
+          throw new IOException("Invalid dictionary type: " + dictionaryType);
       }
     }
 
     Set<String> filters;
     if (hasOption("filter")) {
-      filters = new HashSet<>(getOptions("filter"));
+      filters = Sets.newHashSet(getOptions("filter"));
     } else {
       filters = null;
     }
@@ -175,8 +179,8 @@ public final class VectorDumper extends AbstractJob {
         }
       }
       int maxIndexesPerVector = hasOption("vectorSize")
-              ? Integer.parseInt(getOption("vectorSize"))
-              : Integer.MAX_VALUE;
+          ? Integer.parseInt(getOption("vectorSize"))
+          : Integer.MAX_VALUE;
       long itemCount = 0;
       int fileCount = 0;
       for (Path path : pathArr) {
@@ -201,10 +205,10 @@ public final class VectorDumper extends AbstractJob {
           Vector vector;
           try {
             vector = ((VectorWritable)
-                    (transposeKeyValue ? keyWritable : valueWritable)).get();
+                (transposeKeyValue ? keyWritable : valueWritable)).get();
           } catch (ClassCastException e) {
             if ((transposeKeyValue ? keyWritable : valueWritable)
-                    instanceof WeightedPropertyVectorWritable) {
+                instanceof WeightedPropertyVectorWritable) {
               vector =
                   ((WeightedPropertyVectorWritable)
                       (transposeKeyValue ? keyWritable : valueWritable)).getVector();
@@ -212,39 +216,37 @@ public final class VectorDumper extends AbstractJob {
               throw e;
             }
           }
-          if (filters != null
-                  && vector instanceof NamedVector
-                  && !filters.contains(((NamedVector) vector).getName())) {
-            //we are filtering out this item, skip
-            continue;
-          }
-          if (sizeOnly) {
-            if (vector instanceof NamedVector) {
-              writer.write(((NamedVector) vector).getName());
-              writer.write(":");
-            } else {
-              writer.write(String.valueOf(i++));
-              writer.write(":");
-            }
-            writer.write(String.valueOf(vector.size()));
-            writer.write('\n');
-          } else if (nameOnly) {
-            if (vector instanceof NamedVector) {
-              writer.write(((NamedVector) vector).getName());
+          if (filters == null
+              || !(vector instanceof NamedVector)
+              || filters.contains(((NamedVector) vector).getName())) {
+            if (sizeOnly) {
+              if (vector instanceof NamedVector) {
+                writer.write(((NamedVector) vector).getName());
+                writer.write(":");
+              } else {
+                writer.write(String.valueOf(i++));
+                writer.write(":");
+              }
+              writer.write(String.valueOf(vector.size()));
               writer.write('\n');
-            }
-          } else {
-            String fmtStr;
-            if (useCSV) {
-              fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
+            } else if (nameOnly) {
+              if (vector instanceof NamedVector) {
+                writer.write(((NamedVector) vector).getName());
+                writer.write('\n');
+              }
             } else {
-              fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
-                      sortVectors);
+              String fmtStr;
+              if (useCSV) {
+                fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
+              } else {
+                fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
+                    sortVectors);
+              }
+              writer.write(fmtStr);
+              writer.write('\n');
             }
-            writer.write(fmtStr);
-            writer.write('\n');
+            itemCount++;
           }
-          itemCount++;
         }
       }
       writer.flush();