You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2012/02/13 16:14:19 UTC

svn commit: r1243556 [2/2] - in /mahout/trunk: core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/ core/src/main/java/org...

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Mon Feb 13 15:14:18 2012
@@ -20,22 +20,20 @@ package org.apache.mahout.utils.vectors;
 import com.google.common.base.Charsets;
 import com.google.common.io.Closeables;
 import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
 import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
 import org.apache.commons.cli2.util.HelpFormatter;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
-import org.apache.mahout.common.CommandLineUtil;
+import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
 import org.apache.mahout.math.NamedVector;
@@ -45,6 +43,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.util.HashSet;
@@ -56,211 +55,215 @@ import java.util.Set;
  * out the results using {@link Vector#asFormatString()} to either the console or to a
  * file.
  */
-public final class VectorDumper {
+public final class VectorDumper extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
 
   private VectorDumper() {
   }
 
-  public static void main(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
-            abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
-            "The Sequence File containing the Vectors").withShortName("s").create();
-    Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false).withDescription(
-            "If the Key is a vector, then dump that instead").withShortName("u").create();
-    Option printKeyOpt = obuilder.withLongName("printKey").withRequired(false).withDescription(
-            "Print out the key as well, delimited by a tab (or the value if useKey is true)").withShortName("p")
-            .create();
-    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-        abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-            "The output file.  If not specified, dumps to the console").withShortName("o").create();
-    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
-        abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
-            "The dictionary file. ").withShortName("d").create();
-    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
-            abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
-            "The dictionary file type (text|sequencefile)").withShortName("dt").create();
-    Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
-            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries")
-            .withShortName("c").create();
-    Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
-            "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
-            .withShortName("n").create();
-    Option sortVectorsOpt = obuilder.withLongName("sortVectors").withRequired(false).withDescription(
-            "Sort output key/value pairs of the vector entries in abs magnitude descending order")
-            .withShortName("sort").create();
-    Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
-            withDescription("Dump only the size of the vector").withShortName("sz").create();
-    Option numItemsOpt = obuilder.withLongName("numItems").withRequired(false).withArgument(
-        abuilder.withName("n").withMinimum(1).withMaximum(1).create()).
-            withDescription("Output at most <n> vecors").withShortName("n").create();
-    Option numIndexesPerVectorOpt = obuilder.withLongName("vectorSize").withShortName("vs")
-        .withRequired(false).withArgument(abuilder.withName("vs").withMinimum(1)
-                                              .withMaximum(1).create())
-        .withDescription("Truncate vectors to <vs> length when dumping (most useful when in"
-                             + " conjunction with -sort").create();
-    Option filtersOpt = obuilder.withLongName("filter").withRequired(false).withArgument(
-            abuilder.withName("filter").withMinimum(1).withMaximum(100).create()).
-            withDescription("Only dump out those vectors whose name matches the filter." +
-            "  Multiple items may be specified by repeating the argument.").withShortName("fi").create();
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
-            .create();
-
-    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
-                          .withOption(dictTypeOpt).withOption(dictOpt).withOption(csvOpt)
-                          .withOption(vectorAsKeyOpt).withOption(printKeyOpt).withOption(sortVectorsOpt)
-                          .withOption(filtersOpt).withOption(helpOpt).withOption(numItemsOpt)
-                          .withOption(sizeOpt).withOption(numIndexesPerVectorOpt).create();
+  /**
+   * Dumps the vectors read from the input sequence file(s) to the output file, or to
+   * the console when no output file is given. A directory input is listed through
+   * {@link OutputFilesFilter}; any other input path is expanded with {@code globStatus}.
+   *
+   * @param args command-line arguments, handed to {@code parseArguments}
+   * @return -1 when {@code parseArguments} returns null, 0 otherwise
+   * @throws Exception on an unknown dictionary type or any I/O or parsing failure
+   */
+  @Override
+  public int run(String[] args) throws Exception {
+    int result = 0;
+    addInputOption();
+    addOutputOption();
+    addOption("useKey", "u", "If the Key is a vector than dump that instead", false);
+    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true", false);
+    addOption("dictionary", "d", "The dictionary file.", false);
+    addOption("dictionaryType", "dt", "The dictionary file type (text|sequencefile)", false);
+    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries", false);
+    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name", false);
+    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)", false);
+    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude descending order", false);
+    addOption("quiet", "q", "Print only file contents", false);
+    addOption("sizeOnly", "sz", "Dump only the size of the vector", false);
+    addOption("numItems", "ni", "Output at most <n> vecors", false);
+    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
+            + " conjunction with -sort", false);
+    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." +
+            "  Multiple items may be specified by repeating the argument.", true, 1, 100, false, null));
 
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelpWithGenericOptions(group);
-        return;
-      }
-
-      if (cmdLine.hasOption(seqOpt)) {
-        Configuration conf = new Configuration();
-        Path pathPattern = new Path(cmdLine.getValue(seqOpt).toString());
-        FileSystem fs = FileSystem.get(conf);
-        FileStatus[] inputPaths = fs.globStatus(pathPattern);
-
-        String dictionaryType = "text";
-        if (cmdLine.hasOption(dictTypeOpt)) {
-          dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
-        }
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path[] pathArr = null;
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    Path input = getInputPath();
+    FileStatus fileStatus = fs.getFileStatus(input);
+    if (fileStatus.isDir()){
+      pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
+    } else {
+      FileStatus[] inputPaths = fs.globStatus(input);
+      pathArr = new Path[inputPaths.length];
+      int i = 0;
+      for (FileStatus fstatus : inputPaths) {
+        pathArr[i++] = fstatus.getPath();
+      }
+    }
 
-        boolean sortVectors = cmdLine.hasOption(sortVectorsOpt);
-        log.info("Sort? " + sortVectors);
 
-        String[] dictionary = null;
-        if (cmdLine.hasOption(dictOpt)) {
-          if ("text".equals(dictionaryType)) {
-            dictionary = VectorHelper.loadTermDictionary(new File(cmdLine.getValue(dictOpt).toString()));
-          } else if ("sequencefile".equals(dictionaryType)) {
-            dictionary = VectorHelper.loadTermDictionary(conf, cmdLine.getValue(dictOpt).toString());
-          } else {
-            throw new OptionException(dictTypeOpt);
+    String dictionaryType = getOption("dictionaryType", "text");
+
+    boolean sortVectors = hasOption("sortVectors");
+    boolean quiet = hasOption("quiet");
+    if (!quiet) {
+      log.info("Sort? " + sortVectors);
+    }
+
+    String[] dictionary = null;
+    if (hasOption("dictionary")) {
+      String dictFile = getOption("dictionary");
+      if ("text".equals(dictionaryType)) {
+        dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
+      } else if ("sequencefile".equals(dictionaryType)) {
+        dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
+      } else {
+        //TODO: support Lucene's FST as a dictionary type
+        throw new IOException("Invalid dictionary type: " + dictionaryType);
+      }
+    }
+
+    Set<String> filters;
+    if (hasOption("filter")) {
+      filters = new HashSet<String>(getOptions("filter"));
+    } else {
+      filters = null;
+    }
+
+    boolean useCSV = hasOption("csv");
+
+    boolean sizeOnly = hasOption("sizeOnly");
+    boolean nameOnly = hasOption("nameOnly");
+    boolean namesAsComments = hasOption("namesAsComments");
+    boolean transposeKeyValue = hasOption("useKey");
+    Writer writer;
+    boolean shouldClose;
+    File output = getOutputFile();
+    if (output != null) {
+      shouldClose = true;
+      writer = Files.newWriter(output, Charsets.UTF_8);
+    } else {
+      shouldClose = false;
+      writer = new OutputStreamWriter(System.out);
+    }
+    try {
+      boolean printKey = hasOption("printKey");
+      if (useCSV && dictionary != null) {
+        writer.write("#");
+        for (int j = 0; j < dictionary.length; j++) {
+          writer.write(dictionary[j]);
+          if (j < dictionary.length - 1) {
+            writer.write(',');
           }
         }
-
-        Set<String> filters;
-        if (cmdLine.hasOption(filtersOpt)) {
-          filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
-        } else {
-          filters = null;
+        writer.write('\n');
+      }
+      Long numItems = null;
+      if (hasOption("numItems")) {
+        numItems = Long.parseLong(getOption("numItems"));
+        if (!quiet) {
+          writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
         }
-        boolean useCSV = cmdLine.hasOption(csvOpt);
-
-        boolean sizeOnly = cmdLine.hasOption(sizeOpt);
-        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
-        boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
-        Writer writer;
-        boolean shouldClose;
-        if (cmdLine.hasOption(outputOpt)) {
-          shouldClose = true;
-          writer = Files.newWriter(new File(cmdLine.getValue(outputOpt).toString()), Charsets.UTF_8);
-        } else {
-          shouldClose = false;
-          writer = new OutputStreamWriter(System.out);
+      }
+      int maxIndexesPerVector = hasOption("vectorSize")
+              ? Integer.parseInt(getOption("vectorSize"))
+              : Integer.MAX_VALUE;
+      long itemCount = 0;
+      int fileCount = 0;
+      for (Path path : pathArr) {
+        if (numItems != null && numItems <= itemCount) {
+          break;
         }
-        try {
-          boolean printKey = cmdLine.hasOption(printKeyOpt);
-          if (useCSV && dictionary != null) {
-            writer.write("#");
-            for (int j = 0; j < dictionary.length; j++) {
-              writer.write(dictionary[j]);
-              if (j < dictionary.length - 1) {
-                writer.write(',');
-              }
-            }
-            writer.write('\n');
+        if (!quiet) {
+          log.info("Processing file '{}' ({}/{})",
+                  new Object[]{path, ++fileCount, pathArr.length});
+        }
+        SequenceFileIterable<Writable, Writable> iterable =
+                new SequenceFileIterable<Writable, Writable>(path, true, conf);
+        Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
+        long i = 0;
+        while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
+          Pair<Writable, Writable> record = iterator.next();
+          Writable keyWritable = record.getFirst();
+          Writable valueWritable = record.getSecond();
+          if (printKey) {
+            Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
+            writer.write(notTheVectorWritable.toString());
+            writer.write('\t');
           }
-          Long numItems = null;
-          if (cmdLine.hasOption(numItemsOpt)) {
-            numItems = Long.parseLong(cmdLine.getValue(numItemsOpt).toString());
-            writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
+          // The vector may be wrapped in a VectorWritable or in the clustering
+          // package's WeightedPropertyVectorWritable; test the type instead of
+          // catching the ClassCastException raised by a blind cast.
+          Writable vectorHolder = transposeKeyValue ? keyWritable : valueWritable;
+          Vector vector;
+          if (vectorHolder instanceof VectorWritable) {
+            vector = ((VectorWritable) vectorHolder).get();
+          } else if (vectorHolder instanceof WeightedPropertyVectorWritable) {
+            vector = ((WeightedPropertyVectorWritable) vectorHolder).getVector();
+          } else {
+            // Unsupported types keep failing with the same exception type as before.
+            throw new ClassCastException(vectorHolder.getClass().getName()
+                    + " cannot be cast to " + VectorWritable.class.getName());
           }
-          int maxIndexesPerVector = cmdLine.hasOption(numIndexesPerVectorOpt)
-              ? Integer.parseInt(cmdLine.getValue(numIndexesPerVectorOpt).toString())
-              : Integer.MAX_VALUE;
-          long itemCount = 0;
-          int fileCount = 0;
-          for (FileStatus stat : inputPaths) {
-            if (numItems != null && numItems <= itemCount) {
-              break;
-            }
-            Path path = stat.getPath();
-            log.info("Processing file '{}' ({}/{})",
-                new Object[]{path, ++fileCount, inputPaths.length});
-            SequenceFileIterable<Writable, Writable> iterable =
-                new SequenceFileIterable<Writable, Writable>(path, true, conf);
-            Iterator<Pair<Writable,Writable>> iterator = iterable.iterator();
-            long i = 0;
-            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
-              Pair<Writable, Writable> record = iterator.next();
-              Writable keyWritable = record.getFirst();
-              Writable valueWritable = record.getSecond();
-              if (printKey) {
-                Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
-                writer.write(notTheVectorWritable.toString());
-                writer.write('\t');
-              }
-              VectorWritable vectorWritable =
-                  (VectorWritable) (transposeKeyValue ? keyWritable : valueWritable);
-              Vector vector = vectorWritable.get();
-              if (filters != null
+          if (filters != null
                   && vector instanceof NamedVector
-                  && !filters.contains(((NamedVector)vector).getName())){
-                //we are filtering out this item, skip
-                continue;
-              }
-              if (sizeOnly) {
-                if (vector instanceof NamedVector) {
-                  writer.write(((NamedVector) vector).getName());
-                  writer.write(":");
-                } else {
-                  writer.write(String.valueOf(i++));
-                  writer.write(":");
-                }
-                writer.write(String.valueOf(vector.size()));
-                writer.write('\n');
-              } else {
-                String fmtStr;
-                if (useCSV) {
-                  fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
-                } else {
-                  fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
+                  && !filters.contains(((NamedVector) vector).getName())) {
+            //we are filtering out this item, skip
+            continue;
+          }
+          if (sizeOnly) {
+            if (vector instanceof NamedVector) {
+              writer.write(((NamedVector) vector).getName());
+              writer.write(":");
+            } else {
+              writer.write(String.valueOf(i++));
+              writer.write(":");
+            }
+            writer.write(String.valueOf(vector.size()));
+            writer.write('\n');
+          } else if (nameOnly) {
+            if (vector instanceof NamedVector) {
+              writer.write(((NamedVector) vector).getName());
+              writer.write('\n');
+            }
+          } else {
+            String fmtStr;
+            if (useCSV) {
+              fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
+            } else {
+              fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                       sortVectors);
-                }
-                writer.write(fmtStr);
-                writer.write('\n');
-              }
-              itemCount++;
             }
+            writer.write(fmtStr);
+            writer.write('\n');
           }
-          writer.flush();
-        } finally {
-          if (shouldClose) {
-            Closeables.closeQuietly(writer);
-          }
+          itemCount++;
         }
-
       }
-
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      printHelp(group);
+      writer.flush();
+    } finally {
+      if (shouldClose) {
+        Closeables.closeQuietly(writer);
+      }
     }
 
+    return result;
+
+  }
+
+  public static void main(String[] args) throws Exception { // command-line entry point
+    ToolRunner.run(new Configuration(), new VectorDumper(), args); // NOTE(review): the status returned by run() is discarded here
   }
 
   private static void printHelp(Group group) {

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Mon Feb 13 15:14:18 2012
@@ -111,13 +111,22 @@ public final class VectorHelper {
 
   public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries,
       final String[] dictionary) {
-    return Lists.newArrayList(Collections2.transform(entries,
-          new Function<Pair<Integer, Double>, Pair<String, Double>>() {
-            @Override
-            public Pair<String, Double> apply(Pair<Integer, Double> p) {
-              return Pair.of(dictionary[p.getFirst()], p.getSecond());
-            }
-          }));
+    // A null dictionary means no term names are available: label each entry
+    // with its numeric index instead of a dictionary term.
+    final boolean indexAsTerm = dictionary == null;
+    return Lists.newArrayList(Collections2.transform(entries,
+          new Function<Pair<Integer, Double>, Pair<String, Double>>() {
+            @Override
+            public Pair<String, Double> apply(Pair<Integer, Double> p) {
+              String term;
+              if (indexAsTerm) {
+                term = Integer.toString(p.getFirst());
+              } else {
+                term = dictionary[p.getFirst()];
+              }
+              return Pair.of(term, p.getSecond());
+            }
+          }));
   }
 
   public static String vectorToJson(Vector vector, String[] dictionary, int maxEntries, boolean sort) {