You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2012/02/13 16:14:19 UTC
svn commit: r1243556 [2/2] - in /mahout/trunk:
core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/
core/src/main/java/org...
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Mon Feb 13 15:14:18 2012
@@ -20,22 +20,20 @@ package org.apache.mahout.utils.vectors;
import com.google.common.base.Charsets;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
-import org.apache.mahout.common.CommandLineUtil;
+import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
+import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.NamedVector;
@@ -45,6 +43,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
+import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashSet;
@@ -56,211 +55,215 @@ import java.util.Set;
* out the results using {@link Vector#asFormatString()} to either the console or to a
* file.
*/
-public final class VectorDumper {
+public final class VectorDumper extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
private VectorDumper() {
}
- public static void main(String[] args) throws Exception {
- DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
- ArgumentBuilder abuilder = new ArgumentBuilder();
- GroupBuilder gbuilder = new GroupBuilder();
-
- Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
- abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
- "The Sequence File containing the Vectors").withShortName("s").create();
- Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false).withDescription(
- "If the Key is a vector, then dump that instead").withShortName("u").create();
- Option printKeyOpt = obuilder.withLongName("printKey").withRequired(false).withDescription(
- "Print out the key as well, delimited by a tab (or the value if useKey is true)").withShortName("p")
- .create();
- Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
- "The output file. If not specified, dumps to the console").withShortName("o").create();
- Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
- abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
- "The dictionary file. ").withShortName("d").create();
- Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
- abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
- "The dictionary file type (text|sequencefile)").withShortName("dt").create();
- Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
- "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries")
- .withShortName("c").create();
- Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
- "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
- .withShortName("n").create();
- Option sortVectorsOpt = obuilder.withLongName("sortVectors").withRequired(false).withDescription(
- "Sort output key/value pairs of the vector entries in abs magnitude descending order")
- .withShortName("sort").create();
- Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
- withDescription("Dump only the size of the vector").withShortName("sz").create();
- Option numItemsOpt = obuilder.withLongName("numItems").withRequired(false).withArgument(
- abuilder.withName("n").withMinimum(1).withMaximum(1).create()).
- withDescription("Output at most <n> vecors").withShortName("n").create();
- Option numIndexesPerVectorOpt = obuilder.withLongName("vectorSize").withShortName("vs")
- .withRequired(false).withArgument(abuilder.withName("vs").withMinimum(1)
- .withMaximum(1).create())
- .withDescription("Truncate vectors to <vs> length when dumping (most useful when in"
- + " conjunction with -sort").create();
- Option filtersOpt = obuilder.withLongName("filter").withRequired(false).withArgument(
- abuilder.withName("filter").withMinimum(1).withMaximum(100).create()).
- withDescription("Only dump out those vectors whose name matches the filter." +
- " Multiple items may be specified by repeating the argument.").withShortName("fi").create();
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
- .create();
-
- Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
- .withOption(dictTypeOpt).withOption(dictOpt).withOption(csvOpt)
- .withOption(vectorAsKeyOpt).withOption(printKeyOpt).withOption(sortVectorsOpt)
- .withOption(filtersOpt).withOption(helpOpt).withOption(numItemsOpt)
- .withOption(sizeOpt).withOption(numIndexesPerVectorOpt).create();
+ /**
+ * Dumps the vectors stored in the input sequence file(s) as text, either to the output
+ * file or, when no output file is given, to the console.
+ *
+ * @param args command-line arguments; see the options registered at the top of this method
+ * @return 0 on success, -1 if {@code parseArguments} yields no parsed arguments
+ * @throws Exception if the input cannot be located or read, the dictionary type is
+ * invalid, or the output cannot be written
+ */
+ @Override
+ public int run(String[] args) throws Exception {
+ addInputOption();
+ addOutputOption();
+ addOption("useKey", "u", "If the Key is a vector then dump that instead", false);
+ addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true)", false);
+ addOption("dictionary", "d", "The dictionary file.", false);
+ // The advertised values must match the ones accepted below ("text" / "sequencefile")
+ addOption("dictionaryType", "dt", "The dictionary file type (text|sequencefile)", false);
+ addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries", false);
+ addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name", false);
+ addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)", false);
+ addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude descending order", false);
+ addOption("quiet", "q", "Print only file contents", false);
+ addOption("sizeOnly", "sz", "Dump only the size of the vector", false);
+ addOption("numItems", "ni", "Output at most <n> vectors", false);
+ addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
+ + " conjunction with -sort)", false);
+ addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter." +
+ " Multiple items may be specified by repeating the argument.", true, 1, 100, false, null));
- try {
- Parser parser = new Parser();
- parser.setGroup(group);
- CommandLine cmdLine = parser.parse(args);
-
- if (cmdLine.hasOption(helpOpt)) {
- CommandLineUtil.printHelpWithGenericOptions(group);
- return;
- }
-
- if (cmdLine.hasOption(seqOpt)) {
- Configuration conf = new Configuration();
- Path pathPattern = new Path(cmdLine.getValue(seqOpt).toString());
- FileSystem fs = FileSystem.get(conf);
- FileStatus[] inputPaths = fs.globStatus(pathPattern);
-
- String dictionaryType = "text";
- if (cmdLine.hasOption(dictTypeOpt)) {
- dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
- }
+ if (parseArguments(args) == null) {
+ return -1;
+ }
+
+ // Use the configuration handed over by ToolRunner (see main) rather than a fresh one;
+ // fall back to defaults when run() is invoked without a configuration being set
+ Configuration conf = getConf();
+ if (conf == null) {
+ conf = new Configuration();
+ }
+ FileSystem fs = FileSystem.get(conf);
+ Path input = getInputPath();
+ // Expand the input as a glob first: getFileStatus() fails on a pattern such as part-*
+ FileStatus[] matches = fs.globStatus(input);
+ if (matches == null || matches.length == 0) {
+ throw new IOException("No input found at: " + input);
+ }
+ Path[] pathArr;
+ if (matches.length == 1 && matches[0].isDir()) {
+ // A single directory: dump every job output file inside it
+ pathArr = FileUtil.stat2Paths(fs.listStatus(matches[0].getPath(), new OutputFilesFilter()));
+ } else {
+ pathArr = FileUtil.stat2Paths(matches);
+ }
- boolean sortVectors = cmdLine.hasOption(sortVectorsOpt);
- log.info("Sort? " + sortVectors);
- String[] dictionary = null;
- if (cmdLine.hasOption(dictOpt)) {
- if ("text".equals(dictionaryType)) {
- dictionary = VectorHelper.loadTermDictionary(new File(cmdLine.getValue(dictOpt).toString()));
- } else if ("sequencefile".equals(dictionaryType)) {
- dictionary = VectorHelper.loadTermDictionary(conf, cmdLine.getValue(dictOpt).toString());
- } else {
- throw new OptionException(dictTypeOpt);
+ String dictionaryType = getOption("dictionaryType", "text");
+
+ boolean sortVectors = hasOption("sortVectors");
+ boolean quiet = hasOption("quiet");
+ if (!quiet) {
+ log.info("Sort? {}", sortVectors);
+ }
+
+ String[] dictionary = null;
+ if (hasOption("dictionary")) {
+ String dictFile = getOption("dictionary");
+ if ("text".equals(dictionaryType)) {
+ dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
+ } else if ("sequencefile".equals(dictionaryType)) {
+ dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
+ } else {
+ //TODO: support Lucene's FST as a dictionary type
+ throw new IOException("Invalid dictionary type: " + dictionaryType);
+ }
+ }
+
+ Set<String> filters;
+ if (hasOption("filter")) {
+ filters = new HashSet<String>(getOptions("filter"));
+ } else {
+ filters = null;
+ }
+
+ boolean useCSV = hasOption("csv");
+
+ boolean sizeOnly = hasOption("sizeOnly");
+ boolean nameOnly = hasOption("nameOnly");
+ boolean namesAsComments = hasOption("namesAsComments");
+ // Must use the registered option name ("useKey"): the vector is then read from the key
+ boolean transposeKeyValue = hasOption("useKey");
+ Writer writer;
+ boolean shouldClose;
+ File output = getOutputFile();
+ if (output != null) {
+ shouldClose = true;
+ writer = Files.newWriter(output, Charsets.UTF_8);
+ } else {
+ // Console output: System.out is flushed but never closed
+ shouldClose = false;
+ writer = new OutputStreamWriter(System.out);
+ }
+ try {
+ boolean printKey = hasOption("printKey");
+ if (useCSV && dictionary != null) {
+ // CSV header: the dictionary terms as one comment line
+ writer.write("#");
+ for (int j = 0; j < dictionary.length; j++) {
+ writer.write(dictionary[j]);
+ if (j < dictionary.length - 1) {
+ writer.write(',');
}
}
-
- Set<String> filters;
- if (cmdLine.hasOption(filtersOpt)) {
- filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
- } else {
- filters = null;
+ writer.write('\n');
+ }
+ Long numItems = null;
+ if (hasOption("numItems")) {
+ numItems = Long.parseLong(getOption("numItems"));
+ // Informational header; suppressed in quiet mode so only file contents are printed
+ if (!quiet) {
+ writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
}
- boolean useCSV = cmdLine.hasOption(csvOpt);
-
- boolean sizeOnly = cmdLine.hasOption(sizeOpt);
- boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
- boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
- Writer writer;
- boolean shouldClose;
- if (cmdLine.hasOption(outputOpt)) {
- shouldClose = true;
- writer = Files.newWriter(new File(cmdLine.getValue(outputOpt).toString()), Charsets.UTF_8);
- } else {
- shouldClose = false;
- writer = new OutputStreamWriter(System.out);
+ }
+ // Must use the registered option name ("vectorSize")
+ int maxIndexesPerVector = hasOption("vectorSize")
+ ? Integer.parseInt(getOption("vectorSize"))
+ : Integer.MAX_VALUE;
+ long itemCount = 0;
+ int fileCount = 0;
+ for (Path path : pathArr) {
+ if (numItems != null && numItems <= itemCount) {
+ break;
}
- try {
- boolean printKey = cmdLine.hasOption(printKeyOpt);
- if (useCSV && dictionary != null) {
- writer.write("#");
- for (int j = 0; j < dictionary.length; j++) {
- writer.write(dictionary[j]);
- if (j < dictionary.length - 1) {
- writer.write(',');
- }
- }
- writer.write('\n');
+ if (!quiet) {
+ log.info("Processing file '{}' ({}/{})",
+ new Object[]{path, ++fileCount, pathArr.length});
}
- Long numItems = null;
- if (cmdLine.hasOption(numItemsOpt)) {
- numItems = Long.parseLong(cmdLine.getValue(numItemsOpt).toString());
- writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
+ SequenceFileIterable<Writable, Writable> iterable =
+ new SequenceFileIterable<Writable, Writable>(path, true, conf);
+ Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
+ // Running index used to label unnamed vectors in sizeOnly mode (restarts per file)
+ long i = 0;
+ while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
+ Pair<Writable, Writable> record = iterator.next();
+ Writable keyWritable = record.getFirst();
+ Writable valueWritable = record.getSecond();
+ Writable vectorCarrier = transposeKeyValue ? keyWritable : valueWritable;
+ Vector vector;
+ if (vectorCarrier instanceof WeightedPropertyVectorWritable) {
+ vector = ((WeightedPropertyVectorWritable) vectorCarrier).getVector();
+ } else {
+ // Anything else must be a VectorWritable; the cast raises ClassCastException otherwise
+ vector = ((VectorWritable) vectorCarrier).get();
}
- int maxIndexesPerVector = cmdLine.hasOption(numIndexesPerVectorOpt)
- ? Integer.parseInt(cmdLine.getValue(numIndexesPerVectorOpt).toString())
- : Integer.MAX_VALUE;
- long itemCount = 0;
- int fileCount = 0;
- for (FileStatus stat : inputPaths) {
- if (numItems != null && numItems <= itemCount) {
- break;
- }
- Path path = stat.getPath();
- log.info("Processing file '{}' ({}/{})",
- new Object[]{path, ++fileCount, inputPaths.length});
- SequenceFileIterable<Writable, Writable> iterable =
- new SequenceFileIterable<Writable, Writable>(path, true, conf);
- Iterator<Pair<Writable,Writable>> iterator = iterable.iterator();
- long i = 0;
- while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
- Pair<Writable, Writable> record = iterator.next();
- Writable keyWritable = record.getFirst();
- Writable valueWritable = record.getSecond();
- if (printKey) {
- Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
- writer.write(notTheVectorWritable.toString());
- writer.write('\t');
- }
- VectorWritable vectorWritable =
- (VectorWritable) (transposeKeyValue ? keyWritable : valueWritable);
- Vector vector = vectorWritable.get();
- if (filters != null
+ if (filters != null
&& vector instanceof NamedVector
- && !filters.contains(((NamedVector)vector).getName())){
- //we are filtering out this item, skip
- continue;
- }
- if (sizeOnly) {
- if (vector instanceof NamedVector) {
- writer.write(((NamedVector) vector).getName());
- writer.write(":");
- } else {
- writer.write(String.valueOf(i++));
- writer.write(":");
- }
- writer.write(String.valueOf(vector.size()));
- writer.write('\n');
- } else {
- String fmtStr;
- if (useCSV) {
- fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
- } else {
- fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
+ && !filters.contains(((NamedVector) vector).getName())) {
+ //we are filtering out this item, skip
+ continue;
+ }
+ // Emit the key only after the filter check, so skipped records leave no stray "key\t"
+ if (printKey) {
+ Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
+ writer.write(notTheVectorWritable.toString());
+ writer.write('\t');
+ }
+ if (sizeOnly) {
+ if (vector instanceof NamedVector) {
+ writer.write(((NamedVector) vector).getName());
+ writer.write(":");
+ } else {
+ writer.write(String.valueOf(i++));
+ writer.write(":");
+ }
+ writer.write(String.valueOf(vector.size()));
+ writer.write('\n');
+ } else if (nameOnly) {
+ if (vector instanceof NamedVector) {
+ writer.write(((NamedVector) vector).getName());
+ writer.write('\n');
+ }
+ } else {
+ String fmtStr;
+ if (useCSV) {
+ fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
+ } else {
+ fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
sortVectors);
- }
- writer.write(fmtStr);
- writer.write('\n');
- }
- itemCount++;
}
+ writer.write(fmtStr);
+ writer.write('\n');
}
- writer.flush();
- } finally {
- if (shouldClose) {
- Closeables.closeQuietly(writer);
- }
+ itemCount++;
}
-
}
-
- } catch (OptionException e) {
- log.error("Exception", e);
- printHelp(group);
+ writer.flush();
+ } finally {
+ if (shouldClose) {
+ Closeables.closeQuietly(writer);
+ }
}
+ return 0;
+
+ }
+
+ /**
+ * Command-line entry point: runs this job through {@link ToolRunner} with a new
+ * {@link Configuration}.
+ */
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new Configuration(), new VectorDumper(), args);
}
private static void printHelp(Group group) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Mon Feb 13 15:14:18 2012
@@ -111,13 +111,22 @@ public final class VectorHelper {
public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries,
final String[] dictionary) {
- return Lists.newArrayList(Collections2.transform(entries,
- new Function<Pair<Integer, Double>, Pair<String, Double>>() {
- @Override
- public Pair<String, Double> apply(Pair<Integer, Double> p) {
- return Pair.of(dictionary[p.getFirst()], p.getSecond());
- }
- }));
+ // Maps each (index, weight) entry to a (term, weight) pair. The term is looked up in the
+ // dictionary; when no dictionary is supplied, the index rendered as a string is used instead.
+ return Lists.newArrayList(Collections2.transform(entries,
+ new Function<Pair<Integer, Double>, Pair<String, Double>>() {
+ @Override
+ public Pair<String, Double> apply(Pair<Integer, Double> p) {
+ int index = p.getFirst();
+ String term = dictionary == null ? Integer.toString(index) : dictionary[index];
+ return Pair.of(term, p.getSecond());
+ }
+ }));
}
public static String vectorToJson(Vector vector, String[] dictionary, int maxEntries, boolean sort) {