You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2012/02/13 16:14:19 UTC
svn commit: r1243556 [1/2] - in /mahout/trunk:
core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/
core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/
core/src/main/java/org...
Author: gsingers
Date: Mon Feb 13 15:14:18 2012
New Revision: 1243556
URL: http://svn.apache.org/viewvc?rev=1243556&view=rev
Log:
MAHOUT-947: add new inputs to seq dumper, refactor to common CLI input
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/graph/AdjacencyMatrixJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalk.java
mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalkWithRestartJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/solver/DistributedConjugateGradientSolver.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java
mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/DatasetSplitter.java Mon Feb 13 15:14:18 2012
@@ -32,6 +32,7 @@ import org.apache.mahout.common.Abstract
import org.apache.mahout.common.RandomUtils;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
import java.util.Random;
@@ -73,10 +74,10 @@ public class DatasetSplitter extends Abs
addOption("probePercentage", "p", "percentage of the data to use as probe set (default: " +
DEFAULT_PROBE_PERCENTAGE + ')', String.valueOf(DEFAULT_PROBE_PERCENTAGE));
- Map<String, String> parsedArgs = parseArguments(args);
- double trainingPercentage = Double.parseDouble(parsedArgs.get("--trainingPercentage"));
- double probePercentage = Double.parseDouble(parsedArgs.get("--probePercentage"));
- String tempDir = parsedArgs.get("--tempDir");
+ Map<String, List<String>> parsedArgs = parseArguments(args);
+ double trainingPercentage = Double.parseDouble(getOption("trainingPercentage"));
+ double probePercentage = Double.parseDouble(getOption("probePercentage"));
+ String tempDir = getOption("tempDir");
Path markedPrefs = new Path(tempDir, "markedPreferences");
Path trainingSetPath = new Path(getOutputPath(), "trainingSet");
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/FactorizationEvaluator.java Mon Feb 13 15:14:18 2012
@@ -44,6 +44,7 @@ import org.apache.mahout.math.map.OpenIn
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.util.List;
import java.util.Map;
/**
@@ -75,7 +76,7 @@ public class FactorizationEvaluator exte
addOption("itemFeatures", null, "path to the item feature matrix", true);
addOutputOption();
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
@@ -84,10 +85,11 @@ public class FactorizationEvaluator exte
Job predictRatings = prepareJob(getInputPath(), errors, TextInputFormat.class, PredictRatingsMapper.class,
DoubleWritable.class, NullWritable.class, SequenceFileOutputFormat.class);
- predictRatings.getConfiguration().set(USER_FEATURES_PATH, parsedArgs.get("--userFeatures"));
- predictRatings.getConfiguration().set(ITEM_FEATURES_PATH, parsedArgs.get("--itemFeatures"));
+
+ predictRatings.getConfiguration().set(USER_FEATURES_PATH, getOption("userFeatures"));
+ predictRatings.getConfiguration().set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
boolean succeeded = predictRatings.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded)
return -1;
BufferedWriter writer = null;
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ParallelALSFactorizationJob.java Mon Feb 13 15:14:18 2012
@@ -109,16 +109,16 @@ public class ParallelALSFactorizationJob
addOption("numFeatures", null, "dimension of the feature space", true);
addOption("numIterations", null, "number of iterations", true);
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
- numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
- numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
- lambda = Double.parseDouble(parsedArgs.get("--lambda"));
- alpha = Double.parseDouble(parsedArgs.get("--alpha"));
- implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
+ numFeatures = Integer.parseInt(getOption("numFeatures"));
+ numIterations = Integer.parseInt(getOption("numIterations"));
+ lambda = Double.parseDouble(getOption("lambda"));
+ alpha = Double.parseDouble(getOption("alpha"));
+ implicitFeedback = Boolean.parseBoolean(getOption("implicitFeedback"));
/*
* compute the factorization A = U M'
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/RecommenderJob.java Mon Feb 13 15:14:18 2012
@@ -80,7 +80,7 @@ public class RecommenderJob extends Abst
addOption("maxRating", null, "maximum rating available", true);
addOutputOption();
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
@@ -88,14 +88,15 @@ public class RecommenderJob extends Abst
Job prediction = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, PredictionMapper.class,
IntWritable.class, RecommendedItemsWritable.class, TextOutputFormat.class);
prediction.getConfiguration().setInt(NUM_RECOMMENDATIONS,
- Integer.parseInt(parsedArgs.get("--numRecommendations")));
- prediction.getConfiguration().set(USER_FEATURES_PATH, parsedArgs.get("--userFeatures"));
- prediction.getConfiguration().set(ITEM_FEATURES_PATH, parsedArgs.get("--itemFeatures"));
- prediction.getConfiguration().set(MAX_RATING, parsedArgs.get("--maxRating"));
+ Integer.parseInt(getOption("numRecommendations")));
+ prediction.getConfiguration().set(USER_FEATURES_PATH, getOption("userFeatures"));
+ prediction.getConfiguration().set(ITEM_FEATURES_PATH, getOption("itemFeatures"));
+ prediction.getConfiguration().set(MAX_RATING, getOption("maxRating"));
boolean succeeded = prediction.waitForCompletion(true);
- if (!succeeded)
+ if (!succeeded)
return -1;
+
return 0;
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java Mon Feb 13 15:14:18 2012
@@ -38,6 +38,7 @@ import org.apache.mahout.math.VarLongWri
import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
@@ -118,24 +119,24 @@ public final class RecommenderJob extend
"alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true);
addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
- Map<String, String> parsedArgs = parseArguments(args);
+ Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Path outputPath = getOutputPath();
- int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
- String usersFile = parsedArgs.get("--usersFile");
- String itemsFile = parsedArgs.get("--itemsFile");
- String filterFile = parsedArgs.get("--filterFile");
- boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
- int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
- int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
- int maxPrefsPerUserInItemSimilarity = Integer.parseInt(parsedArgs.get("--maxPrefsPerUserInItemSimilarity"));
- int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
- String similarityClassname = parsedArgs.get("--similarityClassname");
- double threshold = parsedArgs.containsKey("--threshold") ?
- Double.parseDouble(parsedArgs.get("--threshold")) : RowSimilarityJob.NO_THRESHOLD;
+ int numRecommendations = Integer.parseInt(getOption("numRecommendations"));
+ String usersFile = getOption("usersFile");
+ String itemsFile = getOption("itemsFile");
+ String filterFile = getOption("filterFile");
+ boolean booleanData = Boolean.valueOf(getOption("booleanData"));
+ int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
+ int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+ int maxPrefsPerUserInItemSimilarity = Integer.parseInt(getOption("maxPrefsPerUserInItemSimilarity"));
+ int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
+ String similarityClassname = getOption("similarityClassname");
+ double threshold = hasOption("threshold") ?
+ Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
Path prepPath = getTempPath("preparePreferenceMatrix");
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/preparation/PreparePreferenceMatrixJob.java Mon Feb 13 15:14:18 2012
@@ -35,6 +35,7 @@ import org.apache.mahout.math.VarIntWrit
import org.apache.mahout.math.VarLongWritable;
import org.apache.mahout.math.VectorWritable;
+import java.util.List;
import java.util.Map;
public class PreparePreferenceMatrixJob extends AbstractJob {
@@ -62,14 +63,14 @@ public class PreparePreferenceMatrixJob
addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
addOption("ratingShift", "rs", "shift ratings by this value", "0.0");
- Map<String, String> parsedArgs = parseArguments(args);
+ Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
- int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
- boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
- float ratingShift = Float.parseFloat(parsedArgs.get("--ratingShift"));
+ int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+ boolean booleanData = Boolean.valueOf(getOption("booleanData"));
+ float ratingShift = Float.parseFloat(getOption("ratingShift"));
//convert items to an internal index
Job itemIDIndex = prepareJob(getInputPath(), getOutputPath(ITEMID_INDEX), TextInputFormat.class,
ItemIDIndexMapper.class, VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class,
@@ -100,8 +101,8 @@ public class PreparePreferenceMatrixJob
toItemVectors.setCombinerClass(ToItemVectorsReducer.class);
/* configure sampling regarding the uservectors */
- if (parsedArgs.containsKey("--maxPrefsPerUser")) {
- int samplingSize = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
+ if (hasOption("maxPrefsPerUser")) {
+ int samplingSize = Integer.parseInt(getOption("maxPrefsPerUser"));
toItemVectors.getConfiguration().setInt(ToItemVectorsMapper.SAMPLE_SIZE, samplingSize);
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/pseudo/RecommenderJob.java Mon Feb 13 15:14:18 2012
@@ -18,6 +18,7 @@
package org.apache.mahout.cf.taste.hadoop.pseudo;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
@@ -111,17 +112,17 @@ public final class RecommenderJob extend
addOption("numRecommendations", "n", "Number of recommendations per user", "10");
addOption("usersFile", "u", "File of users to recommend for", null);
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Path inputFile = getInputPath();
Path outputPath = getOutputPath();
- Path usersFile = parsedArgs.get("--usersFile") == null ? inputFile : new Path(parsedArgs.get("--usersFile"));
+ Path usersFile = hasOption("usersFile") ? new Path(getOption("usersFile")) : inputFile;
- String recommendClassName = parsedArgs.get("--recommenderClassName");
- int recommendationsPerUser = Integer.parseInt(parsedArgs.get("--numRecommendations"));
+ String recommendClassName = getOption("recommenderClassName");
+ int recommendationsPerUser = Integer.parseInt(getOption("numRecommendations"));
Job job = prepareJob(usersFile,
outputPath,
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Mon Feb 13 15:14:18 2012
@@ -19,6 +19,7 @@ package org.apache.mahout.cf.taste.hadoo
import java.io.IOException;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
@@ -108,19 +109,19 @@ public final class ItemSimilarityJob ext
addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
- String similarityClassName = parsedArgs.get("--similarityClassname");
- int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
- int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
- int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
- boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
+ String similarityClassName = getOption("similarityClassname");
+ int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
+ int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
+ int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
+ boolean booleanData = Boolean.valueOf(getOption("booleanData"));
- double threshold = parsedArgs.containsKey("--threshold") ?
- Double.parseDouble(parsedArgs.get("--threshold")) : RowSimilarityJob.NO_THRESHOLD;
+ double threshold = hasOption("threshold") ?
+ Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD;
Path similarityMatrixPath = getTempPath("similarityMatrix");
Path prepPath = getTempPath("prepareRatingMatrix");
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/slopeone/SlopeOneAverageDiffsJob.java Mon Feb 13 15:14:18 2012
@@ -18,6 +18,7 @@
package org.apache.mahout.cf.taste.hadoop.slopeone;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
@@ -46,14 +47,14 @@ public final class SlopeOneAverageDiffsJ
addInputOption();
addOutputOption();
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Path prefsFile = getInputPath();
Path outputPath = getOutputPath();
- Path averagesOutputPath = new Path(parsedArgs.get("--tempDir"));
+ Path averagesOutputPath = new Path(getOption("tempDir"));
AtomicInteger currentPhase = new AtomicInteger();
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/test/TestNaiveBayesDriver.java Mon Feb 13 15:14:18 2012
@@ -39,6 +39,7 @@ import org.apache.mahout.math.VectorWrit
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.util.List;
import java.util.Map;
/**
@@ -64,27 +65,27 @@ public class TestNaiveBayesDriver extend
addOption("model", "m", "The path to the model built during training", true);
addOption(buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
addOption("labelIndex", "l", "The path to the location of the label index", true);
- Map<String, String> parsedArgs = parseArguments(args);
+ Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.delete(getConf(), getOutputPath());
}
- Path model = new Path(parsedArgs.get("--model"));
+ Path model = new Path(getOption("model"));
HadoopUtil.cacheFiles(model, getConf());
//the output key is the expected value, the output value are the scores for all the labels
Job testJob = prepareJob(getInputPath(), getOutputPath(), SequenceFileInputFormat.class, BayesTestMapper.class,
Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
- //testJob.getConfiguration().set(LABEL_KEY, parsedArgs.get("--labels"));
- boolean complementary = parsedArgs.containsKey("--testComplementary");
+ //testJob.getConfiguration().set(LABEL_KEY, getOption("--labels"));
+ boolean complementary = hasOption("testComplementary");
testJob.getConfiguration().set(COMPLEMENTARY, String.valueOf(complementary));
boolean succeeded = testJob.waitForCompletion(true);
if (!succeeded) {
return -1;
}
//load the labels
- Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(parsedArgs.get("--labelIndex")));
+ Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));
//loop over the results and create the confusion matrix
SequenceFileDirIterable<Text, VectorWritable> dirIterable =
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/naivebayes/training/TrainNaiveBayesJob.java Mon Feb 13 15:14:18 2012
@@ -38,6 +38,7 @@ import org.apache.mahout.common.mapreduc
import org.apache.mahout.math.VectorWritable;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
/**
@@ -69,7 +70,7 @@ public final class TrainNaiveBayesJob ex
addOption(buildOption("trainComplementary", "c", "train complementary?", false, false, String.valueOf(false)));
addOption("labelIndex", "li", "The path to store the label index in", false);
addOption(DefaultOptionCreator.overwriteOption().create());
- Map<String, String> parsedArgs = parseArguments(args);
+ Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
@@ -78,15 +79,15 @@ public final class TrainNaiveBayesJob ex
HadoopUtil.delete(getConf(), getTempPath());
}
Path labPath;
- String labPathStr = parsedArgs.get("--labelIndex");
+ String labPathStr = getOption("labelIndex");
if (labPathStr != null) {
labPath = new Path(labPathStr);
} else {
labPath = getTempPath("labelIndex");
}
- long labelSize = createLabelIndex(parsedArgs, labPath);
- float alphaI = Float.parseFloat(parsedArgs.get("--alphaI"));
- boolean trainComplementary = Boolean.parseBoolean(parsedArgs.get("--trainComplementary"));
+ long labelSize = createLabelIndex(labPath);
+ float alphaI = Float.parseFloat(getOption("alphaI"));
+ boolean trainComplementary = Boolean.parseBoolean(getOption("trainComplementary"));
HadoopUtil.setSerializations(getConf());
@@ -132,12 +133,12 @@ public final class TrainNaiveBayesJob ex
return 0;
}
- private long createLabelIndex(Map<String, String> parsedArgs, Path labPath) throws IOException {
+ private long createLabelIndex(Path labPath) throws IOException {
long labelSize = 0;
- if (parsedArgs.containsKey("--labels")) {
- Iterable<String> labels = Splitter.on(",").split(parsedArgs.get("--labels"));
+ if (hasOption("labels")) {
+ Iterable<String> labels = Splitter.on(",").split(getOption("labels"));
labelSize = BayesUtils.writeLabelIndex(getConf(), labels, labPath);
- } else if (parsedArgs.containsKey("--extractLabels")) {
+ } else if (hasOption("extractLabels")) {
SequenceFileDirIterable<Text, IntWritable> iterable =
new SequenceFileDirIterable<Text, IntWritable>(getInputPath(), PathType.LIST, PathFilters.logsCRCFilter(), getConf());
labelSize = BayesUtils.writeLabelIndex(getConf(), labPath, iterable);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/eigencuts/EigencutsDriver.java Mon Feb 13 15:14:18 2012
@@ -64,22 +64,22 @@ public class EigencutsDriver extends Abs
addOption(DefaultOptionCreator.inputOption().create());
addOption(DefaultOptionCreator.outputOption().create());
addOption(DefaultOptionCreator.overwriteOption().create());
- Map<String, String> parsedArgs = parseArguments(arg0);
+ Map<String, List<String>> parsedArgs = parseArguments(arg0);
if (parsedArgs == null) {
return 0;
}
// read in the command line values
- Path input = new Path(parsedArgs.get("--input"));
- Path output = new Path(parsedArgs.get("--output"));
+ Path input = getInputPath();
+ Path output = getOutputPath();
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.delete(getConf(), output);
}
- int dimensions = Integer.parseInt(parsedArgs.get("--dimensions"));
- double halflife = Double.parseDouble(parsedArgs.get("--half-life"));
- double epsilon = Double.parseDouble(parsedArgs.get("--epsilon"));
- double tau = Double.parseDouble(parsedArgs.get("--tau"));
- int eigenrank = Integer.parseInt(parsedArgs.get("--eigenrank"));
+ int dimensions = Integer.parseInt(getOption("dimensions"));
+ double halflife = Double.parseDouble(getOption("half-life"));
+ double epsilon = Double.parseDouble(getOption("epsilon"));
+ double tau = Double.parseDouble(getOption("tau"));
+ int eigenrank = Integer.parseInt(getOption("eigenrank"));
run(getConf(), input, output, eigenrank, dimensions, halflife, epsilon, tau);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/clustering/spectral/kmeans/SpectralKMeansDriver.java Mon Feb 13 15:14:18 2012
@@ -39,6 +39,7 @@ import org.apache.mahout.math.hadoop.dec
import org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
/**
@@ -65,7 +66,7 @@ public class SpectralKMeansDriver extend
addOption(DefaultOptionCreator.convergenceOption().create());
addOption(DefaultOptionCreator.maxIterationsOption().create());
addOption(DefaultOptionCreator.overwriteOption().create());
- Map<String, String> parsedArgs = parseArguments(arg0);
+ Map<String, List<String>> parsedArgs = parseArguments(arg0);
if (parsedArgs == null) {
return 0;
}
@@ -75,8 +76,8 @@ public class SpectralKMeansDriver extend
if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.delete(conf, output);
}
- int numDims = Integer.parseInt(parsedArgs.get("--dimensions"));
- int clusters = Integer.parseInt(parsedArgs.get("--clusters"));
+ int numDims = Integer.parseInt(getOption("dimensions"));
+ int clusters = Integer.parseInt(getOption("clusters"));
String measureClass = getOption(DefaultOptionCreator.DISTANCE_MEASURE_OPTION);
DistanceMeasure measure = ClassUtils.instantiateAs(measureClass, DistanceMeasure.class);
double convergenceDelta = Double.parseDouble(getOption(DefaultOptionCreator.CONVERGENCE_DELTA_OPTION));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java Mon Feb 13 15:14:18 2012
@@ -17,7 +17,10 @@
package org.apache.mahout.common;
+import java.io.File;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -92,14 +95,16 @@ public abstract class AbstractJob extend
/** input path, populated by {@link #parseArguments(String[])} */
private Path inputPath;
+ private File inputFile;//the input represented as a file
/** output path, populated by {@link #parseArguments(String[]) */
private Path outputPath;
+ private File outputFile;//the output represented as a file
/** temp path, populated by {@link #parseArguments(String[]) */
private Path tempPath;
- private Map<String, String> argMap;
+ private Map<String, List<String>> argMap;
/** internal list of options that have been added */
private final List<Option> options;
@@ -131,6 +136,14 @@ public abstract class AbstractJob extend
return new Path(outputPath, path);
}
+ protected File getInputFile(){
+ return inputFile;
+ }
+
+ protected File getOutputFile(){
+ return outputFile;
+ }
+
protected Path getTempPath() {
return tempPath;
@@ -217,6 +230,8 @@ public abstract class AbstractJob extend
this.outputOption = addOption(DefaultOptionCreator.outputOption().create());
}
+
+
/** Build an option with the given parameters. Name and description are
* required.
*
@@ -235,6 +250,16 @@ public abstract class AbstractJob extend
boolean required,
String defaultValue) {
+ return buildOption(name, shortName, description, hasArg, 1, 1, required, defaultValue);
+ }
+
+ protected static Option buildOption(String name,
+ String shortName,
+ String description,
+ boolean hasArg, int min, int max,
+ boolean required,
+ String defaultValue) {
+
DefaultOptionBuilder optBuilder = new DefaultOptionBuilder().withLongName(name).withDescription(description)
.withRequired(required);
@@ -243,7 +268,7 @@ public abstract class AbstractJob extend
}
if (hasArg) {
- ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(1).withMaximum(1);
+ ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(min).withMaximum(max);
if (defaultValue != null) {
argBuilder = argBuilder.withDefault(defaultValue);
@@ -285,7 +310,7 @@ public abstract class AbstractJob extend
*
*
*/
- public Map<String, String> parseArguments(String[] args) throws IOException {
+ public Map<String, List<String>> parseArguments(String[] args) throws IOException {
Option helpOpt = addOption(DefaultOptionCreator.helpOption());
addOption("tempDir", null, "Intermediate output directory", "temp");
@@ -326,12 +351,14 @@ public abstract class AbstractJob extend
return null;
}
- argMap = new TreeMap<String, String>();
+ argMap = new TreeMap<String, List<String>>();
maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));
- this.tempPath = new Path(argMap.get("--tempDir"));
+ this.tempPath = new Path(getOption("tempDir"));
- log.info("Command line arguments: {}", argMap);
+ if (!hasOption("quiet")){
+ log.info("Command line arguments: {}", argMap);
+ }
return argMap;
}
@@ -346,7 +373,11 @@ public abstract class AbstractJob extend
* @return the requested option, or null if it has not been specified
*/
public String getOption(String optionName) {
- return argMap.get(keyFor(optionName));
+ List<String> list = argMap.get(keyFor(optionName));
+ if (list != null && list.isEmpty() == false){
+ return list.get(0);
+ }
+ return null;
}
/**
@@ -364,6 +395,15 @@ public abstract class AbstractJob extend
}
/**
+ * Options can occur multiple times, so return the list
+ * @param optionName The unadorned (no "--" prefixing it) option name
+ * @return The values, else null. If the option is present, but has no values, then the result will be an empty list (Collections.emptyList())
+ */
+ public List<String> getOptions(String optionName){
+ return argMap.get(keyFor(optionName));
+ }
+
+ /**
* @return if the requested option has been specified
*/
public boolean hasOption(String optionName) {
@@ -390,6 +430,7 @@ public abstract class AbstractJob extend
if (inputOption != null && cmdLine.hasOption(inputOption)) {
this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
+ this.inputFile = new File(cmdLine.getValue(inputOption).toString());
}
if (inputPath == null && conf.get("mapred.input.dir") != null) {
this.inputPath = new Path(conf.get("mapred.input.dir"));
@@ -397,6 +438,7 @@ public abstract class AbstractJob extend
if (outputOption != null && cmdLine.hasOption(outputOption)) {
this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
+ this.outputFile = new File(cmdLine.getValue(outputOption).toString());
}
if (outputPath == null && conf.get("mapred.output.dir") != null) {
this.outputPath = new Path(conf.get("mapred.output.dir"));
@@ -408,25 +450,47 @@ public abstract class AbstractJob extend
"No output specified: or -Dmapred.output.dir must be provided to specify output directory");
}
- protected static void maybePut(Map<String, String> args, CommandLine cmdLine, Option... opt) {
+ protected static void maybePut(Map<String, List<String>> args, CommandLine cmdLine, Option... opt) {
for (Option o : opt) {
// the option appeared on the command-line, or it has a value
// (which is likely a default value).
- if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null) {
+ if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null || (cmdLine.getValues(o) != null && cmdLine.getValues(o).isEmpty() == false)) {
// nulls are ok, for cases where options are simple flags.
- Object vo = cmdLine.getValue(o);
- String value = vo == null ? null : vo.toString();
- args.put(o.getPreferredName(), value);
+ List vo = cmdLine.getValues(o);
+ if (vo != null && vo.isEmpty() == false){
+ List<String> vals = new ArrayList<String>();
+ for (Object o1 : vo) {
+ vals.add(o1.toString());
+ }
+ args.put(o.getPreferredName(), vals);
+ } else {
+ args.put(o.getPreferredName(), null);
+ }
}
}
}
- protected static boolean shouldRunNextPhase(Map<String, String> args, AtomicInteger currentPhase) {
+ /**
+ *
+ * @param args The input argument map
+ * @param optName The adorned (including "--") option name
+ * @return The first value in the match, else null
+ */
+ public static String getOption(Map<String, List<String>> args, String optName){
+ List<String> res = args.get(optName);
+ if (res != null && res.isEmpty() == false){
+ return res.get(0);
+ }
+ return null;
+ }
+
+
+ protected static boolean shouldRunNextPhase(Map<String, List<String>> args, AtomicInteger currentPhase) {
int phase = currentPhase.getAndIncrement();
- String startPhase = args.get("--startPhase");
- String endPhase = args.get("--endPhase");
+ String startPhase = getOption(args, "--startPhase");
+ String endPhase = getOption(args, "--endPhase");
boolean phaseSkipped = (startPhase != null && phase < Integer.parseInt(startPhase))
|| (endPhase != null && phase > Integer.parseInt(endPhase));
if (phaseSkipped) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/graph/AdjacencyMatrixJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/graph/AdjacencyMatrixJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/graph/AdjacencyMatrixJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/graph/AdjacencyMatrixJob.java Mon Feb 13 15:14:18 2012
@@ -47,6 +47,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
+import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
@@ -94,14 +95,14 @@ public class AdjacencyMatrixJob extends
addOutputOption();
- Map<String, String> parsedArgs = parseArguments(args);
+ Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
- Path vertices = new Path(parsedArgs.get("--vertices"));
- Path edges = new Path(parsedArgs.get("--edges"));
- boolean symmetric = Boolean.parseBoolean(parsedArgs.get("--symmetric"));
+ Path vertices = new Path(getOption("vertices"));
+ Path edges = new Path(getOption("edges"));
+ boolean symmetric = Boolean.parseBoolean(getOption("symmetric"));
log.info("Indexing vertices sequentially, this might take a while...");
int numVertices = indexVertices(vertices, getOutputPath(VERTEX_INDEX));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalk.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalk.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalk.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalk.java Mon Feb 13 15:14:18 2012
@@ -46,6 +46,7 @@ import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
abstract class RandomWalk extends AbstractJob {
@@ -58,7 +59,7 @@ abstract class RandomWalk extends Abstra
protected abstract Vector createDampingVector(int numVertices, double stayingProbability);
protected void addSpecificOptions() {}
- protected void evaluateSpecificOptions(Map<String, String> parsedArgs) {}
+ protected void evaluateSpecificOptions() {}
@Override
public final int run(String[] args) throws Exception {
@@ -70,15 +71,15 @@ abstract class RandomWalk extends Abstra
addSpecificOptions();
- Map<String, String> parsedArgs = parseArguments(args);
+ Map<String, List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
- evaluateSpecificOptions(parsedArgs);
+ evaluateSpecificOptions();
- int numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
- double stayingProbability = Double.parseDouble(parsedArgs.get("--stayingProbability"));
+ int numIterations = Integer.parseInt(getOption("numIterations"));
+ double stayingProbability = Double.parseDouble(getOption("stayingProbability"));
Preconditions.checkArgument(numIterations > 0);
Preconditions.checkArgument(stayingProbability > 0.0 && stayingProbability <= 1.0);
@@ -89,8 +90,8 @@ abstract class RandomWalk extends Abstra
Path numVerticesPath = getTempPath(AdjacencyMatrixJob.NUM_VERTICES);
/* create the adjacency matrix */
- ToolRunner.run(getConf(), new AdjacencyMatrixJob(), new String[] { "--vertices", parsedArgs.get("--vertices"),
- "--edges", parsedArgs.get("--edges"), "--output", getTempPath().toString() });
+ ToolRunner.run(getConf(), new AdjacencyMatrixJob(), new String[] { "--vertices", getOption("vertices"),
+ "--edges", getOption("edges"), "--output", getTempPath().toString() });
int numVertices = HadoopUtil.readInt(numVerticesPath, getConf());
Preconditions.checkArgument(numVertices > 0);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalkWithRestartJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalkWithRestartJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalkWithRestartJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/graph/linkanalysis/RandomWalkWithRestartJob.java Mon Feb 13 15:14:18 2012
@@ -21,6 +21,8 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
+
+import java.util.List;
import java.util.Map;
/**
@@ -64,8 +66,8 @@ public class RandomWalkWithRestartJob ex
}
@Override
- protected void evaluateSpecificOptions(Map<String, String> parsedArgs) {
- sourceVertexIndex = Integer.parseInt(parsedArgs.get("--sourceVertexIndex"));
+ protected void evaluateSpecificOptions() {
+ sourceVertexIndex = Integer.parseInt(getOption("sourceVertexIndex"));
}
}
\ No newline at end of file
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/MatrixMultiplicationJob.java Mon Feb 13 15:14:18 2012
@@ -41,6 +41,7 @@ import org.apache.mahout.math.function.F
import java.io.IOException;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
public class MatrixMultiplicationJob extends AbstractJob {
@@ -90,19 +91,19 @@ public class MatrixMultiplicationJob ext
addOption("inputPathA", "ia", "Path to the first input matrix", true);
addOption("inputPathB", "ib", "Path to the second input matrix", true);
- Map<String, String> argMap = parseArguments(strings);
+ Map<String, List<String>> argMap = parseArguments(strings);
if (argMap == null) {
return -1;
}
- DistributedRowMatrix a = new DistributedRowMatrix(new Path(argMap.get("--inputPathA")),
- new Path(argMap.get("--tempDir")),
- Integer.parseInt(argMap.get("--numRowsA")),
- Integer.parseInt(argMap.get("--numColsA")));
- DistributedRowMatrix b = new DistributedRowMatrix(new Path(argMap.get("--inputPathB")),
- new Path(argMap.get("--tempDir")),
- Integer.parseInt(argMap.get("--numRowsB")),
- Integer.parseInt(argMap.get("--numColsB")));
+ DistributedRowMatrix a = new DistributedRowMatrix(new Path(getOption("inputPathA")),
+ new Path(getOption("tempDir")),
+ Integer.parseInt(getOption("numRowsA")),
+ Integer.parseInt(getOption("numColsA")));
+ DistributedRowMatrix b = new DistributedRowMatrix(new Path(getOption("inputPathB")),
+ new Path(getOption("tempDir")),
+ Integer.parseInt(getOption("numRowsB")),
+ Integer.parseInt(getOption("numColsB")));
a.setConf(new Configuration(getConf()));
b.setConf(new Configuration(getConf()));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/TransposeJob.java Mon Feb 13 15:14:18 2012
@@ -41,6 +41,7 @@ import org.apache.mahout.math.VectorWrit
import java.io.IOException;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
/**
@@ -59,13 +60,13 @@ public class TransposeJob extends Abstra
addInputOption();
addOption("numRows", "nr", "Number of rows of the input matrix");
addOption("numCols", "nc", "Number of columns of the input matrix");
- Map<String, String> parsedArgs = parseArguments(strings);
+ Map<String, List<String>> parsedArgs = parseArguments(strings);
if (parsedArgs == null) {
return -1;
}
- int numRows = Integer.parseInt(parsedArgs.get("--numRows"));
- int numCols = Integer.parseInt(parsedArgs.get("--numCols"));
+ int numRows = Integer.parseInt(getOption("numRows"));
+ int numCols = Integer.parseInt(getOption("numCols"));
DistributedRowMatrix matrix = new DistributedRowMatrix(getInputPath(), getTempPath(), numRows, numCols);
matrix.setConf(new Configuration(getConf()));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/DistributedLanczosSolver.java Mon Feb 13 15:14:18 2012
@@ -40,6 +40,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
public class DistributedLanczosSolver extends LanczosSolver implements Tool {
@@ -50,7 +51,7 @@ public class DistributedLanczosSolver ex
private Configuration conf;
- private Map<String, String> parsedArgs;
+ private Map<String, List<String>> parsedArgs;
/**
* For the distributed case, the best guess at a useful initialization state for Lanczos we'll chose to be
@@ -93,21 +94,21 @@ public class DistributedLanczosSolver ex
@Override
public int run(String[] strings) throws Exception {
- Path inputPath = new Path(parsedArgs.get("--input"));
- Path outputPath = new Path(parsedArgs.get("--output"));
- Path outputTmpPath = new Path(parsedArgs.get("--tempDir"));
- Path workingDirPath = parsedArgs.get("--workingDir") != null
- ? new Path(parsedArgs.get("--workingDir")) : null;
- int numRows = Integer.parseInt(parsedArgs.get("--numRows"));
- int numCols = Integer.parseInt(parsedArgs.get("--numCols"));
- boolean isSymmetric = Boolean.parseBoolean(parsedArgs.get("--symmetric"));
- int desiredRank = Integer.parseInt(parsedArgs.get("--rank"));
+ Path inputPath = new Path(AbstractJob.getOption(parsedArgs, "--input"));
+ Path outputPath = new Path(AbstractJob.getOption(parsedArgs, "--output"));
+ Path outputTmpPath = new Path(AbstractJob.getOption(parsedArgs, "--tempDir"));
+ Path workingDirPath = AbstractJob.getOption(parsedArgs, "--workingDir") != null
+ ? new Path(AbstractJob.getOption(parsedArgs, "--workingDir")) : null;
+ int numRows = Integer.parseInt(AbstractJob.getOption(parsedArgs, "--numRows"));
+ int numCols = Integer.parseInt(AbstractJob.getOption(parsedArgs, "--numCols"));
+ boolean isSymmetric = Boolean.parseBoolean(AbstractJob.getOption(parsedArgs, "--symmetric"));
+ int desiredRank = Integer.parseInt(AbstractJob.getOption(parsedArgs, "--rank"));
- boolean cleansvd = Boolean.parseBoolean(parsedArgs.get("--cleansvd"));
+ boolean cleansvd = Boolean.parseBoolean(AbstractJob.getOption(parsedArgs, "--cleansvd"));
if (cleansvd) {
- double maxError = Double.parseDouble(parsedArgs.get("--maxError"));
- double minEigenvalue = Double.parseDouble(parsedArgs.get("--minEigenvalue"));
- boolean inMemory = Boolean.parseBoolean(parsedArgs.get("--inMemory"));
+ double maxError = Double.parseDouble(AbstractJob.getOption(parsedArgs, "--maxError"));
+ double minEigenvalue = Double.parseDouble(AbstractJob.getOption(parsedArgs, "--minEigenvalue"));
+ boolean inMemory = Boolean.parseBoolean(AbstractJob.getOption(parsedArgs, "--inMemory"));
return run(inputPath,
outputPath,
outputTmpPath,
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/decomposer/EigenVerificationJob.java Mon Feb 13 15:14:18 2012
@@ -99,7 +99,7 @@ public class EigenVerificationJob extend
@Override
public int run(String[] args) throws Exception {
- Map<String, String> argMap = handleArgs(args);
+ Map<String, List<String>> argMap = handleArgs(args);
if (argMap == null) {
return -1;
}
@@ -108,13 +108,13 @@ public class EigenVerificationJob extend
}
// parse out the arguments
runJob(getConf(),
- new Path(argMap.get("--eigenInput")),
- new Path(argMap.get("--corpusInput")),
+ new Path(getOption("eigenInput")),
+ new Path(getOption("corpusInput")),
getOutputPath(),
- argMap.get("--inMemory") != null,
- Double.parseDouble(argMap.get("--maxError")),
- //Double.parseDouble(argMap.get("--minEigenvalue")),
- Integer.parseInt(argMap.get("--maxEigens")));
+ getOption("inMemory") != null,
+ Double.parseDouble(getOption("maxError")),
+ //Double.parseDouble(getOption("minEigenvalue")),
+ Integer.parseInt(getOption("maxEigens")));
return 0;
}
@@ -165,7 +165,7 @@ public class EigenVerificationJob extend
return 0;
}
- private Map<String, String> handleArgs(String[] args) throws IOException {
+ private Map<String, List<String>> handleArgs(String[] args) throws IOException {
addOutputOption();
addOption("eigenInput",
"ei",
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java Mon Feb 13 15:14:18 2012
@@ -41,6 +41,7 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
@@ -84,13 +85,13 @@ public class RowSimilarityJob extends Ab
addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false));
addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
- Map<String,String> parsedArgs = parseArguments(args);
+ Map<String,List<String>> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
- int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
- String similarityClassnameArg = parsedArgs.get("--similarityClassname");
+ int numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
+ String similarityClassnameArg = getOption("similarityClassname");
String similarityClassname;
try {
similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
@@ -98,10 +99,10 @@ public class RowSimilarityJob extends Ab
similarityClassname = similarityClassnameArg;
}
- int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));
- boolean excludeSelfSimilarity = Boolean.parseBoolean(parsedArgs.get("--excludeSelfSimilarity"));
- double threshold = parsedArgs.containsKey("--threshold") ?
- Double.parseDouble(parsedArgs.get("--threshold")) : NO_THRESHOLD;
+ int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
+ boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
+ double threshold = hasOption("threshold") ?
+ Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;
Path weightsPath = getTempPath("weights");
Path normsPath = getTempPath("norms.bin");
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/solver/DistributedConjugateGradientSolver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/solver/DistributedConjugateGradientSolver.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/solver/DistributedConjugateGradientSolver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/solver/DistributedConjugateGradientSolver.java Mon Feb 13 15:14:18 2012
@@ -18,6 +18,7 @@
package org.apache.mahout.math.hadoop.solver;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
@@ -41,7 +42,7 @@ import org.apache.mahout.math.solver.Pre
public class DistributedConjugateGradientSolver extends ConjugateGradientSolver implements Tool {
private Configuration conf;
- private Map<String, String> parsedArgs;
+ private Map<String, List<String>> parsedArgs;
/**
*
@@ -85,15 +86,15 @@ public class DistributedConjugateGradien
@Override
public int run(String[] strings) throws Exception {
- Path inputPath = new Path(parsedArgs.get("--input"));
- Path outputPath = new Path(parsedArgs.get("--output"));
- Path tempPath = new Path(parsedArgs.get("--tempDir"));
- Path vectorPath = new Path(parsedArgs.get("--vector"));
- int numRows = Integer.parseInt(parsedArgs.get("--numRows"));
- int numCols = Integer.parseInt(parsedArgs.get("--numCols"));
- int maxIterations = parsedArgs.containsKey("--maxIter") ? Integer.parseInt(parsedArgs.get("--maxIter")) : numCols;
+ Path inputPath = new Path(AbstractJob.getOption(parsedArgs, "--input"));
+ Path outputPath = new Path(AbstractJob.getOption(parsedArgs, "--output"));
+ Path tempPath = new Path(AbstractJob.getOption(parsedArgs, "--tempDir"));
+ Path vectorPath = new Path(AbstractJob.getOption(parsedArgs, "--vector"));
+ int numRows = Integer.parseInt(AbstractJob.getOption(parsedArgs, "--numRows"));
+ int numCols = Integer.parseInt(AbstractJob.getOption(parsedArgs, "--numCols"));
+ int maxIterations = parsedArgs.containsKey("--maxIter") ? Integer.parseInt(AbstractJob.getOption(parsedArgs, "--maxIter")) : numCols;
double maxError = parsedArgs.containsKey("--maxError")
- ? Double.parseDouble(parsedArgs.get("--maxError"))
+ ? Double.parseDouble(AbstractJob.getOption(parsedArgs, "--maxError"))
: ConjugateGradientSolver.DEFAULT_MAX_ERROR;
Vector b = loadInputVector(vectorPath);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDCli.java Mon Feb 13 15:14:18 2012
@@ -18,6 +18,7 @@ package org.apache.mahout.math.hadoop.st
import java.io.IOException;
import java.util.Arrays;
+import java.util.List;
import java.util.Map;
import com.google.common.io.Closeables;
@@ -82,24 +83,24 @@ public class SSVDCli extends AbstractJob
String.valueOf(true));
addOption(DefaultOptionCreator.overwriteOption().create());
- Map<String, String> pargs = parseArguments(args);
+ Map<String, List<String>> pargs = parseArguments(args);
if (pargs == null) {
return -1;
}
- int k = Integer.parseInt(pargs.get("--rank"));
- int p = Integer.parseInt(pargs.get("--oversampling"));
- int r = Integer.parseInt(pargs.get("--blockHeight"));
- int h = Integer.parseInt(pargs.get("--outerProdBlockHeight"));
- int abh = Integer.parseInt(pargs.get("--abtBlockHeight"));
- int q = Integer.parseInt(pargs.get("--powerIter"));
- int minSplitSize = Integer.parseInt(pargs.get("--minSplitSize"));
- boolean computeU = Boolean.parseBoolean(pargs.get("--computeU"));
- boolean computeV = Boolean.parseBoolean(pargs.get("--computeV"));
- boolean cUHalfSigma = Boolean.parseBoolean(pargs.get("--uHalfSigma"));
- boolean cVHalfSigma = Boolean.parseBoolean(pargs.get("--vHalfSigma"));
- int reduceTasks = Integer.parseInt(pargs.get("--reduceTasks"));
- boolean broadcast = Boolean.parseBoolean(pargs.get("--broadcast"));
+ int k = Integer.parseInt(getOption("rank"));
+ int p = Integer.parseInt(getOption("oversampling"));
+ int r = Integer.parseInt(getOption("blockHeight"));
+ int h = Integer.parseInt(getOption("outerProdBlockHeight"));
+ int abh = Integer.parseInt(getOption("abtBlockHeight"));
+ int q = Integer.parseInt(getOption("powerIter"));
+ int minSplitSize = Integer.parseInt(getOption("minSplitSize"));
+ boolean computeU = Boolean.parseBoolean(getOption("computeU"));
+ boolean computeV = Boolean.parseBoolean(getOption("computeV"));
+ boolean cUHalfSigma = Boolean.parseBoolean(getOption("uHalfSigma"));
+ boolean cVHalfSigma = Boolean.parseBoolean(getOption("vHalfSigma"));
+ int reduceTasks = Integer.parseInt(getOption("reduceTasks"));
+ boolean broadcast = Boolean.parseBoolean(getOption("broadcast"));
boolean overwrite =
pargs.containsKey(keyFor(DefaultOptionCreator.OVERWRITE_OPTION));
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/stats/entropy/Entropy.java Mon Feb 13 15:14:18 2012
@@ -31,6 +31,7 @@ import org.apache.mahout.common.Abstract
import org.apache.mahout.math.VarIntWritable;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
/**
@@ -88,8 +89,11 @@ public final class Entropy extends Abstr
addOption("source", "s", "Sets, if the entropy is calculated for the keys or the values. Can be <key> or <value>"
, "key");
- Map<String, String> arguments = parseArguments(args);
- source = arguments.get("--source");
+ Map<String, List<String>> arguments = parseArguments(args);
+ if (arguments == null){
+ return;
+ }
+ source = getOption("source");
tempPath = new Path(getTempPath(), TEMP_FILE + '-' + System.currentTimeMillis());
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/collocations/llr/CollocDriver.java Mon Feb 13 15:14:18 2012
@@ -18,6 +18,7 @@
package org.apache.mahout.vectorizer.collocations.llr;
import java.io.IOException;
+import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
@@ -84,7 +85,7 @@ public final class CollocDriver extends
+ " which will be tokenized using the specified analyzer.");
addFlag("unigram", "u", "If set, unigrams will be emitted in the final output alongside collocations");
- Map<String, String> argMap = parseArguments(args);
+ Map<String, List<String>> argMap = parseArguments(args);
if (argMap == null) {
return -1;
@@ -94,45 +95,45 @@ public final class CollocDriver extends
Path output = getOutputPath();
int maxNGramSize = DEFAULT_MAX_NGRAM_SIZE;
- if (argMap.get("--maxNGramSize") != null) {
+ if (hasOption("maxNGramSize")) {
try {
- maxNGramSize = Integer.parseInt(argMap.get("--maxNGramSize"));
+ maxNGramSize = Integer.parseInt(getOption("maxNGramSize"));
} catch (NumberFormatException ex) {
log.warn("Could not parse ngram size option");
}
}
log.info("Maximum n-gram size is: {}", maxNGramSize);
- if (argMap.containsKey("--overwrite")) {
+ if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
HadoopUtil.delete(getConf(), output);
}
int minSupport = CollocReducer.DEFAULT_MIN_SUPPORT;
- if (argMap.get("--minSupport") != null) {
- minSupport = Integer.parseInt(argMap.get("--minSupport"));
+ if (getOption("minSupport") != null) {
+ minSupport = Integer.parseInt(getOption("minSupport"));
}
log.info("Minimum Support value: {}", minSupport);
float minLLRValue = LLRReducer.DEFAULT_MIN_LLR;
- if (argMap.get("--minLLR") != null) {
- minLLRValue = Float.parseFloat(argMap.get("--minLLR"));
+ if (getOption("minLLR") != null) {
+ minLLRValue = Float.parseFloat(getOption("minLLR"));
}
log.info("Minimum LLR value: {}", minLLRValue);
int reduceTasks = DEFAULT_PASS1_NUM_REDUCE_TASKS;
- if (argMap.get("--maxRed") != null) {
- reduceTasks = Integer.parseInt(argMap.get("--maxRed"));
+ if (getOption("maxRed") != null) {
+ reduceTasks = Integer.parseInt(getOption("maxRed"));
}
log.info("Number of pass1 reduce tasks: {}", reduceTasks);
- boolean emitUnigrams = argMap.containsKey("--emitUnigrams");
+ boolean emitUnigrams = argMap.containsKey("emitUnigrams");
- if (argMap.containsKey("--preprocess")) {
+ if (argMap.containsKey("preprocess")) {
log.info("Input will be preprocessed");
Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
- if (argMap.get("--analyzerName") != null) {
- String className = argMap.get("--analyzerName");
+ if (getOption("analyzerName") != null) {
+ String className = getOption("analyzerName");
analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
// try instantiating it, b/c there isn't any point in setting it if
// you can't instantiate it
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/common/AbstractJobTest.java Mon Feb 13 15:14:18 2012
@@ -20,6 +20,8 @@
package org.apache.mahout.common;
import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
import java.util.Map;
import com.google.common.collect.Maps;
@@ -36,7 +38,7 @@ public final class AbstractJobTest exten
@Test
public void testFlag() throws Exception {
- final Map<String,String> testMap = Maps.newHashMap();
+ final Map<String,List<String>> testMap = Maps.newHashMap();
AbstractJobFactory fact = new AbstractJobFactory() {
@Override
@@ -46,7 +48,7 @@ public final class AbstractJobTest exten
public int run(String[] args) throws IOException {
addFlag("testFlag", "t", "a simple test flag");
- Map<String,String> argMap = parseArguments(args);
+ Map<String,List<String>> argMap = parseArguments(args);
testMap.clear();
testMap.putAll(argMap);
return 1;
@@ -67,7 +69,7 @@ public final class AbstractJobTest exten
@Test
public void testOptions() throws Exception {
- final Map<String,String> testMap = Maps.newHashMap();
+ final Map<String,List<String>> testMap = Maps.newHashMap();
AbstractJobFactory fact = new AbstractJobFactory() {
@Override
@@ -82,7 +84,7 @@ public final class AbstractJobTest exten
this.addOption("hasDefault", "hd", "option w/ default", "defaultValue");
- Map<String,String> argMap = parseArguments(args);
+ Map<String,List<String>> argMap = parseArguments(args);
if (argMap == null) {
return -1;
}
@@ -103,8 +105,8 @@ public final class AbstractJobTest exten
"--required", "requiredArg"
});
assertEquals("0 for no missing required options", 0, ret);
- assertEquals("requiredArg", testMap.get("--required"));
- assertEquals("defaultValue", testMap.get("--hasDefault"));
+ assertEquals(Collections.singletonList("requiredArg"), testMap.get("--required"));
+ assertEquals(Collections.singletonList("defaultValue"), testMap.get("--hasDefault"));
assertNull(testMap.get("--option"));
assertNull(testMap.get("--notRequired"));
assertFalse(testMap.containsKey("--overwrite"));
@@ -129,10 +131,10 @@ public final class AbstractJobTest exten
"--notRequired", "notRequired"
});
assertEquals("0 for no missing required options", 0, ret);
- assertEquals("requiredArg", testMap.get("--required"));
- assertEquals("nonDefault", testMap.get("--hasDefault"));
- assertEquals("optionValue", testMap.get("--option"));
- assertEquals("notRequired", testMap.get("--notRequired"));
+ assertEquals(Collections.singletonList("requiredArg"), testMap.get("--required"));
+ assertEquals(Collections.singletonList("nonDefault"), testMap.get("--hasDefault"));
+ assertEquals(Collections.singletonList("optionValue"), testMap.get("--option"));
+ assertEquals(Collections.singletonList("notRequired"), testMap.get("--notRequired"));
assertTrue(testMap.containsKey("--overwrite"));
ret = ToolRunner.run(fact.getJob(), new String[]{
@@ -143,10 +145,10 @@ public final class AbstractJobTest exten
"-nr", "notRequired"
});
assertEquals("0 for no missing required options", 0, ret);
- assertEquals("requiredArg", testMap.get("--required"));
- assertEquals("nonDefault", testMap.get("--hasDefault"));
- assertEquals("optionValue", testMap.get("--option"));
- assertEquals("notRequired", testMap.get("--notRequired"));
+ assertEquals(Collections.singletonList("requiredArg"), testMap.get("--required"));
+ assertEquals(Collections.singletonList("nonDefault"), testMap.get("--hasDefault"));
+ assertEquals(Collections.singletonList("optionValue"), testMap.get("--option"));
+ assertEquals(Collections.singletonList("notRequired"), testMap.get("--notRequired"));
assertTrue(testMap.containsKey("--overwrite"));
}
@@ -164,7 +166,7 @@ public final class AbstractJobTest exten
addOutputOption();
// arg map should be null if a required option is missing.
- Map<String, String> argMap = parseArguments(args);
+ Map<String, List<String>> argMap = parseArguments(args);
if (argMap == null) {
return -1;
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SequenceFileDumper.java Mon Feb 13 15:14:18 2012
@@ -23,8 +23,11 @@ import com.google.common.io.Files;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
@@ -47,20 +50,36 @@ public final class SequenceFileDumper ex
@Override
public int run(String[] args) throws Exception {
- addOption("seqFile", "s", "The Sequence File to read in", true);
+ addOption("seqFile", "s", "The Sequence File to read in", false);
+ addOption("seqDirectory", "d", "A directory containing sequence files to read", false);
addOption(DefaultOptionCreator.outputOption().create());
addOption("substring", "b", "The number of chars to print out per value", false);
addOption(buildOption("count", "c", "Report the count only", false, false, null));
addOption("numItems", "n", "Output at most <n> key value pairs", false);
addOption(buildOption("facets", "fa", "Output the counts per key. Note, if there are a lot of unique keys, this can take up a fair amount of memory", false, false, null));
-
+ addOption(buildOption("quiet", "q", "Print only file contents.", false, false, null));
if (parseArguments(args) == null) {
return -1;
}
- Path path = new Path(getOption("seqFile"));
+
+ Path[] pathArr= null;
Configuration conf = new Configuration();
+ if (getOption("seqFile") != null) {
+ pathArr = new Path[1];
+ pathArr[0] = new Path(getOption("seqFile"));
+ } else if (getOption("seqDirectory") != null) {
+ Path dirPath = new Path(getOption("seqDirectory"));
+ FileSystem fs = dirPath.getFileSystem(conf);
+ pathArr = FileUtil.stat2Paths(fs.listStatus(dirPath, new OutputFilesFilter()));
+ }
+ if (pathArr == null) {
+ System.out.println("Must specify --seqFile (-s) or --seqDirectory (-d)!");
+ return -1;
+ }
+
+
Writer writer;
boolean shouldClose;
if (hasOption("output")) {
@@ -71,62 +90,69 @@ public final class SequenceFileDumper ex
writer = new OutputStreamWriter(System.out);
}
try {
- writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
-
- int sub = Integer.MAX_VALUE;
- if (hasOption("substring")) {
- sub = Integer.parseInt(getOption("substring"));
- }
- boolean countOnly = hasOption("count");
- SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
- writer.append("Key class: ").append(iterator.getKeyClass().toString());
- writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
- OpenObjectIntHashMap<String> facets = null;
- if (hasOption("facets")){
- facets = new OpenObjectIntHashMap<String>();
- }
- long count = 0;
- if (countOnly) {
- while (iterator.hasNext()) {
- Pair<?, ?> record = iterator.next();
- String key = record.getFirst().toString();
- if (facets != null){
- facets.adjustOrPutValue(key, 1, 1);//either insert or add 1
- }
- count++;
+ for (Path path : pathArr) {
+ if (!hasOption("quiet"))
+ writer.append("Input Path: ").append(String.valueOf(path)).append('\n');
+
+ int sub = Integer.MAX_VALUE;
+ if (hasOption("substring")) {
+ sub = Integer.parseInt(getOption("substring"));
}
- writer.append("Count: ").append(String.valueOf(count)).append('\n');
- } else {
- long numItems = Long.MAX_VALUE;
- if (hasOption("numItems")) {
- numItems = Long.parseLong(getOption("numItems"));
- writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
+ boolean countOnly = hasOption("count");
+ SequenceFileIterator<?, ?> iterator = new SequenceFileIterator<Writable, Writable>(path, true, conf);
+ if (!hasOption("quiet")) {
+ writer.append("Key class: ").append(iterator.getKeyClass().toString());
+ writer.append(" Value Class: ").append(iterator.getValueClass().toString()).append('\n');
}
- while (iterator.hasNext() && count < numItems) {
- Pair<?, ?> record = iterator.next();
- String key = record.getFirst().toString();
- writer.append("Key: ").append(key);
- String str = record.getSecond().toString();
- writer.append(": Value: ").append(str.length() > sub ? str.substring(0, sub) : str);
- writer.write('\n');
- if (facets != null){
- facets.adjustOrPutValue(key, 1, 1);//either insert or add 1
+ OpenObjectIntHashMap<String> facets = null;
+ if (hasOption("facets")){
+ facets = new OpenObjectIntHashMap<String>();
+ }
+ long count = 0;
+ if (countOnly) {
+ while (iterator.hasNext()) {
+ Pair<?, ?> record = iterator.next();
+ String key = record.getFirst().toString();
+ if (facets != null){
+ facets.adjustOrPutValue(key, 1, 1);//either insert or add 1
+ }
+ count++;
+ }
+ writer.append("Count: ").append(String.valueOf(count)).append('\n');
+ } else {
+ long numItems = Long.MAX_VALUE;
+ if (hasOption("numItems")) {
+ numItems = Long.parseLong(getOption("numItems"));
+ if (!hasOption("quiet"))
+ writer.append("Max Items to dump: ").append(String.valueOf(numItems)).append("\n");
}
- count++;
+ while (iterator.hasNext() && count < numItems) {
+ Pair<?, ?> record = iterator.next();
+ String key = record.getFirst().toString();
+ writer.append("Key: ").append(key);
+ String str = record.getSecond().toString();
+ writer.append(": Value: ").append(str.length() > sub
+ ? str.substring(0, sub) : str);
+ writer.write('\n');
+ if (facets != null){
+ facets.adjustOrPutValue(key, 1, 1);//either insert or add 1
+ }
+ count++;
+ }
+ if (!hasOption("quiet"))
+ writer.append("Count: ").append(String.valueOf(count)).append('\n');
}
- writer.append("Count: ").append(String.valueOf(count)).append('\n');
- }
- if (facets != null) {
- List<String> keyList = new ArrayList<String>(facets.size());
-
- IntArrayList valueList = new IntArrayList(facets.size());
- facets.pairsSortedByKey(keyList, valueList);
- writer.append("-----Facets---\n");
- writer.append("Key\t\tCount\n");
- int i = 0;
- for (String key : keyList) {
- writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
+ if (facets != null) {
+ List<String> keyList = new ArrayList<String>(facets.size());
+ IntArrayList valueList = new IntArrayList(facets.size());
+ facets.pairsSortedByKey(keyList, valueList);
+ writer.append("-----Facets---\n");
+ writer.append("Key\t\tCount\n");
+ int i = 0;
+ for (String key : keyList) {
+ writer.append(key).append("\t\t").append(String.valueOf(valueList.get(i++))).append('\n');
+ }
}
}
writer.flush();