You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/10/15 16:08:39 UTC
svn commit: r1183642 [2/3] - in /mahout/trunk:
core/src/main/java/org/apache/mahout/cf/taste/hadoop/
core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/
core/src/main/java/org/apache/mahout/classifier/
core/src/main/java/org/apache/mahout/cl...
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/graph/preprocessing/AdjacencyMatrixJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/graph/preprocessing/AdjacencyMatrixJobTest.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/graph/preprocessing/AdjacencyMatrixJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/graph/preprocessing/AdjacencyMatrixJobTest.java Sat Oct 15 14:08:33 2011
@@ -32,22 +32,20 @@ import java.io.File;
public class AdjacencyMatrixJobTest extends GraphTestCase {
- File verticesFile;
- File edgesFile;
- File indexedVerticesFile;
- File outputDir;
- File tempDir;
-
- int numVertices;
- double stayingProbability;
- Matrix expectedAdjacencyMatrix;
-
- Configuration conf;
+ private File edgesFile;
+ private File indexedVerticesFile;
+ private File outputDir;
+ private File tempDir;
+ private int numVertices;
+ private double stayingProbability;
+ private Matrix expectedAdjacencyMatrix;
+ private Configuration conf;
+ @Override
@Before
public void setUp() throws Exception {
super.setUp();
- verticesFile = getTestTempFile("vertices.txt");
+ File verticesFile = getTestTempFile("vertices.txt");
edgesFile = getTestTempFile("edges.seq");
indexedVerticesFile = getTestTempFile("indexedVertices.seq");
outputDir = getTestTempDir("output");
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverDenseTest.java Sat Oct 15 14:08:33 2011
@@ -20,8 +20,6 @@ package org.apache.mahout.math.hadoop.st
import java.io.File;
import java.io.IOException;
-import junit.framework.Assert;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -165,8 +163,7 @@ public class LocalSSVDSolverDenseTest ex
// used to generate surrogate input
for (int i = 0; i < k; i++) {
- Assert
- .assertTrue(Math.abs((singularValues.getQuick(i) - stochasticSValues[i])
+ assertTrue(Math.abs((singularValues.getQuick(i) - stochasticSValues[i])
/ singularValues.getQuick(i)) <= s_precisionPct / 100);
}
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/LocalSSVDSolverSparseSequentialTest.java Sat Oct 15 14:08:33 2011
@@ -25,7 +25,6 @@ import java.util.LinkedList;
import java.util.Random;
import com.google.common.io.Closeables;
-import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
@@ -158,14 +157,11 @@ public class LocalSSVDSolverSparseSequen
SingularValueDecomposition svd2 =
new SingularValueDecomposition(new DenseMatrix(a));
- a = null;
-
double[] svalues2 = svd2.getSingularValues();
dumpSv(svalues2);
for (int i = 0; i < k + p; i++) {
- Assert
- .assertTrue(Math.abs(svalues2[i] - stochasticSValues[i]) <= s_epsilon);
+ assertTrue(Math.abs(svalues2[i] - stochasticSValues[i]) <= s_epsilon);
}
double[][] mQ =
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototypeTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototypeTest.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototypeTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDPrototypeTest.java Sat Oct 15 14:08:33 2011
@@ -19,8 +19,6 @@ package org.apache.mahout.math.hadoop.st
import java.util.Random;
-import junit.framework.Assert;
-
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.DenseMatrix;
@@ -92,17 +90,16 @@ public class SSVDPrototypeTest extends M
if (Math.abs(1 - norm) < epsilon) {
rank++;
} else {
- Assert.assertTrue(Math.abs(norm) < epsilon);
+ assertTrue(Math.abs(norm) < epsilon);
}
for (int j = 0; j <= i; j++) {
Vector e_j = mtx.viewColumn(j);
double dot = ei.dot(e_j);
- Assert
- .assertTrue(Math.abs((i == j && rank > j ? 1 : 0) - dot) < epsilon);
+ assertTrue(Math.abs((i == j && rank > j ? 1 : 0) - dot) < epsilon);
}
}
- Assert.assertTrue((!insufficientRank && rank == n) || (insufficientRank && rank < n));
+ assertTrue((!insufficientRank && rank == n) || (insufficientRank && rank < n));
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/BookCrossingDataModel.java Sat Oct 15 14:08:33 2011
@@ -68,7 +68,7 @@ public final class BookCrossingDataModel
continue;
}
// Delete anything that isn't numeric or a semicolon delimiter. Make comma the delimiter.
- String convertedLine = BookCrossingDataModel.NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
+ String convertedLine = NON_DIGIT_SEMICOLON_PATTERN.matcher(line)
.replaceAll("").replace(';', ',');
// If this means we deleted an entire ID -- few cases like that -- skip the line
if (convertedLine.contains(",,")) {
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java Sat Oct 15 14:08:33 2011
@@ -1,5 +1,21 @@
-package org.apache.mahout.cf.taste.example.email;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.example.email;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
@@ -13,38 +29,34 @@ import org.apache.mahout.math.map.OpenOb
import java.io.IOException;
import java.net.URI;
+import java.util.regex.Pattern;
-/**
- *
- *
- **/
public final class EmailUtility {
+
public static final String SEPARATOR = "separator";
public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
public static final String FROM_PREFIX = "fromPrefix";
public static final String MSG_ID_DIMENSION = "msgIdDim";
public static final String FROM_INDEX = "fromIdx";
public static final String REFS_INDEX = "refsIdx";
+ private static final String[] EMPTY = new String[0];
+ private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
+ private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
+ private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+");
private EmailUtility() {
-
}
/**
* Strip off some spurious characters that make it harder to dedup
- *
- * @param address
- * @return
*/
- public static String cleanUpEmailAddress(String address) {
+ public static String cleanUpEmailAddress(CharSequence address) {
//do some cleanup to normalize some things, like: Key: karthik ananth <ka...@gmail.com>: Value: 178
//Key: karthik ananth [mailto:karthik.jcecs@gmail.com]=20: Value: 179
//TODO: is there more to clean up here?
- address = address.replaceAll("mailto:|<|>|\\[|\\]|\\=20", "");
- return address;
+ return ADDRESS_CLEANUP.matcher(address).replaceAll("");
}
-
public static void loadDictionaries(Configuration conf, String fromPrefix,
OpenObjectIntHashMap<String> fromDictionary,
String msgIdPrefix,
@@ -53,8 +65,7 @@ public final class EmailUtility {
URI[] localFiles = DistributedCache.getCacheFiles(conf);
Preconditions.checkArgument(localFiles != null,
"missing paths from the DistributedCache");
- for (int i = 0; i < localFiles.length; i++) {
- URI localFile = localFiles[i];
+ for (URI localFile : localFiles) {
Path dictionaryFile = new Path(localFile.getPath());
// key is word value is id
@@ -66,7 +77,7 @@ public final class EmailUtility {
}
if (dictionary != null) {
for (Pair<Writable, IntWritable> record
- : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
+ : new SequenceFileIterable<Writable, IntWritable>(dictionaryFile, true, conf)) {
dictionary.put(record.getFirst().toString(), record.getSecond().get());
}
}
@@ -74,14 +85,12 @@ public final class EmailUtility {
}
- private static final String [] EMPTY = new String[0];
-
- public static String[] parseReferences(String rawRefs) {
- String[] splits = null;
+ public static String[] parseReferences(CharSequence rawRefs) {
+ String[] splits;
if (rawRefs != null && rawRefs.length() > 0) {
- splits = rawRefs.split(">|\\s+");
+ splits = SPACE_OR_CLOSE_ANGLE.split(rawRefs);
for (int i = 0; i < splits.length; i++) {
- splits[i] = splits[i].replaceAll("<|>", "");
+ splits[i] = ANGLE_BRACES.matcher(splits[i]).replaceAll("");
}
} else {
splits = EMPTY;
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java Sat Oct 15 14:08:33 2011
@@ -1,24 +1,38 @@
-package org.apache.mahout.cf.taste.example.email;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.example.email;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.VarIntWritable;
-import org.apache.mahout.math.VarLongWritable;
import java.io.IOException;
/**
* Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
- *
- **/
-public class FromEmailToDictionaryMapper extends
- Mapper<Text, Text, Text, VarIntWritable> {
- private String separator = "\n";
+ */
+public final class FromEmailToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
+ private String separator;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToDictionaryReducer.java Sat Oct 15 14:08:33 2011
@@ -1,10 +1,25 @@
-package org.apache.mahout.cf.taste.example.email;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.example.email;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.mahout.math.VarIntWritable;
-import org.apache.mahout.math.VarLongWritable;
import java.io.IOException;
@@ -13,13 +28,12 @@ import java.io.IOException;
* Value: the count
* Out Key: the string id
* Out Value: the sum of the counts
- *
- **/
-public class MailToDictionaryReducer extends
- Reducer<Text, VarIntWritable, Text, VarIntWritable> {
+ */
+public final class MailToDictionaryReducer extends Reducer<Text, VarIntWritable, Text, VarIntWritable> {
@Override
- protected void reduce(Text key, Iterable<VarIntWritable> values, Context context) throws IOException, InterruptedException {
+ protected void reduce(Text key, Iterable<VarIntWritable> values, Context context)
+ throws IOException, InterruptedException {
int sum = 0;
for (VarIntWritable value : values) {
sum += value.get();
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.cf.taste.example.email;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +15,7 @@ package org.apache.mahout.cf.taste.examp
* limitations under the License.
*/
+package org.apache.mahout.cf.taste.example.email;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
@@ -64,20 +64,19 @@ import java.util.concurrent.atomic.Atomi
* <p/>
* It also outputs a side table mapping the row ids to their original and the message ids to the message thread id
*/
-public class MailToPrefsDriver extends AbstractJob {
+public final class MailToPrefsDriver extends AbstractJob {
+
private static final Logger log = LoggerFactory.getLogger(MailToPrefsDriver.class);
private static final String OUTPUT_FILES_PATTERN = "part-*";
private static final int DICTIONARY_BYTE_OVERHEAD = 4;
-
public static void main(String[] args) throws Exception {
ToolRunner.run(new Configuration(), new MailToPrefsDriver(), args);
}
@Override
public int run(String[] args) throws Exception {
- int result = 0;
addInputOption();
addOutputOption();
addOption(DefaultOptionCreator.overwriteOption().create());
@@ -99,9 +98,8 @@ public class MailToPrefsDriver extends A
AtomicInteger currentPhase = new AtomicInteger();
int[] msgDim = new int[1];
- int[] fromDim = new int[1];
//TODO: mod this to not do so many passes over the data. Dictionary creation could probably be a chain mapper
- List<Path> msgIdChunks = null, fromChunks = null;
+ List<Path> msgIdChunks = null;
boolean overwrite = hasOption(DefaultOptionCreator.OVERWRITE_OPTION);
// create the dictionary between message ids and longs
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
@@ -126,6 +124,7 @@ public class MailToPrefsDriver extends A
msgIdChunks = createDictionaryChunks(msgIdsPath, output, "msgIds-dictionary-", createMsgIdDictionary.getConfiguration(), chunkSize, msgDim);
}
//create the dictionary between from email addresses and longs
+ List<Path> fromChunks = null;
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
Path fromIdsPath = new Path(output, "fromIds");
if (overwrite) {
@@ -145,6 +144,7 @@ public class MailToPrefsDriver extends A
createFromIdDictionary.getConfiguration().set(EmailUtility.SEPARATOR, separator);
createFromIdDictionary.waitForCompletion(true);
//write out the dictionary at the top level
+ int[] fromDim = new int[1];
fromChunks = createDictionaryChunks(fromIdsPath, output, "fromIds-dictionary-", createFromIdDictionary.getConfiguration(), chunkSize, fromDim);
}
//OK, we have our dictionaries, let's output the real thing we need: <from_id -> <msgId, msgId, msgId, ...>>
@@ -152,7 +152,7 @@ public class MailToPrefsDriver extends A
//Job map
//may be a way to do this so that we can load the from ids in memory, if they are small enough so that we don't need the double loop
log.info("Creating recommendation matrix");
- int i = 0, j = 0;
+ int i = 0;
Path vecPath = new Path(output, "recInput");
if (overwrite) {
HadoopUtil.delete(conf, vecPath);
@@ -164,9 +164,10 @@ public class MailToPrefsDriver extends A
conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
conf.set(EmailUtility.SEPARATOR, separator);
+ int j = 0;
for (Path fromChunk : fromChunks) {
for (Path idChunk : msgIdChunks) {
- Path out = new Path(vecPath, "tmp-" + i + "-" + j);
+ Path out = new Path(vecPath, "tmp-" + i + '-' + j);
DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), idChunk.toUri()}, conf);
Job createRecMatrix = prepareJob(input, out, SequenceFileInputFormat.class,
MailToRecMapper.class, NullWritable.class, Text.class,
@@ -175,10 +176,10 @@ public class MailToPrefsDriver extends A
createRecMatrix.waitForCompletion(true);
//copy the results up a level
//HadoopUtil.copyMergeSeqFiles(out.getFileSystem(conf), out, vecPath.getFileSystem(conf), outPath, true, conf, "");
- FileStatus fs[] = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null, conf);
+ FileStatus[] fs = HadoopUtil.getFileStatus(new Path(out, "*"), PathType.GLOB, PathFilters.partFilter(), null, conf);
for (int k = 0; k < fs.length; k++) {
FileStatus f = fs[k];
- Path outPath = new Path(vecPath, "chunk-" + i + "-" + j + "-" + k);
+ Path outPath = new Path(vecPath, "chunk-" + i + '-' + j + '-' + k);
FileUtil.copy(f.getPath().getFileSystem(conf), f.getPath(), outPath.getFileSystem(conf), outPath, true, overwrite, conf);
}
HadoopUtil.delete(conf, out);
@@ -195,7 +196,7 @@ public class MailToPrefsDriver extends A
//HadoopUtil.copyMergeSeqFiles(vecPath.getFileSystem(conf), vecPath, mergePath.getFileSystem(conf), mergePath, false, conf, "\n");
}
- return result;
+ return 0;
}
private static List<Path> createDictionaryChunks(Path inputPath,
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.cf.taste.example.email;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,31 +15,28 @@ package org.apache.mahout.cf.taste.examp
* limitations under the License.
*/
+package org.apache.mahout.cf.taste.example.email;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.mahout.math.VarIntWritable;
import org.apache.mahout.math.map.OpenObjectIntHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
-/**
- *
- *
- **/
-public class MailToRecMapper extends
- Mapper<Text, Text, NullWritable, Text> {
- private transient static Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
- private OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<String>();
- private OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<String>();
+public final class MailToRecMapper extends Mapper<Text, Text, NullWritable, Text> {
+
+ private static final Logger log = LoggerFactory.getLogger(MailToRecMapper.class);
+
+ private final OpenObjectIntHashMap<String> fromDictionary = new OpenObjectIntHashMap<String>();
+ private final OpenObjectIntHashMap<String> msgIdDictionary = new OpenObjectIntHashMap<String>();
private String separator = "\n";
- protected int fromIdx;
- protected int refsIdx;
+ private int fromIdx;
+ private int refsIdx;
public enum Counters {
REFERENCE, ORIGINAL
@@ -48,6 +44,7 @@ public class MailToRecMapper extends
@Override
protected void setup(Context context) throws IOException, InterruptedException {
+ super.setup(context);
Configuration conf = context.getConfiguration();
String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
@@ -61,7 +58,6 @@ public class MailToRecMapper extends
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
- String msgId = null;
int msgIdKey = Integer.MIN_VALUE;
@@ -87,9 +83,9 @@ public class MailToRecMapper extends
if (msgIdKey == Integer.MIN_VALUE) {//we don't have any references, so use the msg id
//get the msg id and the from and output the associated ids
String keyStr = key.toString();
- int idx = keyStr.lastIndexOf("/");
+ int idx = keyStr.lastIndexOf('/');
if (idx != -1) {
- msgId = keyStr.substring(idx + 1);
+ String msgId = keyStr.substring(idx + 1);
msgIdKey = msgIdDictionary.get(msgId);
context.getCounter(Counters.ORIGINAL).increment(1);
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java Sat Oct 15 14:08:33 2011
@@ -1,5 +1,21 @@
-package org.apache.mahout.cf.taste.example.email;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.example.email;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
@@ -10,8 +26,8 @@ import java.io.IOException;
/**
* Assumes the input is in the format created by {@link org.apache.mahout.text.SequenceFilesFromMailArchives}
*/
-public class MsgIdToDictionaryMapper extends
- Mapper<Text, Text, Text, VarIntWritable> {
+public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
+
public enum Counters {
NO_MESSAGE_ID
}
@@ -20,13 +36,12 @@ public class MsgIdToDictionaryMapper ext
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
//message id is in the key: /201008/AANLkTikvVnhNH+Y5AGEwqd2=u0CFv2mCm0ce6E6oBnj1@mail.gmail.com
String keyStr = key.toString();
- int idx = keyStr.lastIndexOf("/");
- String msgId = null;
- if (idx != -1) {
- msgId = keyStr.substring(idx + 1);
- context.write(new Text(msgId), new VarIntWritable(1));
- } else {
+ int idx = keyStr.lastIndexOf('/');
+ if (idx == -1) {
context.getCounter(Counters.NO_MESSAGE_ID).increment(1);
+ } else {
+ String msgId = keyStr.substring(idx + 1);
+ context.write(new Text(msgId), new VarIntWritable(1));
}
}
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Sat Oct 15 14:08:33 2011
@@ -65,8 +65,8 @@ public class WikipediaDatasetCreatorMapp
String catMatch = findMatchingCategory(document);
if (!"Unknown".equals(catMatch)) {
StringBuilder contents = new StringBuilder(1000);
- document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
- WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
+ document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(
+ OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
TokenStream stream = analyzer.reusableTokenStream(catMatch, new StringReader(document));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();
@@ -74,7 +74,7 @@ public class WikipediaDatasetCreatorMapp
contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
}
context.write(
- new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
+ new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
new Text(contents.toString()));
}
}
@@ -132,11 +132,12 @@ public class WikipediaDatasetCreatorMapp
// categories.add(category.toLowerCase());
if (exactMatchOnly && inputCategories.contains(category)) {
return category;
- } else if (!exactMatchOnly) {
+ }
+ if (!exactMatchOnly) {
for (int i = 0; i < inputCategories.size(); i++) {
String inputCategory = inputCategories.get(i);
Pattern inputCategoryPattern = inputCategoryPatterns.get(i);
- if (inputCategoryPattern.matcher(category).matches()) { // inexact match with word boundary.
+ if (inputCategoryPattern.matcher(category).matches()) { // inexact match with word boundary.
return inputCategory;
}
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailMapper.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.classifier.email;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,18 +15,26 @@ package org.apache.mahout.classifier.ema
* limitations under the License.
*/
+package org.apache.mahout.classifier.email;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.utils.email.MailProcessor;
import java.io.IOException;
+import java.util.Locale;
+import java.util.regex.Pattern;
/**
- * Convert the labels created by the {@link org.apache.mahout.utils.email.MailProcessor} to one consumable by the classifiers
+ * Convert the labels created by the {@link MailProcessor} to one consumable by the classifiers
*/
public class PrepEmailMapper extends Mapper<WritableComparable<?>, VectorWritable, Text, VectorWritable> {
+
+ private static final Pattern DASH_DOT = Pattern.compile("-|\\.");
+ private static final Pattern SLASH = Pattern.compile("\\/");
+
private boolean useListName = false;//if true, use the project name and the list name in label creation
@Override
protected void setup(Context context) throws IOException, InterruptedException {
@@ -35,18 +42,24 @@ public class PrepEmailMapper extends Map
}
@Override
- protected void map(WritableComparable<?> key, VectorWritable value, Context context) throws IOException, InterruptedException {
+ protected void map(WritableComparable<?> key, VectorWritable value, Context context)
+ throws IOException, InterruptedException {
String input = key.toString();
///Example: /cocoon.apache.org/dev/200307.gz/001401c3414f$8394e160$1e01a8c0@WRPO
- String[] splits = input.split("\\/");
+ String[] splits = SLASH.split(input);
//we need the first two splits;
if (splits.length >= 3) {
- StringBuilder bldr = new StringBuilder(splits[1].replaceAll("-|\\.", "_").toLowerCase());
- if (useListName == true) {
- bldr.append("_").append(splits[2].replaceAll("-|\\.", "_").toLowerCase());
+ StringBuilder bldr = new StringBuilder();
+ bldr.append(escape(splits[1]));
+ if (useListName) {
+ bldr.append('_').append(escape(splits[2]));
}
context.write(new Text(bldr.toString()), value);
}
}
+
+ private static String escape(CharSequence value) {
+ return DASH_DOT.matcher(value).replaceAll("_").toLowerCase(Locale.ENGLISH);
+ }
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/email/PrepEmailReducer.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.classifier.email;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +15,7 @@ package org.apache.mahout.classifier.ema
* limitations under the License.
*/
+package org.apache.mahout.classifier.email;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
@@ -24,19 +24,18 @@ import org.apache.mahout.math.VectorWrit
import java.io.IOException;
import java.util.Iterator;
-/**
- *
- *
- **/
public class PrepEmailReducer extends Reducer<Text, VectorWritable, Text, VectorWritable>{
- long maxItemsPerLabel = 10000;
+
+ private long maxItemsPerLabel = 10000;
+
@Override
protected void setup(Context context) throws IOException, InterruptedException {
maxItemsPerLabel = Long.parseLong(context.getConfiguration().get(PrepEmailVectorsDriver.ITEMS_PER_CLASS));
}
@Override
- protected void reduce(Text key, Iterable<VectorWritable> values, Context context) throws IOException, InterruptedException {
+ protected void reduce(Text key, Iterable<VectorWritable> values, Context context)
+ throws IOException, InterruptedException {
//TODO: support randomization? Likely not needed due to the SplitInput utility which does random selection
long i = 0;
Iterator<VectorWritable> iterator = values.iterator();
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sequencelearning/hmm/PosTagger.java Sat Oct 15 14:08:33 2011
@@ -211,7 +211,7 @@ public final class PosTagger {
}
private static void testModel(String testingURL) throws IOException {
- log.info("Reading and parsing test data file from URL:" + testingURL);
+ log.info("Reading and parsing test data file from URL: {}", testingURL);
long start = System.currentTimeMillis();
readFromURL(testingURL, false);
long end = System.currentTimeMillis();
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/AdaptiveLogisticModelParameters.java Sat Oct 15 14:08:33 2011
@@ -29,6 +29,7 @@ import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import com.google.common.io.Closeables;
@@ -61,8 +62,8 @@ public class AdaptiveLogisticModelParame
public void checkParameters() {
if (prior != null) {
- if ("TP".equals(prior.toUpperCase().trim()) ||
- "EBP".equals(prior.toUpperCase().trim())) {
+ if ("TP".equals(prior.toUpperCase(Locale.ENGLISH).trim()) ||
+ "EBP".equals(prior.toUpperCase(Locale.ENGLISH).trim())) {
if (Double.isNaN(priorOption)) {
throw new IllegalArgumentException("You must specify a double value for TPrior and ElasticBandPrior.");
}
@@ -74,19 +75,19 @@ public class AdaptiveLogisticModelParame
if (cmd == null) {
return null;
}
- if ("L1".equals(cmd.toUpperCase().trim())) {
+ if ("L1".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new L1();
}
- if ("L2".equals(cmd.toUpperCase().trim())) {
+ if ("L2".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new L2();
}
- if ("UP".equals(cmd.toUpperCase().trim())) {
+ if ("UP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new UniformPrior();
}
- if ("TP".equals(cmd.toUpperCase().trim())) {
+ if ("TP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new TPrior(priorOption);
}
- if ("EBP".equals(cmd.toUpperCase().trim())) {
+ if ("EBP".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new ElasticBandPrior(priorOption);
}
@@ -97,10 +98,10 @@ public class AdaptiveLogisticModelParame
if (cmd == null) {
return null;
}
- if ("GLOBAL".equals(cmd.toUpperCase().trim())) {
+ if ("GLOBAL".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new GlobalOnlineAuc();
}
- if ("GROUPED".equals(cmd.toUpperCase().trim())) {
+ if ("GROUPED".equals(cmd.toUpperCase(Locale.ENGLISH).trim())) {
return new GroupedOnlineAuc();
}
return null;
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/display/DisplayClustering.java Sat Oct 15 14:08:33 2011
@@ -123,7 +123,7 @@ public class DisplayClustering extends F
int cx = CLUSTERS.size() - 1;
for (List<Cluster> clusters : CLUSTERS) {
g2.setStroke(new BasicStroke(cx == 0 ? 3 : 1));
- g2.setColor(COLORS[Math.min(DisplayClustering.COLORS.length - 1, cx--)]);
+ g2.setColor(COLORS[Math.min(COLORS.length - 1, cx--)]);
for (Cluster cluster : clusters) {
plotEllipse(g2, cluster.getCenter(), cluster.getRadius().times(3));
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/kmeans/Job.java Sat Oct 15 14:08:33 2011
@@ -57,8 +57,7 @@ public final class Job extends AbstractJ
Path output = new Path("output");
Configuration conf = new Configuration();
HadoopUtil.delete(conf, output);
- new Job().run(conf, new Path("testdata"), output,
- new EuclideanDistanceMeasure(), 6, 0.5, 10);
+ run(conf, new Path("testdata"), output, new EuclideanDistanceMeasure(), 6, 0.5, 10);
}
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/df/mapreduce/BuildForest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/df/mapreduce/BuildForest.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/df/mapreduce/BuildForest.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/df/mapreduce/BuildForest.java Sat Oct 15 14:08:33 2011
@@ -186,7 +186,7 @@ public class BuildForest extends Configu
// store the decision forest in the output path
Path forestPath = new Path(outputPath, "forest.seq");
- log.info("Storing the forest in: " + forestPath);
+ log.info("Storing the forest in: {}", forestPath);
DFUtils.storeWritable(getConf(), forestPath, forest);
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/travellingsalesman/EuropeanDistanceLookup.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/travellingsalesman/EuropeanDistanceLookup.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/travellingsalesman/EuropeanDistanceLookup.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/travellingsalesman/EuropeanDistanceLookup.java Sat Oct 15 14:08:33 2011
@@ -52,7 +52,7 @@ public final class EuropeanDistanceLooku
amsterdam.put("Rome", 1304);
amsterdam.put("Stockholm", 1132);
amsterdam.put("Vienna", 938);
- EuropeanDistanceLookup.DISTANCES.put("Amsterdam", amsterdam);
+ DISTANCES.put("Amsterdam", amsterdam);
Map<String,Integer> athens = Maps.newHashMapWithExpectedSize(20);
athens.put("Amsterdam", 2162);
@@ -70,7 +70,7 @@ public final class EuropeanDistanceLooku
athens.put("Rome", 1040);
athens.put("Stockholm", 2410);
athens.put("Vienna", 1280);
- EuropeanDistanceLookup.DISTANCES.put("Athens", athens);
+ DISTANCES.put("Athens", athens);
Map<String,Integer> berlin = Maps.newHashMapWithExpectedSize(20);
berlin.put("Amsterdam", 576);
@@ -88,7 +88,7 @@ public final class EuropeanDistanceLooku
berlin.put("Rome", 1185);
berlin.put("Stockholm", 818);
berlin.put("Vienna", 525);
- EuropeanDistanceLookup.DISTANCES.put("Berlin", berlin);
+ DISTANCES.put("Berlin", berlin);
Map<String,Integer> brussels = Maps.newHashMapWithExpectedSize(20);
brussels.put("Amsterdam", 171);
@@ -106,7 +106,7 @@ public final class EuropeanDistanceLooku
brussels.put("Rome", 1182);
brussels.put("Stockholm", 1284);
brussels.put("Vienna", 917);
- EuropeanDistanceLookup.DISTANCES.put("Brussels", brussels);
+ DISTANCES.put("Brussels", brussels);
Map<String,Integer> copenhagen = Maps.newHashMapWithExpectedSize(20);
copenhagen.put("Amsterdam", 622);
@@ -124,7 +124,7 @@ public final class EuropeanDistanceLooku
copenhagen.put("Rome", 1540);
copenhagen.put("Stockholm", 526);
copenhagen.put("Vienna", 876);
- EuropeanDistanceLookup.DISTANCES.put("Copenhagen", copenhagen);
+ DISTANCES.put("Copenhagen", copenhagen);
Map<String,Integer> dublin = Maps.newHashMapWithExpectedSize(20);
dublin.put("Amsterdam", 757);
@@ -142,7 +142,7 @@ public final class EuropeanDistanceLooku
dublin.put("Rome", 1903);
dublin.put("Stockholm", 1625);
dublin.put("Vienna", 1687);
- EuropeanDistanceLookup.DISTANCES.put("Dublin", dublin);
+ DISTANCES.put("Dublin", dublin);
Map<String,Integer> helsinki = Maps.newHashMapWithExpectedSize(20);
helsinki.put("Amsterdam", 1506);
@@ -160,7 +160,7 @@ public final class EuropeanDistanceLooku
helsinki.put("Rome", 2202);
helsinki.put("Stockholm", 396);
helsinki.put("Vienna", 1439);
- EuropeanDistanceLookup.DISTANCES.put("Helsinki", helsinki);
+ DISTANCES.put("Helsinki", helsinki);
Map<String,Integer> lisbon = Maps.newHashMapWithExpectedSize(20);
lisbon.put("Amsterdam", 1861);
@@ -178,7 +178,7 @@ public final class EuropeanDistanceLooku
lisbon.put("Rome", 1873);
lisbon.put("Stockholm", 2993);
lisbon.put("Vienna", 2300);
- EuropeanDistanceLookup.DISTANCES.put("Lisbon", lisbon);
+ DISTANCES.put("Lisbon", lisbon);
Map<String,Integer> london = Maps.newHashMapWithExpectedSize(20);
london.put("Amsterdam", 356);
@@ -196,7 +196,7 @@ public final class EuropeanDistanceLooku
london.put("Rome", 1444);
london.put("Stockholm", 1436);
london.put("Vienna", 1237);
- EuropeanDistanceLookup.DISTANCES.put("London", london);
+ DISTANCES.put("London", london);
Map<String,Integer> luxembourg = Maps.newHashMapWithExpectedSize(20);
luxembourg.put("Amsterdam", 318);
@@ -214,7 +214,7 @@ public final class EuropeanDistanceLooku
luxembourg.put("Rome", 995);
luxembourg.put("Stockholm", 1325);
luxembourg.put("Vienna", 761);
- EuropeanDistanceLookup.DISTANCES.put("Luxembourg", luxembourg);
+ DISTANCES.put("Luxembourg", luxembourg);
Map<String,Integer> madrid = Maps.newHashMapWithExpectedSize(20);
madrid.put("Amsterdam", 1477);
@@ -232,7 +232,7 @@ public final class EuropeanDistanceLooku
madrid.put("Rome", 1377);
madrid.put("Stockholm", 2596);
madrid.put("Vienna", 1812);
- EuropeanDistanceLookup.DISTANCES.put("Madrid", madrid);
+ DISTANCES.put("Madrid", madrid);
Map<String,Integer> paris = Maps.newHashMapWithExpectedSize(20);
paris.put("Amsterdam", 429);
@@ -250,7 +250,7 @@ public final class EuropeanDistanceLooku
paris.put("Rome", 1117);
paris.put("Stockholm", 1549);
paris.put("Vienna", 1037);
- EuropeanDistanceLookup.DISTANCES.put("Paris", paris);
+ DISTANCES.put("Paris", paris);
Map<String,Integer> rome = Maps.newHashMapWithExpectedSize(20);
rome.put("Amsterdam", 1304);
@@ -268,7 +268,7 @@ public final class EuropeanDistanceLooku
rome.put("Rome", 0);
rome.put("Stockholm", 1984);
rome.put("Vienna", 765);
- EuropeanDistanceLookup.DISTANCES.put("Rome", rome);
+ DISTANCES.put("Rome", rome);
Map<String,Integer> stockholm = Maps.newHashMapWithExpectedSize(20);
stockholm.put("Amsterdam", 1132);
@@ -286,7 +286,7 @@ public final class EuropeanDistanceLooku
stockholm.put("Rome", 1984);
stockholm.put("Stockholm", 0);
stockholm.put("Vienna", 1247);
- EuropeanDistanceLookup.DISTANCES.put("Stockholm", stockholm);
+ DISTANCES.put("Stockholm", stockholm);
Map<String,Integer> vienna = Maps.newHashMapWithExpectedSize(20);
vienna.put("Amsterdam", 938);
@@ -304,18 +304,18 @@ public final class EuropeanDistanceLooku
vienna.put("Rome", 765);
vienna.put("Stockholm", 1247);
vienna.put("Vienna", 0);
- EuropeanDistanceLookup.DISTANCES.put("Vienna", vienna);
+ DISTANCES.put("Vienna", vienna);
}
@Override
public List<String> getKnownCities() {
- List<String> cities = Lists.newArrayList(EuropeanDistanceLookup.DISTANCES.keySet());
+ List<String> cities = Lists.newArrayList(DISTANCES.keySet());
Collections.sort(cities);
return cities;
}
@Override
public int getDistance(String startingCity, String destinationCity) {
- return EuropeanDistanceLookup.DISTANCES.get(startingCity).get(destinationCity);
+ return DISTANCES.get(startingCity).get(destinationCity);
}
}
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/text/WikipediaMapper.java Sat Oct 15 14:08:33 2011
@@ -128,7 +128,8 @@ public class WikipediaMapper extends Map
String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
if (exactMatchOnly && inputCategories.contains(category)) {
return category;
- } else if (!exactMatchOnly) {
+ }
+ if (!exactMatchOnly) {
for (String inputCategory : inputCategories) {
if (category.contains(inputCategory)) { // we have an inexact match
return inputCategory;
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/jdbc/AbstractJDBCDataModel.java Sat Oct 15 14:08:33 2011
@@ -190,8 +190,7 @@ public abstract class AbstractJDBCDataMo
AbstractJDBCComponent.checkNotNullAndLog("getMinPreferenceSQL", getMinPreferenceSQL);
if (!(dataSource instanceof ConnectionPoolDataSource)) {
- AbstractJDBCDataModel.log
- .warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
+ log.warn("You are not using ConnectionPoolDataSource. Make sure your DataSource pools connections "
+ "to the database itself, or database performance will be severely reduced.");
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/cf/taste/impl/model/mongodb/MongoDBDataModel.java Sat Oct 15 14:08:33 2011
@@ -708,7 +708,8 @@ public final class MongoDBDataModel impl
private Date getDate(Object date) {
if (date.getClass().getName().contains("Date")) {
return (Date) date;
- } else if (date.getClass().getName().contains("String")) {
+ }
+ if (date.getClass().getName().contains("String")) {
try {
synchronized (dateFormat) {
return dateFormat.parse(date.toString());
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputDriver.java Sat Oct 15 14:08:33 2011
@@ -84,7 +84,7 @@ public final class InputDriver {
"org.apache.mahout.math.RandomAccessSparseVector").toString();
runJob(input, output, vectorClassName);
} catch (OptionException e) {
- InputDriver.log.error("Exception parsing command line: ", e);
+ log.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/InputMapper.java Sat Oct 15 14:08:33 2011
@@ -40,7 +40,7 @@ public class InputMapper extends Mapper<
@Override
protected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {
- String[] numbers = InputMapper.SPACE.split(values.toString());
+ String[] numbers = SPACE.split(values.toString());
// sometimes there are multiple separator spaces
Collection<Double> doubles = Lists.newArrayList();
for (String value : numbers) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputDriver.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputDriver.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputDriver.java Sat Oct 15 14:08:33 2011
@@ -72,7 +72,7 @@ public final class InputDriver {
Path output = new Path(cmdLine.getValue(outputOpt, "output").toString());
runJob(input, output);
} catch (OptionException e) {
- InputDriver.log.error("Exception parsing command line: ", e);
+ log.error("Exception parsing command line: ", e);
CommandLineUtil.printHelp(group);
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputMapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputMapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/conversion/meanshift/InputMapper.java Sat Oct 15 14:08:33 2011
@@ -38,7 +38,7 @@ public class InputMapper extends Mapper<
@Override
protected void map(LongWritable key, Text values, Context context) throws IOException, InterruptedException {
- String[] numbers = InputMapper.SPACE.split(values.toString());
+ String[] numbers = SPACE.split(values.toString());
// sometimes there are multiple separator spaces
Collection<Double> doubles = Lists.newArrayList();
for (String value : numbers) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/clustering/evaluation/ClusterEvaluator.java Sat Oct 15 14:08:33 2011
@@ -125,7 +125,7 @@ public class ClusterEvaluator {
for (Iterator<Cluster> it = clusters.iterator(); it.hasNext();) {
Cluster cluster = it.next();
if (invalidCluster(cluster)) {
- log.info("Pruning cluster Id=" + cluster.getId());
+ log.info("Pruning cluster Id={}", cluster.getId());
it.remove();
representativePoints.remove(cluster.getId());
}
@@ -156,7 +156,7 @@ public class ClusterEvaluator {
}
}
double density = (sum / count - min) / (max - min);
- log.info("Inter-Cluster Density = " + density);
+ log.info("Inter-Cluster Density = {}", density);
return density;
}
@@ -186,10 +186,10 @@ public class ClusterEvaluator {
}
double density = (sum / count - min) / (max - min);
avgDensity += density;
- log.info("Intra-Cluster Density[" + cluster.getId() + "] = " + density);
+ log.info("Intra-Cluster Density[{}] = {}", cluster.getId(), density);
}
avgDensity = clusters.isEmpty() ? 0 : avgDensity / clusters.size();
- log.info("Intra-Cluster Density = " + avgDensity);
+ log.info("Intra-Cluster Density = {}", avgDensity);
return avgDensity;
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Sat Oct 15 14:08:33 2011
@@ -17,7 +17,6 @@
package org.apache.mahout.text;
-import java.io.IOException;
import java.lang.reflect.Constructor;
import java.nio.charset.Charset;
import java.util.Map;
@@ -123,7 +122,7 @@ public class SequenceFilesFromDirectory
* Override this method in order to parse your additional options from the command line. Do not forget to call
* super() otherwise standard options (input/output dirs etc) will not be available.
*/
- protected Map<String, String> parseOptions() throws IOException {
+ protected Map<String, String> parseOptions() {
Map<String, String> options = Maps.newHashMap();
options.put(CHUNK_SIZE_OPTION[0], getOption(CHUNK_SIZE_OPTION[0]));
options.put(FILE_FILTER_CLASS_OPTION[0], getOption(FILE_FILTER_CLASS_OPTION[0]));
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SplitInput.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SplitInput.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SplitInput.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/SplitInput.java Sat Oct 15 14:08:33 2011
@@ -228,7 +228,8 @@ public class SplitInput {
if (cmdLine.hasOption(testSplitSizeOpt) && cmdLine.hasOption(testSplitPctOpt)) {
throw new OptionException(testSplitSizeOpt, "must have either split size or split percentage option, not BOTH");
- } else if (!cmdLine.hasOption(testSplitSizeOpt) && !cmdLine.hasOption(testSplitPctOpt) && !cmdLine.hasOption(randomSelectionPctOpt) && !cmdLine.hasOption(randomSelectionSizeOpt)) {
+ }
+ if (!cmdLine.hasOption(testSplitSizeOpt) && !cmdLine.hasOption(testSplitPctOpt) && !cmdLine.hasOption(randomSelectionPctOpt) && !cmdLine.hasOption(randomSelectionSizeOpt)) {
throw new OptionException(testSplitSizeOpt, "must set one of test split size/percentage or randomSelectionSize/percentage");
}
@@ -280,7 +281,8 @@ public class SplitInput {
public void splitDirectory(Path inputDir) throws IOException {
if (fs.getFileStatus(inputDir) == null) {
throw new IOException(inputDir + " does not exist");
- } else if (!fs.getFileStatus(inputDir).isDir()) {
+ }
+ if (!fs.getFileStatus(inputDir).isDir()) {
throw new IOException(inputDir + " is not a directory");
}
@@ -301,7 +303,8 @@ public class SplitInput {
public void splitFile(Path inputFile) throws IOException {
if (fs.getFileStatus(inputFile) == null) {
throw new IOException(inputFile + " does not exist");
- } else if (fs.getFileStatus(inputFile).isDir()) {
+ }
+ if (fs.getFileStatus(inputFile).isDir()) {
throw new IOException(inputFile + " is a directory");
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/ClusterDumper.java Sat Oct 15 14:08:33 2011
@@ -50,7 +50,12 @@ import java.util.Map;
import java.util.TreeMap;
public final class ClusterDumper extends AbstractJob {
- public enum OUTPUT_FORMAT{TEXT, CSV, GRAPH_ML};
+
+ public enum OUTPUT_FORMAT {
+ TEXT,
+ CSV,
+ GRAPH_ML,
+ }
public static final String OUTPUT_OPTION = "output";
public static final String DICTIONARY_TYPE_OPTION = "dictionaryType";
@@ -172,25 +177,19 @@ public final class ClusterDumper extends
}
}
- protected ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException {
+ ClusterWriter createClusterWriter(Writer writer, String[] dictionary) throws IOException {
ClusterWriter result = null;
switch (outputFormat){
- case TEXT:{
+ case TEXT:
result = new ClusterDumperWriter(writer, clusterIdToPoints, numTopFeatures, dictionary, subString);
break;
- }
- case CSV:{
+ case CSV:
result = new CSVClusterWriter(writer, clusterIdToPoints);
break;
- }
- case GRAPH_ML:{
+ case GRAPH_ML:
result = new GraphMLClusterWriter(writer, clusterIdToPoints);
break;
- }
- default:{
- break;
- }
}
return result;
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/clustering/GraphMLClusterWriter.java Sat Oct 15 14:08:33 2011
@@ -1,25 +1,43 @@
-package org.apache.mahout.utils.clustering;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.utils.clustering;
import org.apache.mahout.clustering.Cluster;
import org.apache.mahout.clustering.WeightedVectorWritable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.io.AbstractClusterWriter;
-import org.apache.mahout.utils.vectors.io.ClusterWriter;
import java.io.IOException;
import java.io.Writer;
import java.util.List;
import java.util.Map;
+import java.util.regex.Pattern;
/**
* GraphML -- see http://gephi.org/users/supported-graph-formats/graphml-format/
- *
- **/
-public class GraphMLClusterWriter extends AbstractClusterWriter implements ClusterWriter {
+ */
+public class GraphMLClusterWriter extends AbstractClusterWriter {
+
+ private static final Pattern VEC_PATTERN = Pattern.compile("\\{|\\:|\\,|\\}");
- public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints) throws IOException {
+ public GraphMLClusterWriter(Writer writer, Map<Integer, List<WeightedVectorWritable>> clusterIdToPoints)
+ throws IOException {
super(writer, clusterIdToPoints);
writer.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
writer.append("<graphml xmlns=\"http://graphml.graphdrawing.org/xmlns\"\n" +
@@ -28,25 +46,26 @@ public class GraphMLClusterWriter extend
"http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd\">");
writer.append("<graph edgedefault=\"undirected\">");
}
- /*
-<?xml version="1.0" encoding="UTF-8"?>
-<graphml xmlns="http://graphml.graphdrawing.org/xmlns"
-xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns
-http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
-<graph id="G" edgedefault="undirected">
-<node id="n0"/>
-<node id="n1"/>
-<edge id="e1" source="n0" target="n1"/>
-</graph>
-</graphml>
+ /*
+ <?xml version="1.0" encoding="UTF-8"?>
+ <graphml xmlns="http://graphml.graphdrawing.org/xmlns"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns
+ http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
+ <graph id="G" edgedefault="undirected">
+ <node id="n0"/>
+ <node id="n1"/>
+ <edge id="e1" source="n0" target="n1"/>
+ </graph>
+ </graphml>
*/
+
@Override
public void write(Cluster cluster) throws IOException {
StringBuilder line = new StringBuilder();
line.append(createNode(String.valueOf(cluster.getId())));
- List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
+ List<WeightedVectorWritable> points = getClusterIdToPoints().get(cluster.getId());
if (points != null) {
for (WeightedVectorWritable point : points) {
Vector theVec = point.getVector();
@@ -57,26 +76,26 @@ http://graphml.graphdrawing.org/xmlns/1.
} else {
vecStr = theVec.asFormatString();
//do some basic manipulations for display
- vecStr = vecStr.replaceAll("\\{|\\:|\\,|\\}", "_");
+ vecStr = VEC_PATTERN.matcher(vecStr).replaceAll("_");
line.append(createNode(vecStr));
}
line.append(createEdge(String.valueOf(cluster.getId()), vecStr));
}
- writer.append(line).append("\n");
+ getWriter().append(line).append("\n");
}
}
- private String createEdge(String left, String right) {
- return "<edge id=\"" + left + "_" + right + "\" source=\"" + left + "\" target=\"" + right + "\"/>";
+ private static String createEdge(String left, String right) {
+ return "<edge id=\"" + left + '_' + right + "\" source=\"" + left + "\" target=\"" + right + "\"/>";
}
- private String createNode(String s) {
+ private static String createNode(String s) {
return "<node id=\"" + s + "\"/>";
}
@Override
public void close() throws IOException {
- writer.append("</graph>");
+ getWriter().append("</graph>");
super.close();
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.utils.email;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,32 +15,110 @@ package org.apache.mahout.utils.email;
* limitations under the License.
*/
-
-
+package org.apache.mahout.utils.email;
import java.io.File;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.regex.Pattern;
-/**
-*
-*
-**/
public class MailOptions {
+
public static final String FROM = "FROM";
public static final String TO = "TO";
public static final String REFS = "REFS";
public static final String SUBJECT = "SUBJECT";
- public File input;
- public String outputDir;
- public String prefix;
- public int chunkSize;
- public Charset charset;
- public String separator;
- public String bodySeparator = "\n";
- public boolean includeBody;
- public Pattern[] patternsToMatch;
+
+ private File input;
+ private String outputDir;
+ private String prefix;
+ private int chunkSize;
+ private Charset charset;
+ private String separator;
+ private String bodySeparator = "\n";
+ private boolean includeBody;
+ private Pattern[] patternsToMatch;
//maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch. See MailToRecMapper
- public Map<String, Integer> patternOrder;
+ private Map<String, Integer> patternOrder;
+
+ public File getInput() {
+ return input;
+ }
+
+ public void setInput(File input) {
+ this.input = input;
+ }
+
+ public String getOutputDir() {
+ return outputDir;
+ }
+
+ public void setOutputDir(String outputDir) {
+ this.outputDir = outputDir;
+ }
+
+ public String getPrefix() {
+ return prefix;
+ }
+
+ public void setPrefix(String prefix) {
+ this.prefix = prefix;
+ }
+
+ public int getChunkSize() {
+ return chunkSize;
+ }
+
+ public void setChunkSize(int chunkSize) {
+ this.chunkSize = chunkSize;
+ }
+
+ public Charset getCharset() {
+ return charset;
+ }
+
+ public void setCharset(Charset charset) {
+ this.charset = charset;
+ }
+
+ public String getSeparator() {
+ return separator;
+ }
+
+ public void setSeparator(String separator) {
+ this.separator = separator;
+ }
+
+ public String getBodySeparator() {
+ return bodySeparator;
+ }
+
+ public void setBodySeparator(String bodySeparator) {
+ this.bodySeparator = bodySeparator;
+ }
+
+ public boolean isIncludeBody() {
+ return includeBody;
+ }
+
+ public void setIncludeBody(boolean includeBody) {
+ this.includeBody = includeBody;
+ }
+
+ public Pattern[] getPatternsToMatch() {
+ return patternsToMatch;
+ }
+
+ public void setPatternsToMatch(Pattern[] patternsToMatch) {
+ this.patternsToMatch = patternsToMatch;
+ }
+
+ public Map<String, Integer> getPatternOrder() {
+ return patternOrder;
+ }
+
+ public void setPatternOrder(Map<String, Integer> patternOrder) {
+ this.patternOrder = patternOrder;
+ }
+
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.utils.email;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,6 +15,7 @@ package org.apache.mahout.utils.email;
* limitations under the License.
*/
+package org.apache.mahout.utils.email;
import org.apache.mahout.common.iterator.FileLineIterable;
import org.apache.mahout.utils.io.ChunkedWriter;
@@ -30,10 +30,6 @@ import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-/**
- *
- *
- **/
public class MailProcessor {
private static final Pattern MESSAGE_START =
Pattern.compile("^From \\S+@\\S.*\\d{4}$", Pattern.CASE_INSENSITIVE);
@@ -48,9 +44,9 @@ public class MailProcessor {
Pattern.compile("^references: (.*)$", Pattern.CASE_INSENSITIVE);
public static final Pattern TO_PREFIX =
Pattern.compile("^to: (.*)$", Pattern.CASE_INSENSITIVE);
- private String prefix;
- private MailOptions options;
- private WrappedWriter writer;
+ private final String prefix;
+ private final MailOptions options;
+ private final WrappedWriter writer;
public MailProcessor(MailOptions options, String prefix, Writer writer) {
this.writer = new IOWriterWrapper(writer);
@@ -70,17 +66,17 @@ public class MailProcessor {
StringBuilder contents = new StringBuilder();
// tmps used during mail message parsing
StringBuilder body = new StringBuilder();
- String messageId = null;
- boolean inBody = false;
Matcher messageIdMatcher = MESSAGE_ID_PREFIX.matcher("");
Matcher messageBoundaryMatcher = MESSAGE_START.matcher("");
- String[] patternResults = new String[options.patternsToMatch.length];
- Matcher[] matchers = new Matcher[options.patternsToMatch.length];
+ String[] patternResults = new String[options.getPatternsToMatch().length];
+ Matcher[] matchers = new Matcher[options.getPatternsToMatch().length];
for (int i = 0; i < matchers.length; i++) {
- matchers[i] = options.patternsToMatch[i].matcher("");
+ matchers[i] = options.getPatternsToMatch()[i].matcher("");
}
- for (String nextLine : new FileLineIterable(mboxFile, options.charset, false)) {
+ String messageId = null;
+ boolean inBody = false;
+ for (String nextLine : new FileLineIterable(mboxFile, options.getCharset(), false)) {
for (int i = 0; i < matchers.length; i++) {
Matcher matcher = matchers[i];
matcher.reset(nextLine);
@@ -97,7 +93,7 @@ public class MailProcessor {
// done parsing this message ... write it out
String key = generateKey(mboxFile, prefix, messageId);
//if this ordering changes, then also change FromEmailToDictionaryMapper
- writeContent(options.separator, contents, body, patternResults);
+ writeContent(options.getSeparator(), contents, body, patternResults);
writer.write(key, contents.toString());
contents.setLength(0); // reset the buffer
body.setLength(0);
@@ -105,9 +101,9 @@ public class MailProcessor {
messageId = null;
inBody = false;
} else {
- if (inBody && options.includeBody) {
+ if (inBody && options.isIncludeBody()) {
if (nextLine.length() > 0) {
- body.append(nextLine).append(options.bodySeparator);
+ body.append(nextLine).append(options.getBodySeparator());
}
} else {
// first empty line we see after reading the message Id
@@ -128,7 +124,7 @@ public class MailProcessor {
// write the last message in the file if available
if (messageId != null) {
String key = generateKey(mboxFile, prefix, messageId);
- writeContent(options.separator, contents, body, patternResults);
+ writeContent(options.getSeparator(), contents, body, patternResults);
writer.write(key, contents.toString());
contents.setLength(0); // reset the buffer
}
@@ -139,7 +135,7 @@ public class MailProcessor {
return messageCount;
}
- protected String generateKey(File mboxFile, String prefix, String messageId) {
+ protected static String generateKey(File mboxFile, String prefix, String messageId) {
return prefix + File.separator + mboxFile.getName() + File.separator + messageId;
}
@@ -151,11 +147,10 @@ public class MailProcessor {
return options;
}
- private void writeContent(String separator, StringBuilder contents, StringBuilder body, String[] matches) {
- for (int i = 0; i < matches.length; i++) {
- String match = matches[i];
+ private static void writeContent(String separator, StringBuilder contents, CharSequence body, String[] matches) {
+ for (String match : matches) {
contents.append(match).append(separator);
}
- contents.append("\n").append(body);
+ contents.append('\n').append(body);
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/ChunkedWrapper.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.utils.io;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,15 +15,13 @@ package org.apache.mahout.utils.io;
* limitations under the License.
*/
+package org.apache.mahout.utils.io;
import java.io.IOException;
-/**
-*
-*
-**/
-public class ChunkedWrapper extends WrappedWriter {
- ChunkedWriter writer;
+public class ChunkedWrapper implements WrappedWriter {
+
+ private final ChunkedWriter writer;
public ChunkedWrapper(ChunkedWriter writer) {
this.writer = writer;
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/IOWriterWrapper.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.utils.io;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,16 +15,14 @@ package org.apache.mahout.utils.io;
* limitations under the License.
*/
+package org.apache.mahout.utils.io;
import java.io.IOException;
import java.io.Writer;
-/**
-*
-*
-**/
-public class IOWriterWrapper extends WrappedWriter {
- Writer writer;
+public class IOWriterWrapper implements WrappedWriter {
+
+ private final Writer writer;
public IOWriterWrapper(Writer writer) {
this.writer = writer;
@@ -33,7 +30,7 @@ public class IOWriterWrapper extends Wra
@Override
public void write(String key, String value) throws IOException {
- writer.write(key + " " + value);
+ writer.write(key + ' ' + value);
}
@Override
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/io/WrappedWriter.java Sat Oct 15 14:08:33 2011
@@ -1,4 +1,3 @@
-package org.apache.mahout.utils.io;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -16,17 +15,16 @@ package org.apache.mahout.utils.io;
* limitations under the License.
*/
+package org.apache.mahout.utils.io;
import java.io.Closeable;
import java.io.IOException;
/**
-* Convenience class for wrapping either a java.io.Writer or a SequenceFile.Writer with some basic functionality
-*
-**/
-public abstract class WrappedWriter implements Closeable {
- public abstract void write(String key, String value) throws IOException;
+ * Convenience class for wrapping either a java.io.Writer or a SequenceFile.Writer with some basic functionality
+ */
+public interface WrappedWriter extends Closeable {
+
+ void write(String key, String value) throws IOException;
- @Override
- public abstract void close() throws IOException;
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sat Oct 15 14:08:33 2011
@@ -119,7 +119,7 @@ public final class VectorHelper {
if (line.startsWith("#")) {
continue;
}
- String[] tokens = VectorHelper.TAB_PATTERN.split(line);
+ String[] tokens = TAB_PATTERN.split(line);
if (tokens.length < 3) {
continue;
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterable.java Sat Oct 15 14:08:33 2011
@@ -102,7 +102,7 @@ public class ARFFVectorIterable implemen
type = ARFFType.NOMINAL;
//@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}
int classIdx = lower.indexOf(ARFFType.NOMINAL.getIndicator());
- String [] classes = ARFFVectorIterable.COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
+ String[] classes = COMMA_PATTERN.split(line.substring(classIdx + 1, line.length() - 1));
for (int i = 0; i < classes.length; i++) {
model.addNominal(label, classes[i].trim(), i);
}
@@ -113,7 +113,7 @@ public class ARFFVectorIterable implemen
//TODO: DateFormatter map
DateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss", Locale.ENGLISH);
int idx = lower.indexOf(ARFFType.DATE.getIndicator());
- String[] split = ARFFVectorIterable.SPACE_PATTERN.split(line);
+ String[] split = SPACE_PATTERN.split(line);
if (split.length >= 4) { //we have a date format
String formStr = line.substring(idx + ARFFType.DATE.getIndicator().length()).trim();
if (formStr.startsWith("\"")) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java?rev=1183642&r1=1183641&r2=1183642&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/MapBackedARFFModel.java Sat Oct 15 14:08:33 2011
@@ -84,7 +84,7 @@ public class MapBackedARFFModel implemen
@Override
public double getValue(String data, int idx) {
ARFFType type = typeMap.get(idx);
- data = MapBackedARFFModel.QUOTE_PATTERN.matcher(data).replaceAll("");
+ data = QUOTE_PATTERN.matcher(data).replaceAll("");
data = data.trim();
double result;
switch (type) {
@@ -127,7 +127,7 @@ public class MapBackedARFFModel implemen
// Not sure how scalable this is going to be
protected double processString(String data) {
- data = MapBackedARFFModel.QUOTE_PATTERN.matcher(data).replaceAll("");
+ data = QUOTE_PATTERN.matcher(data).replaceAll("");
// map it to an long
Long theLong = words.get(data);
if (theLong == null) {