You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/07/05 09:18:32 UTC
svn commit: r1142907 - in /mahout/trunk:
core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/
core/src/main/java/org/apache/mahout/classifier/sgd/
core/src/main/java/org/apache/mahout/common/nlp/
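core/src/main/java/org/apache/mahout/vectorizer/encoders/
core/src/test/java/org/apache/mahout/classifier/sgd/
examples/src/main/java/org/apache/mahout/classifier/sgd/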
Author: srowen
Date: Tue Jul 5 07:18:31 2011
New Revision: 1142907
URL: http://svn.apache.org/viewvc?rev=1142907&view=rev
Log:
Harmonize some splitters / patterns to split on space / tab, in Bayes
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/bayes/mapreduce/common/BayesFeatureMapper.java Tue Jul 5 07:18:31 2011
@@ -47,7 +47,7 @@ public class BayesFeatureMapper extends
private static final Logger log = LoggerFactory.getLogger(BayesFeatureMapper.class);
private static final DoubleWritable ONE = new DoubleWritable(1.0);
- private static final Pattern SPACE_PATTERN = Pattern.compile("[ ]+");
+ private static final Pattern SPACE_TAB = Pattern.compile("[ \t]+");
private int gramSize = 1;
@@ -73,7 +73,7 @@ public class BayesFeatureMapper extends
final OutputCollector<StringTuple,DoubleWritable> output,
Reporter reporter) throws IOException {
final String label = key.toString();
- String[] tokens = SPACE_PATTERN.split(value.toString());
+ String[] tokens = SPACE_TAB.split(value.toString());
OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize);
if (gramSize > 1) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/classifier/sgd/CsvRecordFactory.java Tue Jul 5 07:18:31 2011
@@ -70,7 +70,7 @@ public class CsvRecordFactory implements
// crude CSV value splitter. This will fail if any double quoted strings have
// commas inside. Also, escaped quotes will not be unescaped. Good enough for now.
- private final Splitter onComma = Splitter.on(",").trimResults(CharMatcher.is('"'));
+ private final Splitter COMMA = Splitter.on(',').trimResults(CharMatcher.is('"'));
private static final Map<String, Class<? extends FeatureVectorEncoder>> TYPE_DICTIONARY =
ImmutableMap.<String, Class<? extends FeatureVectorEncoder>>builder()
@@ -157,7 +157,7 @@ public class CsvRecordFactory implements
public void firstLine(String line) {
// read variable names, build map of name -> column
final Map<String, Integer> vars = Maps.newHashMap();
- variableNames = Lists.newArrayList(onComma.split(line));
+ variableNames = Lists.newArrayList(COMMA.split(line));
int column = 0;
for (String var : variableNames) {
vars.put(var, column++);
@@ -226,7 +226,7 @@ public class CsvRecordFactory implements
*/
@Override
public int processLine(String line, Vector featureVector) {
- List<String> values = Lists.newArrayList(onComma.split(line));
+ List<String> values = Lists.newArrayList(COMMA.split(line));
int targetValue = targetDictionary.intern(values.get(target));
if (targetValue >= maxTargetValue) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/nlp/NGrams.java Tue Jul 5 07:18:31 2011
@@ -17,6 +17,7 @@
package org.apache.mahout.common.nlp;
+import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
@@ -27,7 +28,7 @@ import java.util.Map;
public class NGrams {
- private static final Splitter SPACE = Splitter.on(' ');
+ private static final Splitter SPACE_TAB = Splitter.on(CharMatcher.anyOf(" \t"));
private final String line;
private final int gramSize;
@@ -40,7 +41,7 @@ public class NGrams {
public Map<String,List<String>> generateNGrams() {
Map<String,List<String>> returnDocument = Maps.newHashMap();
- Iterator<String> tokenizer = SPACE.split(line).iterator();
+ Iterator<String> tokenizer = SPACE_TAB.split(line).iterator();
List<String> tokens = Lists.newArrayList();
String labelName = tokenizer.next();
List<String> previousN1Grams = Lists.newArrayList();
@@ -70,7 +71,7 @@ public class NGrams {
List<String> tokens = Lists.newArrayList();
List<String> previousN1Grams = Lists.newArrayList();
- for (String nextToken : SPACE.split(line)) {
+ for (String nextToken : SPACE_TAB.split(line)) {
if (previousN1Grams.size() == gramSize) {
previousN1Grams.remove(0);
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/encoders/TextValueEncoder.java Tue Jul 5 07:18:31 2011
@@ -35,9 +35,10 @@ import java.util.regex.Pattern;
*/
public class TextValueEncoder extends FeatureVectorEncoder {
- private static final double LOG_2 = Math.log(2);
+ private static final double LOG_2 = Math.log(2.0);
+
+ private static final Splitter ON_NON_WORD = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();
- private final Splitter onNonWord = Splitter.on(Pattern.compile("\\W+")).omitEmptyStrings();
private FeatureVectorEncoder wordEncoder;
private final Multiset<String> counts;
@@ -110,7 +111,7 @@ public class TextValueEncoder extends Fe
* @see LuceneTextValueEncoder
*/
protected Iterable<String> tokenize(CharSequence originalForm) {
- return onNonWord.split(originalForm);
+ return ON_NON_WORD.split(originalForm);
}
/**
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/classifier/sgd/OnlineBaseTest.java Tue Jul 5 07:18:31 2011
@@ -126,7 +126,7 @@ public abstract class OnlineBaseTest ext
* @throws IOException If there is an error reading the data
*/
protected static Matrix readCsv(String resourceName) throws IOException {
- Splitter onCommas = Splitter.on(",").trimResults(CharMatcher.anyOf(" \""));
+ Splitter onCommas = Splitter.on(',').trimResults(CharMatcher.anyOf(" \""));
Readable isr = new InputStreamReader(Resources.getResource(resourceName).openStream(), Charsets.UTF_8);
List<String> data = CharStreams.readLines(isr);
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java?rev=1142907&r1=1142906&r2=1142907&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/sgd/SimpleCsvExamples.java Tue Jul 5 07:18:31 2011
@@ -59,8 +59,7 @@ import java.util.Random;
*/
public final class SimpleCsvExamples {
- public static final int SEPARATOR_CHAR = '\t';
- public static final String SEPARATOR = "\t";
+ public static final char SEPARATOR_CHAR = '\t';
private static final int FIELDS = 100;
private SimpleCsvExamples() {
@@ -137,8 +136,8 @@ public final class SimpleCsvExamples {
private static final class Line {
- private static final Splitter ON_TABS = Splitter.on(SEPARATOR).trimResults();
- public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR);
+ private static final Splitter ON_TABS = Splitter.on(SEPARATOR_CHAR).trimResults();
+ public static final Joiner WITH_COMMAS = Joiner.on(SEPARATOR_CHAR);
public static final Random rand = RandomUtils.getRandom();