You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@mahout.apache.org by deneche abdelhakim <ad...@apache.org> on 2009/09/20 16:45:16 UTC

Re: svn commit: r816569 - in /lucene/mahout/trunk/examples/src: main/java/org/apache/mahout/classifier/bayes/ main/java/org/apache/mahout/clustering/meanshift/ main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/ main/java/org/apache/mahou

The change in "examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java"
could lead to a bug. The problem is in the following modification:

-      rng = new MersenneTwisterRNG(split.getSeed());
+      rng = RandomUtils.getRandom();

rng is supposed to use the seed given by split.

I tried to correct this line myself, but I'm having problems
committing the change. I'm getting the following message from svn:

svn: Commit failed (details follow):
svn: Server sent unexpected return value (403 Forbidden) in response
to MKACTIVITY request for
'/repos/asf/!svn/act/627fc1d8-98ad-4046-ae77-41962e731928'

although I successfully committed my changes to the site.


On Fri, Sep 18, 2009 at 11:01 AM,  <sr...@apache.org> wrote:
> Author: srowen
> Date: Fri Sep 18 10:01:12 2009
> New Revision: 816569
>
> URL: http://svn.apache.org/viewvc?rev=816569&view=rev
> Log:
> Bit of cleanup and, I think, a fix to the WikipediaDatasetCreatorMapper?
>
> Modified:
>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java
>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java
>    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java
>    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java
>    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
>
> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Fri Sep 18 10:01:12 2009
> @@ -42,12 +42,15 @@
>
>  public class WikipediaDatasetCreatorMapper extends MapReduceBase implements
>     Mapper<LongWritable, Text, Text, Text> {
> -  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
>
> -  private static Set<String> inputCategories = null;
> -  private static boolean exactMatchOnly = false;
> -  private static Analyzer analyzer;
> +  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
>   private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
> +  private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
> +  private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
> +
> +  private Set<String> inputCategories = null;
> +  private boolean exactMatchOnly = false;
> +  private Analyzer analyzer;
>
>   @Override
>   public void map(LongWritable key, Text value,
> @@ -59,7 +62,7 @@
>     String catMatch = findMatchingCategory(document);
>
>     if(!catMatch.equals("Unknown")){
> -      document = StringEscapeUtils.unescapeHtml(document.replaceFirst("<text xml:space=\"preserve\">", "").replaceAll("</text>", ""));
> +      document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
>       TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
>       Token token = new Token();
>       while((token = stream.next(token)) != null){
> @@ -69,18 +72,19 @@
>     }
>   }
>
> -  public static String findMatchingCategory(String document){
> +  private String findMatchingCategory(String document){
>     int startIndex = 0;
>     int categoryIndex;
> -    String match = null; // TODO this is never updated?
>     while((categoryIndex = document.indexOf("[[Category:", startIndex))!=-1)
>     {
>       categoryIndex+=11;
>       int endIndex = document.indexOf("]]", categoryIndex);
> -      if(endIndex>=document.length() || endIndex < 0) break;
> +      if (endIndex >= document.length() || endIndex < 0) {
> +        break;
> +      }
>       String category = document.substring(categoryIndex, endIndex).toLowerCase().trim();
>       //categories.add(category.toLowerCase());
> -      if (exactMatchOnly == true && inputCategories.contains(category)){
> +      if (exactMatchOnly && inputCategories.contains(category)){
>         return category;
>       } else if (exactMatchOnly == false){
>         for (String inputCategory : inputCategories) {
> @@ -91,17 +95,12 @@
>       }
>       startIndex = endIndex;
>     }
> -    if (match == null){
> -      match = "Unknown";
> -    }
> -
> -    return match;
> +    return "Unknown";
>   }
>
>   @Override
>   public void configure(JobConf job) {
>     try {
> -      //Is this thread-safe?
>       if (inputCategories == null){
>         Set<String> newCategories = new HashSet<String>();
>
>
> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java (original)
> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java Fri Sep 18 10:01:12 2009
> @@ -39,7 +39,8 @@
>
>   private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
>
> -  private static final List<List<Vector>> iterationCenters = new ArrayList<List<Vector>>();
> +  // TODO this is never queried?
> +  //private static final List<List<Vector>> iterationCenters = new ArrayList<List<Vector>>();
>
>   @Override
>   public void paint(Graphics g) {
> @@ -87,7 +88,7 @@
>         done = canopy.shiftToMean() && done;
>         MeanShiftCanopy.mergeCanopy(canopy, migratedCanopies);
>       }
> -      iterationCenters.add(centers);
> +      //iterationCenters.add(centers);
>       canopies = migratedCanopies;
>     }
>   }
>
> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java (original)
> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java Fri Sep 18 10:01:12 2009
> @@ -36,10 +36,9 @@
>  public class InputMapper extends MapReduceBase implements
>     Mapper<LongWritable, Text, Text, Vector> {
>
> -  private static final Pattern SPACE = java.util.regex.Pattern.compile(" ");
> +  private static final Pattern SPACE = Pattern.compile(" ");
>
> -  protected Class<? extends Vector> outputClass;
> -  protected Constructor<?> constructor;
> +  private Constructor<?> constructor;
>
>   @Override
>   public void map(LongWritable key, Text values,
> @@ -70,7 +69,7 @@
>
>   @Override
>   public void configure(JobConf job) {
> -    outputClass = (Class<? extends Vector>) job.getOutputValueClass();
> +    Class<? extends Vector> outputClass = (Class<? extends Vector>) job.getOutputValueClass();
>     try {
>       constructor = outputClass.getConstructor(int.class);
>     } catch (NoSuchMethodException e) {
>
> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java (original)
> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java Fri Sep 18 10:01:12 2009
> @@ -82,7 +82,7 @@
>
>     DataSet dataset = DataSet.getDataSet();
>
> -    for (int condInd = 0; condInd < getNbConditions(); condInd++) {
> +    for (int condInd = 0; condInd < nbConditions; condInd++) {
>       int attrInd = attributeIndex(condInd);
>
>       setW(condInd, rng.nextDouble());
>
> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java (original)
> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java Fri Sep 18 10:01:12 2009
> @@ -27,6 +27,7 @@
>  import org.apache.hadoop.mapred.Reporter;
>  import org.apache.hadoop.mapred.TextInputFormat;
>  import org.apache.mahout.common.StringUtils;
> +import org.apache.mahout.common.RandomUtils;
>  import org.uncommons.maths.random.MersenneTwisterRNG;
>
>  import java.io.IOException;
> @@ -62,7 +63,7 @@
>   }
>
>   public DatasetSplit(double threshold) {
> -    this(new MersenneTwisterRNG().getSeed(), threshold);
> +    this(((MersenneTwisterRNG) RandomUtils.getRandom()).getSeed(), threshold);
>   }
>
>   public DatasetSplit(JobConf conf) {
> @@ -144,7 +145,7 @@
>
>       DatasetSplit split = new DatasetSplit(conf);
>
> -      rng = new MersenneTwisterRNG(split.getSeed());
> +      rng = RandomUtils.getRandom();
>       threshold = split.getThreshold();
>       training = split.isTraining();
>     }
>
> Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java (original)
> +++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java Fri Sep 18 10:01:12 2009
> @@ -133,7 +133,7 @@
>   }
>
>   static String printRule(CDRule rule) {
> -    StringBuffer buffer = new StringBuffer();
> +    StringBuilder buffer = new StringBuilder();
>
>     for (int index = 0; index < rule.getNbConditions(); index++) {
>       buffer.append(rule.getO(index) ? 1 : 0);
>
> Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java (original)
> +++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java Fri Sep 18 10:01:12 2009
> @@ -31,17 +31,12 @@
>
>  public class CDMapperTest extends TestCase {
>
> -  DataLine dl;
> -
> -  Rule rule;
> -
> -  final CDFitness TP = new CDFitness(1, 0, 0, 0);
> -
> -  final CDFitness FP = new CDFitness(0, 1, 0, 0);
> -
> -  final CDFitness TN = new CDFitness(0, 0, 1, 0);
> -
> -  final CDFitness FN = new CDFitness(0, 0, 0, 1);
> +  private DataLine dl;
> +  private Rule rule;
> +  private final CDFitness TP = new CDFitness(1, 0, 0, 0);
> +  private final CDFitness FP = new CDFitness(0, 1, 0, 0);
> +  private final CDFitness TN = new CDFitness(0, 0, 1, 0);
> +  private final CDFitness FN = new CDFitness(0, 0, 0, 1);
>
>   @Override
>   protected void setUp() throws Exception {
>
> Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java?rev=816569&r1=816568&r2=816569&view=diff
> ==============================================================================
> --- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java (original)
> +++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java Fri Sep 18 10:01:12 2009
> @@ -82,7 +82,7 @@
>         int nbvalues = rng.nextInt(50) + 1;
>         descriptions[index] = new Object[nbvalues];
>         for (int vindex = 0; vindex < nbvalues; vindex++) {
> -          descriptions[index][vindex] = "val_" + index + "_" + vindex;
> +          descriptions[index][vindex] = "val_" + index + '_' + vindex;
>         }
>       }
>     }
> @@ -109,7 +109,7 @@
>   }
>
>   private String randomLine(Descriptors descriptors, Object[][] descriptions) {
> -    StringBuffer buffer = new StringBuffer();
> +    StringBuilder buffer = new StringBuilder();
>
>     for (int index = 0; index < descriptors.size(); index++) {
>       if (descriptors.isNumerical(index)) {
> @@ -138,7 +138,7 @@
>     return buffer.toString();
>   }
>
> -  private int nbNonIgnored(Descriptors descriptors) {
> +  private static int nbNonIgnored(Descriptors descriptors) {
>     int nbattrs = 0;
>     for (int index = 0; index < descriptors.size(); index++) {
>       if (!descriptors.isIgnored(index))
> @@ -191,9 +191,9 @@
>         double max = (Double) descriptions[index][1];
>         Range range = DescriptionUtils.extractNumericalRange(description);
>
> -        assertTrue("bad min value for attribute (" + index + ")",
> +        assertTrue("bad min value for attribute (" + index + ')',
>             min <= range.min);
> -        assertTrue("bad max value for attribute (" + index + ")",
> +        assertTrue("bad max value for attribute (" + index + ')',
>             max >= range.max);
>       } else if (descriptors.isNominal(index)) {
>         // categorical attribute
>
>
>

Re: svn commit: r816569 - in /lucene/mahout/trunk/examples/src: main/java/org/apache/mahout/classifier/bayes/ main/java/org/apache/mahout/clustering/meanshift/ main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/ main/java/org/apache/mahou

Posted by deneche abdelhakim <ad...@gmail.com>.
Yes, it's meant to be run twice: once selecting the training samples
and once selecting the testing samples. It assumes that the RNG, given
the same seed, will return the exact same numbers both times.

On Mon, Sep 21, 2009 at 1:54 PM, Sean Owen <sr...@gmail.com> wrote:
> I rolled it back. So the reader depends on the seed and the exact
> behavior of the RNG? I have no doubt it is needed if intended, just
> checking that it's intended.
>
> (I also fixed build-reuters.sh)
>
> On Sun, Sep 20, 2009 at 1:55 PM, Sean Owen <sr...@gmail.com> wrote:
>> Sorry I will investigate when back at my workstation. I remember
>> something like this but thought I preserved the seed. Guess I missed
>> something. My bad, I try not to ever change semantics.
>

Re: svn commit: r816569 - in /lucene/mahout/trunk/examples/src: main/java/org/apache/mahout/classifier/bayes/ main/java/org/apache/mahout/clustering/meanshift/ main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/ main/java/org/apache/mahou

Posted by Sean Owen <sr...@gmail.com>.
I rolled it back. So the reader depends on the seed and the exact
behavior of the RNG? I have no doubt it is needed if intended, just
checking that it's intended.

(I also fixed build-reuters.sh)

On Sun, Sep 20, 2009 at 1:55 PM, Sean Owen <sr...@gmail.com> wrote:
> Sorry I will investigate when back at my workstation. I remember
> something like this but thought I preserved the seed. Guess I missed
> something. My bad, I try not to ever change semantics.

Re: svn commit: r816569 - in /lucene/mahout/trunk/examples/src: main/java/org/apache/mahout/classifier/bayes/ main/java/org/apache/mahout/clustering/meanshift/ main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/ main/java/org/apache/mahou

Posted by Sean Owen <sr...@gmail.com>.
Sorry I will investigate when back at my workstation. I remember
something like this but thought I preserved the seed. Guess I missed
something. My bad, I try not to ever change semantics.

On Sunday, September 20, 2009, deneche abdelhakim <ad...@apache.org> wrote:
> The change in "examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java"
> could lead to a bug. The problem is in the following modification:
>
> -      rng = new MersenneTwisterRNG(split.getSeed());
> +      rng = RandomUtils.getRandom();
>
> rng is supposed to use the seed given by split.
>
> I tried to correct this line my self, but I'm having problems
> committing the change. I'm getting the following message from svn:
>
> svn: Commit failed (details follow):
> svn: Server sent unexpected return value (403 Forbidden) in response
> to MKACTIVITY request for
> '/repos/asf/!svn/act/627fc1d8-98ad-4046-ae77-41962e731928'
>
> although I successfully committed my changes to the site.
>
>
> On Fri, Sep 18, 2009 at 11:01 AM,  <sr...@apache.org> wrote:
>> Author: srowen
>> Date: Fri Sep 18 10:01:12 2009
>> New Revision: 816569
>>
>> URL: http://svn.apache.org/viewvc?rev=816569&view=rev
>> Log:
>> Bit of cleanup and, I think, a fix to the WikipediaDatasetCreatorMapper?
>>
>> Modified:
>>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
>>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
>>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
>>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java
>>    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java
>>    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java
>>    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java
>>    lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
>>
>> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
>> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Fri Sep 18 10:01:12 2009
>> @@ -42,12 +42,15 @@
>>
>>  public class WikipediaDatasetCreatorMapper extends MapReduceBase implements
>>     Mapper<LongWritable, Text, Text, Text> {
>> -  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
>>
>> -  private static Set<String> inputCategories = null;
>> -  private static boolean exactMatchOnly = false;
>> -  private static Analyzer analyzer;
>> +  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
>>   private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
>> +  private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
>> +  private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
>> +
>> +  private Set<String> inputCategories = null;
>> +  private boolean exactMatchOnly = false;
>> +  private Analyzer analyzer;
>>
>>   @Override
>>   public void map(LongWritable key, Text value,
>> @@ -59,7 +62,7 @@
>>     String catMatch = findMatchingCategory(document);
>>
>>     if(!catMatch.equals("Unknown")){
>> -      document = StringEscapeUtils.unescapeHtml(document.replaceFirst("<text xml:space=\"preserve\">", "").replaceAll("</text>", ""));
>> +      document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
>>       TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
>>       Token token = new Token();
>>       while((token = stream.next(token)) != null){
>> @@ -69,18 +72,19 @@
>>     }
>>   }
>>
>> -  public static String findMatchingCategory(String document){
>> +  private String findMatchingCategory(String document){
>>     int startIndex = 0;
>>     int categoryIndex;
>> -    String match = null; // TODO this is never updated?
>>     while((categoryIndex = document.indexOf("[[Category:", startIndex))!=-1)
>>     {
>>       categoryIndex+=11;
>>       int endIndex = document.indexOf("]]", categoryIndex);
>> -      if(endIndex>=document.length() || endIndex < 0) break;
>> +      if (endIndex >= document.length() || endIndex < 0) {
>> +        break;
>> +      }
>>       String category = document.substring(categoryIndex, endIndex).toLowerCase().trim();
>>       //categories.add(category.toLowerCase());
>> -      if (exactMatchOnly == true && inputCategories.contains(category)){
>> +      if (exactMatchOnly && inputCategories.contains(category)){
>>         return category;
>>       } else if (exactMatchOnly == false){
>>         for (String inputCategory : inputCategories) {
>> @@ -91,17 +95,12 @@
>>       }
>>       startIndex = endIndex;
>>     }
>> -    if (match == null){
>> -      match = "Unknown";
>> -    }
>> -
>> -    return match;
>> +    return "Unknown";
>>   }
>>
>>   @Override
>>   public void configure(JobConf job) {
>>     try {
>> -      //Is this thread-safe?
>>       if (inputCategories == null){
>>         Set<String> newCategories = new HashSet<String>();
>>
>>
>> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java (original)
>> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/meanshift/DisplayMeanShift.java Fri Sep 18 10:01:12 2009
>> @@ -39,7 +39,8 @@
>>
>>   private static List<MeanShiftCanopy> canopies = new ArrayList<MeanShiftCanopy>();
>>
>> -  private static final List<List<Vector>> iterationCenters = new ArrayList<List<Vector>>();
>> +  // TODO this is never queried?
>> +  //private static final List<List<Vector>> iterationCenters = new ArrayList<List<Vector>>();
>>
>>   @Override
>>   public void paint(Graphics g) {
>> @@ -87,7 +88,7 @@
>>         done = canopy.shiftToMean() && done;
>>         MeanShiftCanopy.mergeCanopy(canopy, migratedCanopies);
>>       }
>> -      iterationCenters.add(centers);
>> +      //iterationCenters.add(centers);
>>       canopies = migratedCanopies;
>>     }
>>   }
>>
>> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java (original)
>> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/clustering/syntheticcontrol/canopy/InputMapper.java Fri Sep 18 10:01:12 2009
>> @@ -36,10 +36,9 @@
>>  public class InputMapper extends MapReduceBase implements
>>     Mapper<LongWritable, Text, Text, Vector> {
>>
>> -  private static final Pattern SPACE = java.util.regex.Pattern.compile(" ");
>> +  private static final Pattern SPACE = Pattern.compile(" ");
>>
>> -  protected Class<? extends Vector> outputClass;
>> -  protected Constructor<?> constructor;
>> +  private Constructor<?> constructor;
>>
>>   @Override
>>   public void map(LongWritable key, Text values,
>> @@ -70,7 +69,7 @@
>>
>>   @Override
>>   public void configure(JobConf job) {
>> -    outputClass = (Class<? extends Vector>) job.getOutputValueClass();
>> +    Class<? extends Vector> outputClass = (Class<? extends Vector>) job.getOutputValueClass();
>>     try {
>>       constructor = outputClass.getConstructor(int.class);
>>     } catch (NoSuchMethodException e) {
>>
>> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java (original)
>> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/CDRule.java Fri Sep 18 10:01:12 2009
>> @@ -82,7 +82,7 @@
>>
>>     DataSet dataset = DataSet.getDataSet();
>>
>> -    for (int condInd = 0; condInd < getNbConditions(); condInd++) {
>> +    for (int condInd = 0; condInd < nbConditions; condInd++) {
>>       int attrInd = attributeIndex(condInd);
>>
>>       setW(condInd, rng.nextDouble());
>>
>> Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java (original)
>> +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/ga/watchmaker/cd/hadoop/DatasetSplit.java Fri Sep 18 10:01:12 2009
>> @@ -27,6 +27,7 @@
>>  import org.apache.hadoop.mapred.Reporter;
>>  import org.apache.hadoop.mapred.TextInputFormat;
>>  import org.apache.mahout.common.StringUtils;
>> +import org.apache.mahout.common.RandomUtils;
>>  import org.uncommons.maths.random.MersenneTwisterRNG;
>>
>>  import java.io.IOException;
>> @@ -62,7 +63,7 @@
>>   }
>>
>>   public DatasetSplit(double threshold) {
>> -    this(new MersenneTwisterRNG().getSeed(), threshold);
>> +    this(((MersenneTwisterRNG) RandomUtils.getRandom()).getSeed(), threshold);
>>   }
>>
>>   public DatasetSplit(JobConf conf) {
>> @@ -144,7 +145,7 @@
>>
>>       DatasetSplit split = new DatasetSplit(conf);
>>
>> -      rng = new MersenneTwisterRNG(split.getSeed());
>> +      rng = RandomUtils.getRandom();
>>       threshold = split.getThreshold();
>>       training = split.isTraining();
>>     }
>>
>> Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java (original)
>> +++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/CDCrossoverTest.java Fri Sep 18 10:01:12 2009
>> @@ -133,7 +133,7 @@
>>   }
>>
>>   static String printRule(CDRule rule) {
>> -    StringBuffer buffer = new StringBuffer();
>> +    StringBuilder buffer = new StringBuilder();
>>
>>     for (int index = 0; index < rule.getNbConditions(); index++) {
>>       buffer.append(rule.getO(index) ? 1 : 0);
>>
>> Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java?rev=816569&r1=816568&r2=816569&view=diff
>> ==============================================================================
>> --- lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java (original)
>> +++ lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/hadoop/CDMapperTest.java Fri Sep 18 10:01:12 2009
>> @@ -31,17 +31,12 @@
>>
>>  public class CDMapperTest extends TestCase {
>>
>> -  DataLine dl;
>> -
>> -  Rule rule;
>> -
>> -  final CDFitness TP = new CDFitness(1, 0, 0, 0);
>> -
>> -  final CDFitness FP = new CDFitness(0, 1, 0, 0);
>> -
>> -  final CDFitness TN = new CDFitness(0, 0, 1, 0);
>> -
>> -  final CDFitness FN = new CDFitness(0, 0, 0, 1);
>> +  private DataLine dl;
>> +  private Rule rule;
>> +  private final CDFitness TP = new CDFitness(1, 0, 0, 0);
>> +  private final CDFitness FP = new CDFitness(0, 1, 0, 0);
>> +  private final CDFitness TN = new CDFitness(0, 0, 1, 0);
>> +  private final CDFitness FN = new CDFitness(0, 0, 0, 1);
>>
>>   @Override
>>   protected void setUp() throws Exception {
>>
>> Modified: lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/ga/watchmaker/cd/tool/CDInfosToolTest.java
>> URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/test/java/org/apache/mahout/g