You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2012/06/05 14:11:22 UTC

svn commit: r1346364 - in /mahout/trunk: ./ bin/ core/src/main/java/org/apache/mahout/vectorizer/ examples/src/main/java/org/apache/mahout/classifier/ integration/ integration/src/main/java/org/apache/mahout/text/

Author: robinanil
Date: Tue Jun  5 12:11:21 2012
New Revision: 1346364

URL: http://svn.apache.org/viewvc?rev=1346364&view=rev
Log:
MAHOUT-1027 upgrade to latest lucene, move solr dependency to integration

Modified:
    mahout/trunk/bin/mahout
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
    mahout/trunk/integration/pom.xml
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
    mahout/trunk/pom.xml

Modified: mahout/trunk/bin/mahout
URL: http://svn.apache.org/viewvc/mahout/trunk/bin/mahout?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/bin/mahout (original)
+++ mahout/trunk/bin/mahout Tue Jun  5 12:11:21 2012
@@ -95,7 +95,7 @@ if [ "$JAVA_HOME" = "" ]; then
 fi
 
 JAVA=$JAVA_HOME/bin/java
-JAVA_HEAP_MAX=-Xmx4g 
+JAVA_HEAP_MAX=-Xmx4096m 
 
 # check envvars which might override default args
 if [ "$MAHOUT_HEAPSIZE" != "" ]; then

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java Tue Jun  5 12:11:21 2012
@@ -31,7 +31,7 @@ import java.io.Reader;
  */
 public final class DefaultAnalyzer extends Analyzer {
 
-  private final StandardAnalyzer stdAnalyzer = new StandardAnalyzer(Version.LUCENE_31);
+  private final StandardAnalyzer stdAnalyzer = new StandardAnalyzer(Version.LUCENE_36);
 
   @Override
   public TokenStream tokenStream(String fieldName, Reader reader) {

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java Tue Jun  5 12:11:21 2012
@@ -34,7 +34,6 @@ import org.slf4j.LoggerFactory;
  * Converts a given set of sequence files into SparseVectors
  */
 public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
-
   private static final Logger log = LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);
 
   public static void main(String[] args) throws Exception {

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java Tue Jun  5 12:11:21 2012
@@ -46,7 +46,8 @@ import java.util.Locale;
 import java.util.Random;
 
 public final class NewsgroupHelper {
-
+  private static final Version LUCENE_VERSION = Version.LUCENE_36;
+  
   private static final SimpleDateFormat[] DATE_FORMATS = {
           new SimpleDateFormat("", Locale.ENGLISH),
           new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
@@ -59,7 +60,7 @@ public final class NewsgroupHelper {
   private static final long WEEK = 7 * 24 * 3600;
   
   private final Random rand = RandomUtils.getRandom();  
-  private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
+  private final Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
   private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
   private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
   

Modified: mahout/trunk/integration/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/pom.xml?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/integration/pom.xml (original)
+++ mahout/trunk/integration/pom.xml Tue Jun  5 12:11:21 2012
@@ -118,11 +118,6 @@
       <scope>test</scope>
     </dependency>
 
-    <dependency>
-      <groupId>${project.groupId}</groupId>
-      <artifactId>mahout-core</artifactId>
-    </dependency>
-
       <!-- 3rd party -->
 
     <dependency>
@@ -138,6 +133,7 @@
     <dependency>
       <groupId>org.apache.solr</groupId>
       <artifactId>solr-commons-csv</artifactId>
+      <version>3.5.0</version>
     </dependency>
 
     <dependency>

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java Tue Jun  5 12:11:21 2012
@@ -43,11 +43,12 @@ import org.apache.lucene.util.Version;
  * stop words, excluding non-alpha-numeric tokens, and porter stemming.
  */
 public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
+  private static final Version LUCENE_VERSION = Version.LUCENE_36;
   
   // extended set of stop words composed of common mail terms like "hi",
   // HTML tags, and Java keywords as many of the messages in the archives
   // are subversion check-in notifications
-  private static final Set<?> STOP_WORDS = new CharArraySet(Version.LUCENE_31, Arrays.asList(
+  private static final Set<?> STOP_WORDS = new CharArraySet(LUCENE_VERSION, Arrays.asList(
     "3d","7bit","a0","about","above","abstract","across","additional","after",
     "afterwards","again","against","align","all","almost","alone","along",
     "already","also","although","always","am","among","amongst","amoungst",
@@ -104,23 +105,24 @@ public final class MailArchivesClusterin
 
   // Regex used to exclude non-alpha-numeric tokens
   private static final Pattern alphaNumeric = Pattern.compile("^[a-z][a-z0-9_]+$");
+  private final static Matcher matcher = alphaNumeric.matcher("");
 
   public MailArchivesClusteringAnalyzer() {
-    super(Version.LUCENE_31, STOP_WORDS);
+    super(LUCENE_VERSION, STOP_WORDS);
   }
 
   public MailArchivesClusteringAnalyzer(Set<?> stopSet) {
-    super(Version.LUCENE_31, stopSet);
+    super(LUCENE_VERSION, stopSet);
   }
   
   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_31, reader);
-    TokenStream result = new StandardFilter(Version.LUCENE_31, tokenizer);
-    result = new LowerCaseFilter(Version.LUCENE_31, result);
+    Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
+    TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
+    result = new LowerCaseFilter(LUCENE_VERSION, result);
     result = new ASCIIFoldingFilter(result);
     result = new AlphaNumericMaxLengthFilter(result);
-    result = new StopFilter(Version.LUCENE_31, result, stopwords);
+    result = new StopFilter(LUCENE_VERSION, result, stopwords);
     result = new PorterStemFilter(result);
     return new TokenStreamComponents(tokenizer, result);
   }
@@ -131,12 +133,10 @@ public final class MailArchivesClusterin
   static class AlphaNumericMaxLengthFilter extends TokenFilter {
     private final CharTermAttribute termAtt;
     private final char[] output = new char[28];
-    private final Matcher matcher;
 
     AlphaNumericMaxLengthFilter(TokenStream in) {
       super(in);
       termAtt = addAttribute(CharTermAttribute.class);
-      matcher = alphaNumeric.matcher("foo");
     }
 
     @Override

Modified: mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/pom.xml (original)
+++ mahout/trunk/pom.xml Tue Jun  5 12:11:21 2012
@@ -98,7 +98,7 @@
     <maven.clover.multiproject>true</maven.clover.multiproject>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <hadoop.version>0.20.204.0</hadoop.version>
-    <lucene.version>3.5.0</lucene.version>
+    <lucene.version>3.6.0</lucene.version>
     <mongodb.version>2.5</mongodb.version>
     <cassandra.version>0.8.1</cassandra.version>
   </properties>
@@ -176,11 +176,6 @@
         <artifactId>lucene-benchmark</artifactId>
         <version>${lucene.version}</version>
       </dependency>
-      <dependency>
-        <groupId>org.apache.solr</groupId>
-        <artifactId>solr-commons-csv</artifactId>
-        <version>${lucene.version}</version>
-      </dependency>
       
       <dependency>
         <groupId>junit</groupId>