You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by dr...@apache.org on 2010/05/11 04:40:15 UTC

svn commit: r942986 - in /lucene/mahout/trunk: ./ examples/src/main/java/org/apache/mahout/analysis/ utils/src/main/java/org/apache/mahout/text/ utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/ utils/src/main/java/org/apache/mahout/uti...

Author: drew
Date: Tue May 11 02:40:14 2010
New Revision: 942986

URL: http://svn.apache.org/viewvc?rev=942986&view=rev
Log:
Applied MAHOUT-388: Upgrade Lucene (to 3.0.1)

Added:
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java
Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
    lucene/mahout/trunk/pom.xml
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
    lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
    lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java Tue May 11 02:40:14 2010
@@ -29,11 +29,11 @@ import org.apache.lucene.analysis.standa
 import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
 
 public class WikipediaAnalyzer extends Analyzer {
-  
+ 
   private final CharArraySet stopSet;
   
   public WikipediaAnalyzer() {
-    stopSet = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+    stopSet = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS_SET.toArray(new String[0]));
   }
   
   public WikipediaAnalyzer(CharArraySet stopSet) {

Modified: lucene/mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/pom.xml?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/pom.xml (original)
+++ lucene/mahout/trunk/pom.xml Tue May 11 02:40:14 2010
@@ -81,7 +81,7 @@
     <properties>
         <junit.version>4.7</junit.version>
         <tagBase>https://svn.apache.org/repos/asf/lucene/mahout</tagBase>
-        <lucene.version>2.9.1</lucene.version>
+        <lucene.version>3.0.1</lucene.version>
         <hadoop.version>0.20.2</hadoop.version>
         <maven.clover.multiproject>true</maven.clover.multiproject>
         <collections.version>1.0</collections.version>

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java?rev=942986&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java Tue May 11 02:40:14 2010
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+
+/** A {@link StandardAnalyzer} subclass whose public no-argument constructor passes
+ *  {@code Version.LUCENE_CURRENT} to the superclass, suppressing the deprecation warning.
+ *  Serves as the default analyzer where one is instantiated by class name via a no-arg constructor.
+ */
+public class DefaultAnalyzer extends StandardAnalyzer {
+	@SuppressWarnings("deprecation")
+	public DefaultAnalyzer() {
+		super(Version.LUCENE_CURRENT);
+	}
+}

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Tue May 11 02:40:14 2010
@@ -27,7 +27,6 @@ import org.apache.commons.cli2.builder.G
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.hadoop.fs.Path;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
@@ -168,7 +167,7 @@ public final class SparseVectorsFromSequ
       }
       log.info("Pass1 reduce tasks: {}", reduceTasks);
       
-      Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+      Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
       if (cmdLine.hasOption(analyzerNameOpt)) {
         String className = cmdLine.getValue(analyzerNameOpt).toString();
         analyzerClass = (Class<? extends Analyzer>) Class.forName(className);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Tue May 11 02:40:14 2010
@@ -42,9 +42,9 @@ import org.apache.hadoop.mapred.lib.Iden
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.text.DefaultAnalyzer;
 import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
 import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 import org.slf4j.Logger;
@@ -180,7 +180,7 @@ public class CollocDriver extends Config
       if (cmdLine.hasOption(preprocessOpt)) {
         log.info("Input will be preprocessed");
         
-        Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+        Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
         if (cmdLine.hasOption(analyzerNameOpt)) {
           String className = cmdLine.getValue(analyzerNameOpt).toString();
           analyzerClass = Class.forName(className).asSubclass(Analyzer.class);

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Tue May 11 02:40:14 2010
@@ -271,7 +271,7 @@ public class ClusterLabels {
 
     OpenBitSet bitset = new OpenBitSet(numDocs);
 
-    FieldSelector idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
+    FieldSelector idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet());
 
     for (int i = 0; i < numDocs; i++) {
       String id = null;

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Tue May 11 02:40:14 2010
@@ -70,7 +70,7 @@ public class LuceneIterable implements I
     if (normPower != NO_NORMALIZING && normPower < 0) {
       throw new IllegalArgumentException("normPower must either be -1 or >= 0");
     }
-    idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
+    idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet());
     this.indexReader = reader;
     this.idField = idField;
     this.field = field;

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Tue May 11 02:40:14 2010
@@ -28,9 +28,9 @@ import org.apache.hadoop.mapred.OutputCo
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.text.DefaultAnalyzer;
 import org.apache.mahout.utils.vectors.text.DocumentProcessor;
 
 /**
@@ -60,7 +60,7 @@ public class SequenceFileTokenizerMapper
     try {
       ClassLoader ccl = Thread.currentThread().getContextClassLoader();
       Class<?> cl = ccl
-          .loadClass(job.get(DocumentProcessor.ANALYZER_CLASS, StandardAnalyzer.class.getName()));
+          .loadClass(job.get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()));
       analyzer = (Analyzer) cl.newInstance();
     } catch (ClassNotFoundException e) {
       throw new IllegalStateException(e);

Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Tue May 11 02:40:14 2010
@@ -27,9 +27,9 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.mahout.common.MahoutTestCase;
 import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.text.DefaultAnalyzer;
 import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
 
 /**
@@ -113,7 +113,7 @@ public class DictionaryVectorizerTest ex
         getRandomDocument()));
     }
     writer.close();
-    Class<? extends Analyzer> analyzer = StandardAnalyzer.class;
+    Class<? extends Analyzer> analyzer = DefaultAnalyzer.class;
     DocumentProcessor.tokenizeDocuments(path, analyzer,
     getTestTempDirPath("output/tokenized-documents"));
     DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),