You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by dr...@apache.org on 2010/05/11 04:40:15 UTC
svn commit: r942986 - in /lucene/mahout/trunk: ./
examples/src/main/java/org/apache/mahout/analysis/
utils/src/main/java/org/apache/mahout/text/
utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/
utils/src/main/java/org/apache/mahout/uti...
Author: drew
Date: Tue May 11 02:40:14 2010
New Revision: 942986
URL: http://svn.apache.org/viewvc?rev=942986&view=rev
Log:
Applied MAHOUT-388: Upgrade Lucene (to 3.0.1)
Added:
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
lucene/mahout/trunk/pom.xml
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/analysis/WikipediaAnalyzer.java Tue May 11 02:40:14 2010
@@ -29,11 +29,11 @@ import org.apache.lucene.analysis.standa
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
public class WikipediaAnalyzer extends Analyzer {
-
+
private final CharArraySet stopSet;
public WikipediaAnalyzer() {
- stopSet = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
+ stopSet = (CharArraySet) StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS_SET.toArray(new String[0]));
}
public WikipediaAnalyzer(CharArraySet stopSet) {
Modified: lucene/mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/pom.xml?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/pom.xml (original)
+++ lucene/mahout/trunk/pom.xml Tue May 11 02:40:14 2010
@@ -81,7 +81,7 @@
<properties>
<junit.version>4.7</junit.version>
<tagBase>https://svn.apache.org/repos/asf/lucene/mahout</tagBase>
- <lucene.version>2.9.1</lucene.version>
+ <lucene.version>3.0.1</lucene.version>
<hadoop.version>0.20.2</hadoop.version>
<maven.clover.multiproject>true</maven.clover.multiproject>
<collections.version>1.0</collections.version>
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java?rev=942986&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/DefaultAnalyzer.java Tue May 11 02:40:14 2010
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.text;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.util.Version;
+
+/** A subclass of the Lucene {@link StandardAnalyzer} that adds a public no-argument constructor.
+ * Used as the default analyzer in the many places where an analyzer is created from its class
+ * name by invoking a no-arg constructor; always passes {@code Version.LUCENE_CURRENT} to super.
+ */
+public class DefaultAnalyzer extends StandardAnalyzer {
+ @SuppressWarnings("deprecation")
+ public DefaultAnalyzer() {
+ super(Version.LUCENE_CURRENT); // NOTE(review): presumably the deprecated symbol suppressed above -- confirm
+ }
+}
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SparseVectorsFromSequenceFiles.java Tue May 11 02:40:14 2010
@@ -27,7 +27,6 @@ import org.apache.commons.cli2.builder.G
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.utils.nlp.collocations.llr.LLRReducer;
@@ -168,7 +167,7 @@ public final class SparseVectorsFromSequ
}
log.info("Pass1 reduce tasks: {}", reduceTasks);
- Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+ Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
String className = cmdLine.getValue(analyzerNameOpt).toString();
analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/CollocDriver.java Tue May 11 02:40:14 2010
@@ -42,9 +42,9 @@ import org.apache.hadoop.mapred.lib.Iden
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
+import org.apache.mahout.text.DefaultAnalyzer;
import org.apache.mahout.utils.vectors.text.DictionaryVectorizer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;
import org.slf4j.Logger;
@@ -180,7 +180,7 @@ public class CollocDriver extends Config
if (cmdLine.hasOption(preprocessOpt)) {
log.info("Input will be preprocessed");
- Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
+ Class<? extends Analyzer> analyzerClass = DefaultAnalyzer.class;
if (cmdLine.hasOption(analyzerNameOpt)) {
String className = cmdLine.getValue(analyzerNameOpt).toString();
analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Tue May 11 02:40:14 2010
@@ -271,7 +271,7 @@ public class ClusterLabels {
OpenBitSet bitset = new OpenBitSet(numDocs);
- FieldSelector idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
+ FieldSelector idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet());
for (int i = 0; i < numDocs; i++) {
String id = null;
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Tue May 11 02:40:14 2010
@@ -70,7 +70,7 @@ public class LuceneIterable implements I
if (normPower != NO_NORMALIZING && normPower < 0) {
throw new IllegalArgumentException("normPower must either be -1 or >= 0");
}
- idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.emptySet());
+ idFieldSelector = new SetBasedFieldSelector(Collections.singleton(idField), Collections.<String>emptySet());
this.indexReader = reader;
this.idField = idField;
this.field = field;
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Tue May 11 02:40:14 2010
@@ -28,9 +28,9 @@ import org.apache.hadoop.mapred.OutputCo
import org.apache.hadoop.mapred.Reporter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.text.DefaultAnalyzer;
import org.apache.mahout.utils.vectors.text.DocumentProcessor;
/**
@@ -60,7 +60,7 @@ public class SequenceFileTokenizerMapper
try {
ClassLoader ccl = Thread.currentThread().getContextClassLoader();
Class<?> cl = ccl
- .loadClass(job.get(DocumentProcessor.ANALYZER_CLASS, StandardAnalyzer.class.getName()));
+ .loadClass(job.get(DocumentProcessor.ANALYZER_CLASS, DefaultAnalyzer.class.getName()));
analyzer = (Analyzer) cl.newInstance();
} catch (ClassNotFoundException e) {
throw new IllegalStateException(e);
Modified: lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java?rev=942986&r1=942985&r2=942986&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java (original)
+++ lucene/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizerTest.java Tue May 11 02:40:14 2010
@@ -27,9 +27,9 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.RandomUtils;
+import org.apache.mahout.text.DefaultAnalyzer;
import org.apache.mahout.utils.vectors.tfidf.TFIDFConverter;
/**
@@ -113,7 +113,7 @@ public class DictionaryVectorizerTest ex
getRandomDocument()));
}
writer.close();
- Class<? extends Analyzer> analyzer = StandardAnalyzer.class;
+ Class<? extends Analyzer> analyzer = DefaultAnalyzer.class;
DocumentProcessor.tokenizeDocuments(path, analyzer,
getTestTempDirPath("output/tokenized-documents"));
DictionaryVectorizer.createTermFrequencyVectors(getTestTempDirPath("output/tokenized-documents"),