You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2012/06/05 14:11:22 UTC
svn commit: r1346364 - in /mahout/trunk: ./ bin/
core/src/main/java/org/apache/mahout/vectorizer/
examples/src/main/java/org/apache/mahout/classifier/ integration/
integration/src/main/java/org/apache/mahout/text/
Author: robinanil
Date: Tue Jun 5 12:11:21 2012
New Revision: 1346364
URL: http://svn.apache.org/viewvc?rev=1346364&view=rev
Log:
MAHOUT-1027 upgrade to latest lucene, move solr dependency to integration
Modified:
mahout/trunk/bin/mahout
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
mahout/trunk/integration/pom.xml
mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
mahout/trunk/pom.xml
Modified: mahout/trunk/bin/mahout
URL: http://svn.apache.org/viewvc/mahout/trunk/bin/mahout?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/bin/mahout (original)
+++ mahout/trunk/bin/mahout Tue Jun 5 12:11:21 2012
@@ -95,7 +95,7 @@ if [ "$JAVA_HOME" = "" ]; then
fi
JAVA=$JAVA_HOME/bin/java
-JAVA_HEAP_MAX=-Xmx4g
+JAVA_HEAP_MAX=-Xmx4096m
# check envvars which might override default args
if [ "$MAHOUT_HEAPSIZE" != "" ]; then
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DefaultAnalyzer.java Tue Jun 5 12:11:21 2012
@@ -31,7 +31,7 @@ import java.io.Reader;
*/
public final class DefaultAnalyzer extends Analyzer {
- private final StandardAnalyzer stdAnalyzer = new StandardAnalyzer(Version.LUCENE_31);
+ private final StandardAnalyzer stdAnalyzer = new StandardAnalyzer(Version.LUCENE_36);
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.java Tue Jun 5 12:11:21 2012
@@ -34,7 +34,6 @@ import org.slf4j.LoggerFactory;
* Converts a given set of sequence files into SparseVectors
*/
public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
-
private static final Logger log = LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);
public static void main(String[] args) throws Exception {
Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/NewsgroupHelper.java Tue Jun 5 12:11:21 2012
@@ -46,7 +46,8 @@ import java.util.Locale;
import java.util.Random;
public final class NewsgroupHelper {
-
+ private static final Version LUCENE_VERSION = Version.LUCENE_36;
+
private static final SimpleDateFormat[] DATE_FORMATS = {
new SimpleDateFormat("", Locale.ENGLISH),
new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
@@ -59,7 +60,7 @@ public final class NewsgroupHelper {
private static final long WEEK = 7 * 24 * 3600;
private final Random rand = RandomUtils.getRandom();
- private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
+ private final Analyzer analyzer = new StandardAnalyzer(LUCENE_VERSION);
private final FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
Modified: mahout/trunk/integration/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/pom.xml?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/integration/pom.xml (original)
+++ mahout/trunk/integration/pom.xml Tue Jun 5 12:11:21 2012
@@ -118,11 +118,6 @@
<scope>test</scope>
</dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>mahout-core</artifactId>
- </dependency>
-
<!-- 3rd party -->
<dependency>
@@ -138,6 +133,7 @@
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-commons-csv</artifactId>
+ <version>3.5.0</version>
</dependency>
<dependency>
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/MailArchivesClusteringAnalyzer.java Tue Jun 5 12:11:21 2012
@@ -43,11 +43,12 @@ import org.apache.lucene.util.Version;
* stop words, excluding non-alpha-numeric tokens, and porter stemming.
*/
public final class MailArchivesClusteringAnalyzer extends StopwordAnalyzerBase {
+ private static final Version LUCENE_VERSION = Version.LUCENE_36;
// extended set of stop words composed of common mail terms like "hi",
// HTML tags, and Java keywords as many of the messages in the archives
// are subversion check-in notifications
- private static final Set<?> STOP_WORDS = new CharArraySet(Version.LUCENE_31, Arrays.asList(
+ private static final Set<?> STOP_WORDS = new CharArraySet(LUCENE_VERSION, Arrays.asList(
"3d","7bit","a0","about","above","abstract","across","additional","after",
"afterwards","again","against","align","all","almost","alone","along",
"already","also","although","always","am","among","amongst","amoungst",
@@ -104,23 +105,24 @@ public final class MailArchivesClusterin
// Regex used to exclude non-alpha-numeric tokens
private static final Pattern alphaNumeric = Pattern.compile("^[a-z][a-z0-9_]+$");
+ private final static Matcher matcher = alphaNumeric.matcher("");
public MailArchivesClusteringAnalyzer() {
- super(Version.LUCENE_31, STOP_WORDS);
+ super(LUCENE_VERSION, STOP_WORDS);
}
public MailArchivesClusteringAnalyzer(Set<?> stopSet) {
- super(Version.LUCENE_31, stopSet);
+ super(LUCENE_VERSION, stopSet);
}
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
- Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_31, reader);
- TokenStream result = new StandardFilter(Version.LUCENE_31, tokenizer);
- result = new LowerCaseFilter(Version.LUCENE_31, result);
+ Tokenizer tokenizer = new StandardTokenizer(LUCENE_VERSION, reader);
+ TokenStream result = new StandardFilter(LUCENE_VERSION, tokenizer);
+ result = new LowerCaseFilter(LUCENE_VERSION, result);
result = new ASCIIFoldingFilter(result);
result = new AlphaNumericMaxLengthFilter(result);
- result = new StopFilter(Version.LUCENE_31, result, stopwords);
+ result = new StopFilter(LUCENE_VERSION, result, stopwords);
result = new PorterStemFilter(result);
return new TokenStreamComponents(tokenizer, result);
}
@@ -131,12 +133,10 @@ public final class MailArchivesClusterin
static class AlphaNumericMaxLengthFilter extends TokenFilter {
private final CharTermAttribute termAtt;
private final char[] output = new char[28];
- private final Matcher matcher;
AlphaNumericMaxLengthFilter(TokenStream in) {
super(in);
termAtt = addAttribute(CharTermAttribute.class);
- matcher = alphaNumeric.matcher("foo");
}
@Override
Modified: mahout/trunk/pom.xml
URL: http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1346364&r1=1346363&r2=1346364&view=diff
==============================================================================
--- mahout/trunk/pom.xml (original)
+++ mahout/trunk/pom.xml Tue Jun 5 12:11:21 2012
@@ -98,7 +98,7 @@
<maven.clover.multiproject>true</maven.clover.multiproject>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<hadoop.version>0.20.204.0</hadoop.version>
- <lucene.version>3.5.0</lucene.version>
+ <lucene.version>3.6.0</lucene.version>
<mongodb.version>2.5</mongodb.version>
<cassandra.version>0.8.1</cassandra.version>
</properties>
@@ -176,11 +176,6 @@
<artifactId>lucene-benchmark</artifactId>
<version>${lucene.version}</version>
</dependency>
- <dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-commons-csv</artifactId>
- <version>${lucene.version}</version>
- </dependency>
<dependency>
<groupId>junit</groupId>