You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/08/23 16:45:41 UTC

svn commit: r988137 - in /lucene/dev/branches/branch_3x/solr: ./ contrib/clustering/ contrib/clustering/lib/ contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/ contrib/clustering/src/test/java/org/apache/solr/handler/clusterin...

Author: gsingers
Date: Mon Aug 23 14:45:40 2010
New Revision: 988137

URL: http://svn.apache.org/viewvc?rev=988137&view=rev
Log:
SOLR-1804: upgraded to latest version of Carrot2

Added:
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar   (with props)
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java   (with props)
    lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar   (with props)
Removed:
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/ehcache-1.6.2.jar
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar
    lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar
    lucene/dev/branches/branch_3x/solr/lib/google-collect-1.0.jar
Modified:
    lucene/dev/branches/branch_3x/solr/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/LICENSE.txt
    lucene/dev/branches/branch_3x/solr/NOTICE.txt
    lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
    lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
    lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml

Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Mon Aug 23 14:45:40 2010
@@ -445,6 +445,8 @@ Other Changes
 * SOLR-2003: SolrResourceLoader will report any encoding errors, rather than
   silently using replacement characters for invalid inputs (blargy via rmuir)
 
+* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)  
+
 Build
 ----------------------
 

Modified: lucene/dev/branches/branch_3x/solr/LICENSE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/LICENSE.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/LICENSE.txt (original)
+++ lucene/dev/branches/branch_3x/solr/LICENSE.txt Mon Aug 23 14:45:40 2010
@@ -778,26 +778,9 @@ ANY  THEORY  OF  LIABILITY,  WHETHER  IN
 (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY  OUT OF THE USE  OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-==========================================================================
-EHCache
-/**
- *  Copyright 2003-2008 Luck Consulting Pty Ltd
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
 
 ==========================================================================
-Google Collections
+Guava
 /**
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.

Modified: lucene/dev/branches/branch_3x/solr/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/NOTICE.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/NOTICE.txt (original)
+++ lucene/dev/branches/branch_3x/solr/NOTICE.txt Mon Aug 23 14:45:40 2010
@@ -148,7 +148,7 @@ Copyright (c) 2000-2005 INRIA, France Te
 =========================================================================
 ==     Carrot2 Notice                                                  ==
 =========================================================================
-Copyright (C) 2002-2008, Dawid Weiss, Stanislaw Osinski.
+Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
 Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
 All rights reserved.
 
@@ -156,24 +156,16 @@ This product includes software developed
 
 See http://project.carrot2.org/
 
-=========================================================================
-==     EHCache Notice                                                  ==
-=========================================================================
-Copyright 2003-2008 Luck Consulting Pty Ltd
-
-This product includes software developed by the EHCache Project
-
-See ????
 
 =========================================================================
-==     Google Collections Notice                                       ==
+==     Guava Notice                                                    ==
 =========================================================================
 
 Copyright ???? Google, Inc.
 
-This product includes software developed by the Google Collections project.
+This product includes software developed by the Google Guava project.
 
-See ????
+See http://code.google.com/p/guava-libraries/
 
 =========================================================================
 ==     Jackson Notice                                                  ==
@@ -182,7 +174,7 @@ Copyright ????
 
 This product includes software developed by the Jackson project.
 
-See ????
+See http://jackson.codehaus.org/
 
 =========================================================================
 ==     HSQLDB Notice                                                   ==

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt Mon Aug 23 14:45:40 2010
@@ -8,12 +8,15 @@ CHANGES
 
 $Id:$
 
-================== Release 1.5-dev ==================
+================== Release XXXX ==================
 
 * SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
 
 * SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers)
 
+* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2.  No more LGPL run-time dependencies.
+  This release of C2 also does not have a specific Lucene dependency.  (Stanislaw Osinski, gsingers)
+
 ================== Release 1.4.0 ==================
 
 Solr Clustering will be released for the first time in Solr 1.4.  See http://wiki.apache.org/solr/ClusteringComponent

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml Mon Aug 23 14:45:40 2010
@@ -27,12 +27,10 @@
     Clustering Integraton
   </description>
 
-  <property name="download.dir" value="lib/downloads"/>
   <property name="example.local" value="example"/>
   
   <path id="common.classpath">
     <fileset dir="lib"/>
-    <fileset dir="${download.dir}"/>
     <pathelement location="${solr-path}/build/solr"/>
     <pathelement location="${solr-path}/build/solrj"/>
     <path refid="lucene.classpath"/>
@@ -56,51 +54,22 @@
       -->
     <delete dir="example/lib" />
   </target>
-  <target name="clean-downloads">
-    <delete>
-      <fileset dir="${download.dir}"/>
-    </delete>
-  </target>
+
 
   <target name="init">
     <mkdir dir="${dest}/classes"/>
-    <mkdir dir="${download.dir}" />
+    
     <mkdir dir="${build.javadoc}"/>
-    <ant dir="../../" inheritall="false" target="compileTests"/> <!-- compiles src and tests -->
-    <ant dir="../../" inheritall="false" target="make-manifest"/>
-  </target>
-
-  <target name="check-files" depends="proxy.setup">
-    <available file="${download.dir}/colt-1.2.0.jar" property="colt.exists"/>
-    <available file="${download.dir}/pcj-1.2.jar" property="pcj.exists"/>
-    <available file="${download.dir}/nni-1.0.0.jar" property="nni.exists"/>
-    <available file="${download.dir}/simple-xml-1.7.3.jar" property="simplexml.exists"/>
-  </target>
-  <!-- http://mirrors.ibiblio.org/pub/mirrors/maven2/org/simpleframework/simple-xml/1.7.3/simple-xml-1.7.3.jar -->
-  <target name="get-colt" depends="check-files" unless="colt.exists">
-    <!-- Get the LGPL deps and put them in a separate dir -->
-    <get src="http://repo1.maven.org/maven2/colt/colt/1.2.0/colt-1.2.0.jar" dest="${download.dir}/colt-1.2.0.jar"/>
-  </target>
-  <target name="get-nni" depends="check-files" unless="nni.exists">
-    <!-- Get the LGPL deps and put them in a separate dir -->
-    <get src="http://download.carrot2.org/maven2/org/carrot2/nni/1.0.0/nni-1.0.0.jar"
-         dest="${download.dir}/nni-1.0.0.jar"/>
+    <subant target="compileTests">
+      <fileset dir="${solr-path}" includes="build.xml"/>
+    </subant>
+    <subant target="make-manifest">
+      <fileset dir="${solr-path}" includes="build.xml"/>
+    </subant>
   </target>
 
-  <!-- Compile time dep. only -->
-  <target name="get-simple-xml" depends="check-files" unless="simplexml.exists">
-    <!-- Get the LGPL deps and put them in a separate dir -->
-    <get src="http://mirrors.ibiblio.org/pub/mirrors/maven2/org/simpleframework/simple-xml/1.7.3/simple-xml-1.7.3.jar"
-         dest="${download.dir}/simple-xml-1.7.3.jar"/>
 
-  </target>
-  <target name="get-pcj" depends="check-files" unless="pcj.exists">
-    <!-- Get the LGPL deps and put them in a separate dir -->
-    <get src="http://repo1.maven.org/maven2/pcj/pcj/1.2/pcj-1.2.jar" dest="${download.dir}/pcj-1.2.jar"/>
-  </target>
-  <target name="get-libraries" depends="init, get-colt, get-pcj, get-nni, get-simple-xml"/>
-
-  <target name="compile" depends="init, get-libraries">
+  <target name="compile" depends="init">
     <solr-javac destdir="${dest}/classes"
                 classpathref="common.classpath">
       <src path="src/main/java"/>
@@ -126,14 +95,9 @@
       -->
   </target>
 
-  <property name="tempDir" value="${junit.output.dir}/temp" />
 
   <target name="test" depends="compileTests">
     <mkdir dir="${junit.output.dir}"/>
-    <!-- <mkdir dir="@{tempDir}/@{pattern}"/> 
-       This is very loud and obnoxious. abuse touch instead for a "quiet" mkdir
-    -->
-  	<touch file="${tempDir}/quiet.ant" verbose="false" mkdirs="true"/>
 
     <junit printsummary="on"
            haltonfailure="no"
@@ -141,11 +105,8 @@
            errorProperty="tests.failed"
            failureProperty="tests.failed"
            dir="src/test/resources/"
-           tempdir="${tempDir}"
+           tempdir="${junit.output.dir}"
             >
-      <sysproperty key="jetty.insecurerandom" value="1"/>
-      <sysproperty key="tempDir" file="${tempDir}"/>
-      <sysproperty key="testmethod" value="${testmethod}"/>
       <formatter type="brief" usefile="false" if="junit.details"/>
       <classpath refid="test.classpath"/>
       <assertions>

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon Aug 23 14:45:40 2010
@@ -58,7 +58,7 @@ public class CarrotClusteringEngine exte
   /**
    * Carrot2 controller that manages instances of clustering algorithms
    */
-  private CachingController controller = new CachingController();
+  private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
 
   private String idFieldName;
@@ -91,6 +91,12 @@ public class CarrotClusteringEngine exte
     // Initialize Carrot2 controller. Pass initialization attributes, if any.
     HashMap<String, Object> initAttributes = new HashMap<String, Object>();
     extractCarrotAttributes(initParams, initAttributes);
+    
+    // Customize the language model factory. The implementation we provide here
+    // is included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factory relies on if needed.
+    initAttributes.put("PreprocessingPipeline.languageModelFactory",
+      new LuceneLanguageModelFactory());
     this.controller.init(initAttributes);
 
     this.idFieldName = core.getSchema().getUniqueKeyField().getName();

Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java?rev=988137&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java Mon Aug 23 14:45:40 2010
@@ -0,0 +1,353 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.HashMap;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ar.ArabicNormalizer;
+import org.apache.lucene.analysis.ar.ArabicStemmer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
+import org.carrot2.text.analysis.ITokenizer;
+import org.carrot2.text.linguistic.BaseLanguageModelFactory;
+import org.carrot2.text.linguistic.IStemmer;
+import org.carrot2.text.linguistic.IdentityStemmer;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.ExceptionUtils;
+import org.carrot2.util.ReflectionUtils;
+import org.carrot2.util.attribute.Bindable;
+import org.slf4j.Logger;
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.DanishStemmer;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.FinnishStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
+import org.tartarus.snowball.ext.GermanStemmer;
+import org.tartarus.snowball.ext.HungarianStemmer;
+import org.tartarus.snowball.ext.ItalianStemmer;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+import org.tartarus.snowball.ext.RomanianStemmer;
+import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SpanishStemmer;
+import org.tartarus.snowball.ext.SwedishStemmer;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * A Solr-specific language model factory for Carrot2. This factory is the only
+ * element in Carrot2 that depends on Lucene APIs, so should the APIs need to
+ * change, the changes can be made in this class.
+ */
+@Bindable(prefix = "DefaultLanguageModelFactory")
+public class LuceneLanguageModelFactory extends BaseLanguageModelFactory {
+	final static Logger logger = org.slf4j.LoggerFactory
+			.getLogger(LuceneLanguageModelFactory.class);
+
+	/**
+	 * Picks the {@link IStemmer} for a language: ArabicStemmerFactory for Arabic,
+	 * the identity stemmer for Simplified Chinese, SnowballStemmerFactory otherwise.
+	 */
+	protected IStemmer createStemmer(LanguageCode language) {
+		final IStemmer stemmer;
+		switch (language) {
+		case CHINESE_SIMPLIFIED:
+			stemmer = IdentityStemmer.INSTANCE;
+			break;
+		case ARABIC:
+			stemmer = ArabicStemmerFactory.createStemmer();
+			break;
+		default: // Every other language goes through the Snowball lookup.
+			stemmer = SnowballStemmerFactory.createStemmer(language);
+		}
+		return stemmer;
+	}
+
+	/** Uses ChineseTokenizerFactory for Simplified Chinese, whitespace otherwise. */
+	@Override
+	protected ITokenizer createTokenizer(LanguageCode language) {
+		final ITokenizer tokenizer;
+		switch (language) {
+		case CHINESE_SIMPLIFIED:
+			tokenizer = ChineseTokenizerFactory.createTokenizer();
+			break;
+		// Arabic intentionally shares the default branch: we use our own analyzer
+		// for it. Lucene's version has special support for Nonspacing-Mark
+		// characters (see
+		// http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+		// have them included as letters in the parser.
+		case ARABIC:
+		default:
+			tokenizer = new ExtendedWhitespaceTokenizer();
+		}
+		return tokenizer;
+	}
+
+	/**
+	 * Builds {@link IStemmer} instances backed by stemmers from the
+	 * <code>snowball</code> project.
+	 */
+	private final static class SnowballStemmerFactory {
+		/**
+		 * Fixed table from language codes to Snowball stemmer classes. The classes
+		 * are referenced statically instead of being looked up dynamically, which
+		 * keeps the possibility to obfuscate them.
+		 */
+		private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+		static {
+			// Fill a local map first, then publish it through the static field.
+			final HashMap<LanguageCode, Class<? extends SnowballProgram>> byCode =
+					new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+			byCode.put(LanguageCode.DANISH, DanishStemmer.class);
+			byCode.put(LanguageCode.DUTCH, DutchStemmer.class);
+			byCode.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+			byCode.put(LanguageCode.FINNISH, FinnishStemmer.class);
+			byCode.put(LanguageCode.FRENCH, FrenchStemmer.class);
+			byCode.put(LanguageCode.GERMAN, GermanStemmer.class);
+			byCode.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+			byCode.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+			byCode.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+			byCode.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
+			byCode.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+			byCode.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+			byCode.put(LanguageCode.SPANISH, SpanishStemmer.class);
+			byCode.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+			byCode.put(LanguageCode.TURKISH, TurkishStemmer.class);
+			snowballStemmerClasses = byCode;
+		}
+
+		/**
+		 * Exposes a single {@link SnowballProgram} through the {@link IStemmer}
+		 * interface.
+		 */
+		private static class SnowballStemmerAdapter implements IStemmer {
+			private final SnowballProgram snowballStemmer;
+
+			public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+				this.snowballStemmer = snowballStemmer;
+			}
+
+			/** Returns the stemmed form, or null when the program's stem() is false. */
+			public CharSequence stem(CharSequence word) {
+				snowballStemmer.setCurrent(word.toString());
+				final boolean stemmed = snowballStemmer.stem();
+				return stemmed ? snowballStemmer.getCurrent() : null;
+			}
+		}
+
+		/**
+		 * Looks up the Snowball stemmer class registered for <code>language</code>
+		 * and wraps a new instance of it in an {@link IStemmer} adapter. When no
+		 * class is registered, or instantiation fails, a warning is logged and
+		 * {@link IdentityStemmer#INSTANCE} is returned instead.
+		 */
+		public static IStemmer createStemmer(LanguageCode language) {
+			final Class<? extends SnowballProgram> program =
+					snowballStemmerClasses.get(language);
+
+			// Stay with the identity stemmer unless a Snowball program can be built.
+			IStemmer result = IdentityStemmer.INSTANCE;
+			if (program != null) {
+				try {
+					result = new SnowballStemmerAdapter(program.newInstance());
+				} catch (Exception e) {
+					logger.warn("Could not instantiate snowball stemmer"
+							+ " for language: " + language.name()
+							+ ". Quality of clustering may be degraded.", e);
+				}
+			} else {
+				logger.warn("No Snowball stemmer class for: " + language.name()
+						+ ". Quality of clustering may be degraded.");
+			}
+			return result;
+		}
+	}
+
+	/**
+	 * Factory of {@link IStemmer} implementations for the
+	 * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
+	 * to be present in classpath, otherwise an empty (identity) stemmer is
+	 * returned.
+	 */
+	private static class ArabicStemmerFactory {
+		static {
+			// Probe for the Lucene classes when this factory is initialized. Throwable is
+			// caught as a missing class may also surface as an Error (NoClassDefFoundError).
+			try {
+				ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
+				ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
+			} catch (Throwable e) {
+				logger.warn("Could not instantiate Lucene stemmer for Arabic, clustering quality "
+						+ "of Arabic content may be degraded. For best quality clusters, "
+						+ "make sure Lucene's Arabic analyzer JAR is in the classpath", e);
+			}
+		}
+
+		/**
+		 * Adapter to lucene-contrib Arabic analyzers. The working buffer is reused
+		 * between calls and a returned stem wraps it without copying, so the result
+		 * of one stem() call is overwritten by the next call on the same adapter.
+		 */
+		private static class LuceneStemmerAdapter implements IStemmer {
+			private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+			private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+			private char[] buffer = new char[0];
+
+			private LuceneStemmerAdapter() throws Exception {
+				delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+				normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+			}
+
+			/** Returns the normalized and stemmed word, or null if nothing changed. */
+			public CharSequence stem(CharSequence word) {
+				if (word.length() > buffer.length) {
+					buffer = new char[word.length()];
+				}
+
+				for (int i = 0; i < word.length(); i++) {
+					buffer[i] = word.charAt(i);
+				}
+
+				int newLen = normalizer.normalize(buffer, word.length());
+				newLen = delegate.stem(buffer, newLen);
+
+				if (newLen != word.length() || !equals(buffer, newLen, word)) {
+					return CharBuffer.wrap(buffer, 0, newLen);
+				}
+
+				// Normalization and stemming left the word untouched.
+				return null;
+			}
+
+			private boolean equals(char[] buffer, int len, CharSequence word) {
+				assert len == word.length();
+				for (int i = 0; i < len; i++) {
+					if (buffer[i] != word.charAt(i))
+						return false;
+				}
+				return true;
+			}
+		}
+
+		/** Returns a new Lucene-backed stemmer, or the identity stemmer on any failure. */
+		public static IStemmer createStemmer() {
+			try {
+				return new LuceneStemmerAdapter();
+			} catch (Throwable e) {
+				return IdentityStemmer.INSTANCE;
+			}
+		}
+	}
+
+	/**
+	 * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+	 * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+	 * factory will fall back to the default white space tokenizer.
+	 */
+	private static final class ChineseTokenizerFactory {
+		static {
+			try {
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+			} catch (Throwable e) {
+				logger.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+						+ "of Chinese content may be degraded. For best quality clusters, "
+						+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+			}
+		}
+
+		/** Returns a Smart Chinese adapter, or a whitespace tokenizer on any failure. */
+		static ITokenizer createTokenizer() {
+			try {
+				return new ChineseTokenizer();
+			} catch (Throwable e) {
+				return new ExtendedWhitespaceTokenizer();
+			}
+		}
+
+		/** {@link ITokenizer} over Smart Chinese classes that are loaded via reflection. */
+		private final static class ChineseTokenizer implements ITokenizer {
+			private final static Pattern numeric = Pattern
+					.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+			private Tokenizer sentenceTokenizer;
+			private TokenStream wordTokenFilter;
+			/** Term attribute of the current wordTokenFilter; assigned in reset(Reader). */
+			private CharTermAttribute term = null;
+
+			private final MutableCharArray tempCharSequence;
+			private final Class<?> tokenFilterClass;
+
+			private ChineseTokenizer() throws Exception {
+				this.tempCharSequence = new MutableCharArray(new char[0]);
+
+				// As Smart Chinese is not available during compile time,
+				// we need to resort to reflection.
+				final Class<?> tokenizerClass = ReflectionUtils
+						.classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+				this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+						Reader.class).newInstance((Reader) null);
+				this.tokenFilterClass = ReflectionUtils
+						.classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+			}
+
+			/** Advances the filter; returns the token's ITokenizer.TT_* type or TT_EOF. */
+			public short nextToken() throws IOException {
+				if (!wordTokenFilter.incrementToken()) {
+					return ITokenizer.TT_EOF;
+				}
+				final char[] image = term.buffer();
+				final int length = term.length();
+				tempCharSequence.reset(image, 0, length);
+				if (length == 1 && image[0] == ',') {
+					// ChineseTokenizer seems to convert all punctuation to ',' characters
+					return ITokenizer.TT_PUNCTUATION;
+				}
+				if (numeric.matcher(tempCharSequence).matches()) {
+					return ITokenizer.TT_NUMERIC;
+				}
+				return ITokenizer.TT_TERM;
+			}
+
+			public void setTermBuffer(MutableCharArray array) {
+				array.reset(term.buffer(), 0, term.length());
+			}
+
+			public void reset(Reader input) throws IOException {
+				try {
+					sentenceTokenizer.reset(input);
+					wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+							TokenStream.class).newInstance(sentenceTokenizer);
+					// Bind term to the new filter; otherwise it stays null and token access NPEs.
+					term = wordTokenFilter.addAttribute(CharTermAttribute.class);
+				} catch (Exception e) {
+					throw ExceptionUtils.wrapAsRuntimeException(e);
+				}
+			}
+		}
+	}
+}

Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon Aug 23 14:45:40 2010
@@ -46,7 +46,10 @@ import static org.junit.Assert.*;
 public class CarrotClusteringEngineTest extends AbstractClusteringTest {
   @Test
   public void testCarrotLingo() throws Exception {
-    checkEngine(getClusteringEngine("default"), 10);
+  	// Note: the expected number of clusters may change after upgrading Carrot2
+  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+    final int expectedNumClusters = 10;
+		checkEngine(getClusteringEngine("default"), expectedNumClusters);
   }
 
   @Test
@@ -54,7 +57,11 @@ public class CarrotClusteringEngineTest 
     ModifiableSolrParams solrParams = new ModifiableSolrParams();
     solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
     solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
-    checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, 15, new TermQuery(new Term("snippet", "mine")), solrParams);
+    
+  	// Note: the expected number of clusters may change after upgrading Carrot2
+  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+    final int expectedNumClusters = 15;
+    checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
   }
 
   @Test

Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml Mon Aug 23 14:45:40 2010
@@ -64,7 +64,6 @@
   <!-- If a dir option (with or without a regex) is used and nothing is found
        that matches, it will be ignored
     -->
-  <lib dir="../../contrib/clustering/lib/downloads/" />
   <lib dir="../../contrib/clustering/lib/" />
   <lib dir="/total/crap/dir/ignored" /> 
   <!-- an exact path can be used to specify a specific file.  This will cause
@@ -799,6 +798,12 @@
            parameter name and attribute value as parameter value.
         -->
       <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+      
+      <!--
+           The language to assume for the documents. For a list of allowed values, see:
+           http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+       -->
+      <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
     </lst>
     <lst name="engine">
       <str name="name">stc</str>

Added: lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream