You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/08/23 16:24:01 UTC

svn commit: r988129 - in /lucene/dev/trunk/solr: ./ contrib/clustering/ contrib/clustering/lib/ contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/ contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/ ex...

Author: gsingers
Date: Mon Aug 23 14:24:00 2010
New Revision: 988129

URL: http://svn.apache.org/viewvc?rev=988129&view=rev
Log:
SOLR-1804: Re-integrated Carrot2

Added:
    lucene/dev/trunk/solr/contrib/clustering/build.xml   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/commons-lang-2.4.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/hppc-0.3.1.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/log4j-1.2.14.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/mahout-collections-0.3.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/mahout-math-0.3.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/lib/simple-xml-2.3.5.jar   (with props)
    lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java   (with props)
    lucene/dev/trunk/solr/lib/guava-r05.jar   (with props)
Removed:
    lucene/dev/trunk/solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar
    lucene/dev/trunk/solr/contrib/clustering/lib/ehcache-1.6.2.jar
    lucene/dev/trunk/solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar
    lucene/dev/trunk/solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar
    lucene/dev/trunk/solr/lib/google-collect-1.0.jar
Modified:
    lucene/dev/trunk/solr/CHANGES.txt
    lucene/dev/trunk/solr/LICENSE.txt
    lucene/dev/trunk/solr/NOTICE.txt
    lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
    lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
    lucene/dev/trunk/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
    lucene/dev/trunk/solr/example/solr/conf/solrconfig.xml

Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon Aug 23 14:24:00 2010
@@ -234,7 +234,7 @@ New Features
 
 * SOLR-1682: (SOLR-236, SOLR-237, SOLR-1773, SOLR-1311) Search grouping / Field collapsing.
   (Martijn van Groningen, Emmanuel Keller, Shalin Shekhar Mangar,
-   Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
+   Koji Sekiguchi, Iván de Prado, Ryan McKinley, Marc Sturlese, Peter Karich,
    Bojan Smid, Charles Hornberger, Dieter Grad, Dmitry Lihachev, Doug Steigerwald,
    Karsten Sperling, Michael Gundlach, Oleg Gnatovskiy, Thomas Traeger, yonik)
 
@@ -521,6 +521,8 @@ Other Changes
 * SOLR-2003: SolrResourceLoader will report any encoding errors, rather than
   silently using replacement characters for invalid inputs (blargy via rmuir)
 
+* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)  
+
 Build
 ----------------------
 

Modified: lucene/dev/trunk/solr/LICENSE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/LICENSE.txt?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/LICENSE.txt (original)
+++ lucene/dev/trunk/solr/LICENSE.txt Mon Aug 23 14:24:00 2010
@@ -778,26 +778,9 @@ ANY  THEORY  OF  LIABILITY,  WHETHER  IN
 (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY  OUT OF THE USE  OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-==========================================================================
-EHCache
-/**
- *  Copyright 2003-2008 Luck Consulting Pty Ltd
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
 
 ==========================================================================
-Google Collections
+Guava
 /**
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.

Modified: lucene/dev/trunk/solr/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/NOTICE.txt?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/NOTICE.txt (original)
+++ lucene/dev/trunk/solr/NOTICE.txt Mon Aug 23 14:24:00 2010
@@ -148,7 +148,7 @@ Copyright (c) 2000-2005 INRIA, France Te
 =========================================================================
 ==     Carrot2 Notice                                                  ==
 =========================================================================
-Copyright (C) 2002-2008, Dawid Weiss, Stanislaw Osinski.
+Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
 Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
 All rights reserved.
 
@@ -156,24 +156,16 @@ This product includes software developed
 
 See http://project.carrot2.org/
 
-=========================================================================
-==     EHCache Notice                                                  ==
-=========================================================================
-Copyright 2003-2008 Luck Consulting Pty Ltd
-
-This product includes software developed by the EHCache Project
-
-See ????
 
 =========================================================================
-==     Google Collections Notice                                       ==
+==     Guava Notice                                                    ==
 =========================================================================
 
 Copyright ???? Google, Inc.
 
-This product includes software developed by the Google Collections project.
+This product includes software developed by the Google Guava project.
 
-See ????
+See http://code.google.com/p/guava-libraries/
 
 =========================================================================
 ==     Jackson Notice                                                  ==
@@ -182,7 +174,7 @@ Copyright ????
 
 This product includes software developed by the Jackson project.
 
-See ????
+See http://jackson.codehaus.org/
 
 =========================================================================
 ==     HSQLDB Notice                                                   ==

Modified: lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/contrib/clustering/CHANGES.txt Mon Aug 23 14:24:00 2010
@@ -8,12 +8,15 @@ CHANGES
 
 $Id:$
 
-================== Release 1.5-dev ==================
+================== Release XXXX ==================
 
 * SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
 
 * SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers)
 
+* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2.  No more LGPL run-time dependencies.
+  This release of C2 also does not have a specific Lucene dependency.  (Stanislaw Osinski, gsingers)
+
 ================== Release 1.4.0 ==================
 
 Solr Clustering will be released for the first time in Solr 1.4.  See http://wiki.apache.org/solr/ClusteringComponent

Added: lucene/dev/trunk/solr/contrib/clustering/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/build.xml?rev=988129&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/build.xml (added)
+++ lucene/dev/trunk/solr/contrib/clustering/build.xml Mon Aug 23 14:24:00 2010
@@ -0,0 +1,160 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="solr-clustering" default="build">
+
+  <property name="solr-path" value="../.."/>
+
+  <import file="../../common-build.xml"/>
+
+  <description>
+    Clustering Integration
+  </description>
+
+  <property name="example.local" value="example"/>
+  
+  <path id="common.classpath">
+    <fileset dir="lib"/>
+    <pathelement location="${solr-path}/build/solr"/>
+    <pathelement location="${solr-path}/build/solrj"/>
+    <path refid="lucene.classpath"/>
+    <fileset dir="${solr-path}/lib" includes="*.jar"/>
+  </path>
+
+  <path id="test.classpath">
+    <pathelement path="${dest}/classes"/>
+    <pathelement path="${dest}/test-classes"/>
+    <pathelement path="${java.class.path}"/>
+    <pathelement location="${common-solr.dir}/build/tests"/> <!-- include solr test code -->
+    <pathelement location="${common-solr.dir}/../lucene/build/classes/test" />  <!-- include some lucene test code -->
+    <path refid="common.classpath"/>
+  </path>
+
+  <target name="clean">
+    <delete failonerror="false" dir="${dest}"/>
+
+    <!-- example doesn't create this anymore, but clean it up
+         if it's still there from an old build
+      -->
+    <delete dir="example/lib" />
+  </target>
+
+
+  <target name="init">
+    <mkdir dir="${dest}/classes"/>
+    
+    <mkdir dir="${build.javadoc}"/>
+    <subant target="compileTests">
+      <fileset dir="${solr-path}" includes="build.xml"/>
+    </subant>
+    <subant target="make-manifest">
+      <fileset dir="${solr-path}" includes="build.xml"/>
+    </subant>
+  </target>
+
+
+  <target name="compile" depends="init">
+    <solr-javac destdir="${dest}/classes"
+                classpathref="common.classpath">
+      <src path="src/main/java"/>
+    </solr-javac>
+  </target>
+
+  <target name="build" depends="compile">
+    <solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
+              manifest="../../${dest}/META-INF/MANIFEST.MF"/>
+  </target>
+
+  <target name="compileTests" depends="compile">
+    <solr-javac destdir="${dest}/test-classes"
+                classpathref="test.classpath">
+      <src path="src/test/java"/>
+    </solr-javac>
+  </target>
+
+  <target name="example" depends="build,dist">
+    <!-- this task used to copy libs, but that's no longer needed because
+         ../lib and ../lib/downloads are now included explicitly by
+         example/conf/solrconfig.xml
+      -->
+  </target>
+
+
+  <target name="test" depends="compileTests">
+    <mkdir dir="${junit.output.dir}"/>
+
+    <junit printsummary="on"
+           haltonfailure="no"
+           maxmemory="512M"
+           errorProperty="tests.failed"
+           failureProperty="tests.failed"
+           dir="src/test/resources/"
+           tempdir="${junit.output.dir}"
+            >
+      <formatter type="brief" usefile="false" if="junit.details"/>
+      <classpath refid="test.classpath"/>
+      <assertions>
+        <enable package="org.apache.lucene"/>
+        <enable package="org.apache.solr"/>
+      </assertions>
+      <formatter type="xml"/>
+      <batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
+        <fileset dir="src/test/java" includes="${junit.includes}">
+          <exclude name="**/AbstractClusteringTest*"/>
+        </fileset>
+      </batchtest>
+      <batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
+        <fileset dir="src/test/java" includes="**/${testcase}.java"/>
+      </batchtest>
+    </junit>
+
+    <fail if="tests.failed">Tests failed!</fail>
+  </target>
+
+  <target name="dist" depends="build">
+    <!--
+      <copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/build/web/WEB-INF/lib"/>
+      <copy todir="${solr-path}/build/web/WEB-INF/lib" flatten="true">
+        <fileset dir="lib">
+          <include name="**/*.jar"/>
+        </fileset>
+      </copy>
+    -->
+    <copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
+  </target>
+
+  <target name="javadoc">
+    <sequential>
+      <mkdir dir="${build.javadoc}/contrib-${name}"/>
+
+      <path id="javadoc.classpath">
+        <path refid="common.classpath"/>
+      </path>
+
+      <invoke-javadoc
+              destdir="${build.javadoc}/contrib-${name}"
+              title="${Name} ${version} contrib-${fullnamever} API">
+        <sources>
+          <packageset dir="src/main/java"/>
+        </sources>
+      </invoke-javadoc>
+    </sequential>
+  </target>
+
+</project>

Propchange: lucene/dev/trunk/solr/contrib/clustering/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/dev/trunk/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/commons-lang-2.4.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/commons-lang-2.4.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/commons-lang-2.4.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/hppc-0.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/hppc-0.3.1.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/hppc-0.3.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/log4j-1.2.14.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/log4j-1.2.14.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/log4j-1.2.14.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/mahout-collections-0.3.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/mahout-collections-0.3.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/mahout-collections-0.3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/mahout-math-0.3.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/mahout-math-0.3.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/mahout-math-0.3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/dev/trunk/solr/contrib/clustering/lib/simple-xml-2.3.5.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/lib/simple-xml-2.3.5.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/contrib/clustering/lib/simple-xml-2.3.5.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon Aug 23 14:24:00 2010
@@ -58,7 +58,7 @@ public class CarrotClusteringEngine exte
   /**
    * Carrot2 controller that manages instances of clustering algorithms
    */
-  private CachingController controller = new CachingController();
+  private Controller controller = ControllerFactory.createPooling();
   private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
 
   private String idFieldName;
@@ -91,6 +91,12 @@ public class CarrotClusteringEngine exte
     // Initialize Carrot2 controller. Pass initialization attributes, if any.
     HashMap<String, Object> initAttributes = new HashMap<String, Object>();
     extractCarrotAttributes(initParams, initAttributes);
+    
+    // Customize the language model factory. The implementation we provide here
+    // is included in the code base of Solr, so that it's possible to refactor
+    // the Lucene APIs the factory relies on if needed.
+    initAttributes.put("PreprocessingPipeline.languageModelFactory",
+      new LuceneLanguageModelFactory());
     this.controller.init(initAttributes);
 
     this.idFieldName = core.getSchema().getUniqueKeyField().getName();

Added: lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java?rev=988129&view=auto
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java (added)
+++ lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java Mon Aug 23 14:24:00 2010
@@ -0,0 +1,353 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.HashMap;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ar.ArabicNormalizer;
+import org.apache.lucene.analysis.ar.ArabicStemmer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
+import org.carrot2.text.analysis.ITokenizer;
+import org.carrot2.text.linguistic.BaseLanguageModelFactory;
+import org.carrot2.text.linguistic.IStemmer;
+import org.carrot2.text.linguistic.IdentityStemmer;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.ExceptionUtils;
+import org.carrot2.util.ReflectionUtils;
+import org.carrot2.util.attribute.Bindable;
+import org.slf4j.Logger;
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.DanishStemmer;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.FinnishStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
+import org.tartarus.snowball.ext.GermanStemmer;
+import org.tartarus.snowball.ext.HungarianStemmer;
+import org.tartarus.snowball.ext.ItalianStemmer;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+import org.tartarus.snowball.ext.RomanianStemmer;
+import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SpanishStemmer;
+import org.tartarus.snowball.ext.SwedishStemmer;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * A Solr-specific language model factory for Carrot2. This factory is the only
+ * element in Carrot2 that depends on Lucene APIs, so should the APIs need to
+ * change, the changes can be made in this class.
+ */
+@Bindable(prefix = "DefaultLanguageModelFactory")
+public class LuceneLanguageModelFactory extends BaseLanguageModelFactory {
+	final static Logger logger = org.slf4j.LoggerFactory
+			.getLogger(LuceneLanguageModelFactory.class);
+
+	/**
+	 * Provide an {@link IStemmer} implementation for a given language: the
+	 * Arabic stemmer factory for Arabic, the identity stemmer for Simplified
+	 * Chinese, and the Snowball stemmer factory for every other language.
+	 */
+	@Override
+	protected IStemmer createStemmer(LanguageCode language) {
+		switch (language) {
+		case ARABIC:
+			return ArabicStemmerFactory.createStemmer();
+
+		case CHINESE_SIMPLIFIED:
+			return IdentityStemmer.INSTANCE;
+
+		default:
+			return SnowballStemmerFactory.createStemmer(language);
+		}
+	}
+
+	@Override
+	protected ITokenizer createTokenizer(LanguageCode language) {
+		switch (language) {
+		case CHINESE_SIMPLIFIED:
+			return ChineseTokenizerFactory.createTokenizer();
+
+			/*
+			 * We use our own analyzer for Arabic. Lucene's version has special
+			 * support for Nonspacing-Mark characters (see
+			 * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+			 * have them included as letters in the parser.
+			 */
+		case ARABIC:
+			// Intentional fall-through.
+
+		default:
+			return new ExtendedWhitespaceTokenizer();
+		}
+	}
+
+	/**
+	 * Builds {@link IStemmer} implementations backed by stemmers from the
+	 * <code>snowball</code> project.
+	 */
+	private final static class SnowballStemmerFactory {
+		/**
+		 * Hard-wired table of the Snowball stemmer class used for each language
+		 * code. The table is intentionally not built dynamically, so that it
+		 * remains possible to obfuscate these classes.
+		 */
+		private static final HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+		static {
+			// Languages without an entry here get the identity stemmer instead.
+			final HashMap<LanguageCode, Class<? extends SnowballProgram>> table =
+					new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+			table.put(LanguageCode.DANISH, DanishStemmer.class);
+			table.put(LanguageCode.DUTCH, DutchStemmer.class);
+			table.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+			table.put(LanguageCode.FINNISH, FinnishStemmer.class);
+			table.put(LanguageCode.FRENCH, FrenchStemmer.class);
+			table.put(LanguageCode.GERMAN, GermanStemmer.class);
+			table.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+			table.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+			table.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+			table.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
+			table.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+			table.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+			table.put(LanguageCode.SPANISH, SpanishStemmer.class);
+			table.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+			table.put(LanguageCode.TURKISH, TurkishStemmer.class);
+			snowballStemmerClasses = table;
+		}
+
+		/**
+		 * Wraps a {@link SnowballProgram} so it can be used as an {@link IStemmer}.
+		 */
+		private static class SnowballStemmerAdapter implements IStemmer {
+			private final SnowballProgram program;
+
+			public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+				this.program = snowballStemmer;
+			}
+
+			public CharSequence stem(CharSequence word) {
+				program.setCurrent(word.toString());
+				// null is returned when the Snowball program's stem() call fails.
+				return program.stem() ? program.getCurrent() : null;
+			}
+		}
+
+		/**
+		 * Create and return an {@link IStemmer} adapter for the
+		 * {@link SnowballProgram} mapped to the given language code. The identity
+		 * stemmer is returned, and a warning logged, when no stemmer class is
+		 * mapped to the language or when the class cannot be instantiated.
+		 */
+		public static IStemmer createStemmer(LanguageCode language) {
+			final Class<? extends SnowballProgram> programClass = snowballStemmerClasses
+					.get(language);
+
+			if (programClass == null) {
+				logger.warn("No Snowball stemmer class for: " + language.name()
+						+ ". Quality of clustering may be degraded.");
+				return IdentityStemmer.INSTANCE;
+			}
+
+			// Instantiate reflectively; any failure degrades to the identity stemmer.
+			try {
+				final SnowballProgram program = programClass.newInstance();
+				return new SnowballStemmerAdapter(program);
+			} catch (Exception e) {
+				logger.warn("Could not instantiate snowball stemmer"
+						+ " for language: " + language.name()
+						+ ". Quality of clustering may be degraded.", e);
+
+				return IdentityStemmer.INSTANCE;
+			}
+		}
+	}
+
+	/**
+	 * Factory of {@link IStemmer} implementations for the
+	 * {@link LanguageCode#ARABIC} language. Requires <code>lucene-contrib</code>
+	 * to be present in classpath, otherwise an empty (identity) stemmer is
+	 * returned.
+	 */
+	private static class ArabicStemmerFactory {
+		static {
+			try {
+				ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
+				ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
+			} catch (ClassNotFoundException e) {
+				logger.warn("Could not instantiate Lucene stemmer for Arabic, clustering quality "
+						+ "of Arabic content may be degraded. For best quality clusters, "
+						+ "make sure Lucene's Arabic analyzer JAR is in the classpath", e);
+			}
+		}
+
+		/**
+		 * Adapter to lucene-contrib Arabic analyzers.
+		 */
+		private static class LuceneStemmerAdapter implements IStemmer {
+			private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+			private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+			private char[] buffer = new char[0];
+
+			private LuceneStemmerAdapter() throws Exception {
+				delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+				normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+			}
+
+			/**
+			 * Normalizes, then stems; returns <code>null</code> if the word is unchanged.
+			 */
+			public CharSequence stem(CharSequence word) {
+				if (word.length() > buffer.length) {
+					buffer = new char[word.length()];
+				}
+
+				for (int i = 0; i < word.length(); i++) {
+					buffer[i] = word.charAt(i);
+				}
+
+				int newLen = normalizer.normalize(buffer, word.length());
+				newLen = delegate.stem(buffer, newLen);
+
+				if (newLen != word.length() || !equals(buffer, newLen, word)) {
+					return CharBuffer.wrap(buffer, 0, newLen);
+				}
+
+				// Same-same.
+				return null;
+			}
+
+			private boolean equals(char[] buffer, int len, CharSequence word) {
+				assert len == word.length();
+
+				for (int i = 0; i < len; i++) {
+					if (buffer[i] != word.charAt(i))
+						return false;
+				}
+
+				return true;
+			}
+		}
+
+		public static IStemmer createStemmer() {
+			try {
+				return new LuceneStemmerAdapter();
+			} catch (Throwable e) {
+				return IdentityStemmer.INSTANCE;
+			}
+		}
+	}
+
+	/**
+	 * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+	 * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+	 * factory will fall back to the default white space tokenizer.
+	 */
+	private static final class ChineseTokenizerFactory {
+		static {
+			try {
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+				ReflectionUtils.classForName(
+						"org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+			} catch (Throwable e) {
+				logger
+						.warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+								+ "of Chinese content may be degraded. For best quality clusters, "
+								+ "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+			}
+		}
+
+		static ITokenizer createTokenizer() {
+			try {
+				return new ChineseTokenizer();
+			} catch (Throwable e) {
+				return new ExtendedWhitespaceTokenizer();
+			}
+		}
+
+		private final static class ChineseTokenizer implements ITokenizer {
+			private final static Pattern numeric = Pattern
+					.compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+			private Tokenizer sentenceTokenizer;
+			private TokenStream wordTokenFilter;
+			private CharTermAttribute term = null;
+
+			private final MutableCharArray tempCharSequence;
+			private final Class<?> tokenFilterClass;
+
+			private ChineseTokenizer() throws Exception {
+				this.tempCharSequence = new MutableCharArray(new char[0]);
+
+				// As Smart Chinese is not available during compile time,
+				// we need to resort to reflection.
+				final Class<?> tokenizerClass = ReflectionUtils
+						.classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+				this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+						Reader.class).newInstance((Reader) null);
+				this.tokenFilterClass = ReflectionUtils
+						.classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+			}
+
+			public short nextToken() throws IOException {
+				final boolean hasNextToken = wordTokenFilter.incrementToken();
+				if (hasNextToken) {
+					short flags = 0;
+					final char[] image = term.buffer();
+					final int length = term.length();
+					tempCharSequence.reset(image, 0, length);
+					if (length == 1 && image[0] == ',') {
+						// ChineseTokenizer seems to convert all punctuation to ','
+						// characters
+						flags = ITokenizer.TT_PUNCTUATION;
+					} else if (numeric.matcher(tempCharSequence).matches()) {
+						flags = ITokenizer.TT_NUMERIC;
+					} else {
+						flags = ITokenizer.TT_TERM;
+					}
+					return flags;
+				}
+
+				return ITokenizer.TT_EOF;
+			}
+
+			public void setTermBuffer(MutableCharArray array) {
+				array.reset(term.buffer(), 0, term.length());
+			}
+
+			public void reset(Reader input) throws IOException {
+				try {
+					sentenceTokenizer.reset(input);
+					wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+							TokenStream.class).newInstance(sentenceTokenizer);
+				} catch (Exception e) {
+					throw ExceptionUtils.wrapAsRuntimeException(e);
+				}
+			}
+		}
+	}
+}

Propchange: lucene/dev/trunk/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/dev/trunk/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/trunk/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon Aug 23 14:24:00 2010
@@ -46,7 +46,10 @@ import static org.junit.Assert.*;
 public class CarrotClusteringEngineTest extends AbstractClusteringTest {
   @Test
   public void testCarrotLingo() throws Exception {
-    checkEngine(getClusteringEngine("default"), 10);
+  	// Note: the expected number of clusters may change after upgrading Carrot2
+  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+    final int expectedNumClusters = 10;
+		checkEngine(getClusteringEngine("default"), expectedNumClusters);
   }
 
   @Test
@@ -54,7 +57,11 @@ public class CarrotClusteringEngineTest 
     ModifiableSolrParams solrParams = new ModifiableSolrParams();
     solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
     solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
-    checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, 15, new TermQuery(new Term("snippet", "mine")), solrParams);
+    
+  	// Note: the expected number of clusters may change after upgrading Carrot2
+  	// due to e.g. internal improvements or tuning of Carrot2 clustering.
+    final int expectedNumClusters = 15;
+    checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
   }
 
   @Test

Modified: lucene/dev/trunk/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/example/solr/conf/solrconfig.xml?rev=988129&r1=988128&r2=988129&view=diff
==============================================================================
--- lucene/dev/trunk/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/trunk/solr/example/solr/conf/solrconfig.xml Mon Aug 23 14:24:00 2010
@@ -56,7 +56,6 @@
   <!-- If a dir option (with or without a regex) is used and nothing is found
        that matches, it will be ignored
     -->
-  <lib dir="../../contrib/clustering/lib/downloads/" />
   <lib dir="../../contrib/clustering/lib/" />
   <lib dir="/total/crap/dir/ignored" /> 
   <!-- an exact path can be used to specify a specific file.  This will cause
@@ -808,6 +807,12 @@
            parameter name and attribute value as parameter value.
         -->
       <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+      
+      <!--
+           The language to assume for the documents. For a list of allowed values, see:
+           http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+       -->
+      <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
     </lst>
     <lst name="engine">
       <str name="name">stc</str>

Added: lucene/dev/trunk/solr/lib/guava-r05.jar
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/lib/guava-r05.jar?rev=988129&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/dev/trunk/solr/lib/guava-r05.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream