You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by gs...@apache.org on 2010/08/23 16:45:41 UTC
svn commit: r988137 - in /lucene/dev/branches/branch_3x/solr: ./
contrib/clustering/ contrib/clustering/lib/
contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/
contrib/clustering/src/test/java/org/apache/solr/handler/clusterin...
Author: gsingers
Date: Mon Aug 23 14:45:40 2010
New Revision: 988137
URL: http://svn.apache.org/viewvc?rev=988137&view=rev
Log:
SOLR-1804: upgraded to latest version of Carrot2
Added:
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar (with props)
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java (with props)
lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar (with props)
Removed:
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-mini-3.1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/ehcache-1.6.2.jar
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-0.9.9-6.jar
lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-0.9.9-6.jar
lucene/dev/branches/branch_3x/solr/lib/google-collect-1.0.jar
Modified:
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/LICENSE.txt
lucene/dev/branches/branch_3x/solr/NOTICE.txt
lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Mon Aug 23 14:45:40 2010
@@ -445,6 +445,8 @@ Other Changes
* SOLR-2003: SolrResourceLoader will report any encoding errors, rather than
silently using replacement characters for invalid inputs (blargy via rmuir)
+* SOLR-1804: Google collections updated to Google Guava (which is a superset of collections and contains bug fixes) (gsingers)
+
Build
----------------------
Modified: lucene/dev/branches/branch_3x/solr/LICENSE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/LICENSE.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/LICENSE.txt (original)
+++ lucene/dev/branches/branch_3x/solr/LICENSE.txt Mon Aug 23 14:45:40 2010
@@ -778,26 +778,9 @@ ANY THEORY OF LIABILITY, WHETHER IN
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-==========================================================================
-EHCache
-/**
- * Copyright 2003-2008 Luck Consulting Pty Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
==========================================================================
-Google Collections
+Guava
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Modified: lucene/dev/branches/branch_3x/solr/NOTICE.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/NOTICE.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/NOTICE.txt (original)
+++ lucene/dev/branches/branch_3x/solr/NOTICE.txt Mon Aug 23 14:45:40 2010
@@ -148,7 +148,7 @@ Copyright (c) 2000-2005 INRIA, France Te
=========================================================================
== Carrot2 Notice ==
=========================================================================
-Copyright (C) 2002-2008, Dawid Weiss, Stanislaw Osinski.
+Copyright (C) 2002-2010, Dawid Weiss, Stanislaw Osinski.
Portions (C) Contributors listed in "carrot2.CONTRIBUTORS" file.
All rights reserved.
@@ -156,24 +156,16 @@ This product includes software developed
See http://project.carrot2.org/
-=========================================================================
-== EHCache Notice ==
-=========================================================================
-Copyright 2003-2008 Luck Consulting Pty Ltd
-
-This product includes software developed by the EHCache Project
-
-See ????
=========================================================================
-== Google Collections Notice ==
+== Guava Notice ==
=========================================================================
Copyright ???? Google, Inc.
-This product includes software developed by the Google Collections project.
+This product includes software developed by the Google Guava project.
-See ????
+See http://code.google.com/p/guava-libraries/
=========================================================================
== Jackson Notice ==
@@ -182,7 +174,7 @@ Copyright ????
This product includes software developed by the Jackson project.
-See ????
+See http://jackson.codehaus.org/
=========================================================================
== HSQLDB Notice ==
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/CHANGES.txt Mon Aug 23 14:45:40 2010
@@ -8,12 +8,15 @@ CHANGES
$Id:$
-================== Release 1.5-dev ==================
+================== Release XXXX ==================
* SOLR-1684: Switch to use the SolrIndexSearcher.doc(int, Set<String>) method b/c it can use the document cache (gsingers)
* SOLR-1692: Fix bug relating to carrot.produceSummary option (gsingers)
+* SOLR-1804: Re-enabled clustering on trunk, updated to latest version of Carrot2. No more LGPL run-time dependencies.
+ This release of C2 also does not have a specific Lucene dependency. (Stanislaw Osinski, gsingers)
+
================== Release 1.4.0 ==================
Solr Clustering will be released for the first time in Solr 1.4. See http://wiki.apache.org/solr/ClusteringComponent
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/build.xml Mon Aug 23 14:45:40 2010
@@ -27,12 +27,10 @@
Clustering Integraton
</description>
- <property name="download.dir" value="lib/downloads"/>
<property name="example.local" value="example"/>
<path id="common.classpath">
<fileset dir="lib"/>
- <fileset dir="${download.dir}"/>
<pathelement location="${solr-path}/build/solr"/>
<pathelement location="${solr-path}/build/solrj"/>
<path refid="lucene.classpath"/>
@@ -56,51 +54,22 @@
-->
<delete dir="example/lib" />
</target>
- <target name="clean-downloads">
- <delete>
- <fileset dir="${download.dir}"/>
- </delete>
- </target>
+
<target name="init">
<mkdir dir="${dest}/classes"/>
- <mkdir dir="${download.dir}" />
+
<mkdir dir="${build.javadoc}"/>
- <ant dir="../../" inheritall="false" target="compileTests"/> <!-- compiles src and tests -->
- <ant dir="../../" inheritall="false" target="make-manifest"/>
- </target>
-
- <target name="check-files" depends="proxy.setup">
- <available file="${download.dir}/colt-1.2.0.jar" property="colt.exists"/>
- <available file="${download.dir}/pcj-1.2.jar" property="pcj.exists"/>
- <available file="${download.dir}/nni-1.0.0.jar" property="nni.exists"/>
- <available file="${download.dir}/simple-xml-1.7.3.jar" property="simplexml.exists"/>
- </target>
- <!-- http://mirrors.ibiblio.org/pub/mirrors/maven2/org/simpleframework/simple-xml/1.7.3/simple-xml-1.7.3.jar -->
- <target name="get-colt" depends="check-files" unless="colt.exists">
- <!-- Get the LGPL deps and put them in a separate dir -->
- <get src="http://repo1.maven.org/maven2/colt/colt/1.2.0/colt-1.2.0.jar" dest="${download.dir}/colt-1.2.0.jar"/>
- </target>
- <target name="get-nni" depends="check-files" unless="nni.exists">
- <!-- Get the LGPL deps and put them in a separate dir -->
- <get src="http://download.carrot2.org/maven2/org/carrot2/nni/1.0.0/nni-1.0.0.jar"
- dest="${download.dir}/nni-1.0.0.jar"/>
+ <subant target="compileTests">
+ <fileset dir="${solr-path}" includes="build.xml"/>
+ </subant>
+ <subant target="make-manifest">
+ <fileset dir="${solr-path}" includes="build.xml"/>
+ </subant>
</target>
- <!-- Compile time dep. only -->
- <target name="get-simple-xml" depends="check-files" unless="simplexml.exists">
- <!-- Get the LGPL deps and put them in a separate dir -->
- <get src="http://mirrors.ibiblio.org/pub/mirrors/maven2/org/simpleframework/simple-xml/1.7.3/simple-xml-1.7.3.jar"
- dest="${download.dir}/simple-xml-1.7.3.jar"/>
- </target>
- <target name="get-pcj" depends="check-files" unless="pcj.exists">
- <!-- Get the LGPL deps and put them in a separate dir -->
- <get src="http://repo1.maven.org/maven2/pcj/pcj/1.2/pcj-1.2.jar" dest="${download.dir}/pcj-1.2.jar"/>
- </target>
- <target name="get-libraries" depends="init, get-colt, get-pcj, get-nni, get-simple-xml"/>
-
- <target name="compile" depends="init, get-libraries">
+ <target name="compile" depends="init">
<solr-javac destdir="${dest}/classes"
classpathref="common.classpath">
<src path="src/main/java"/>
@@ -126,14 +95,9 @@
-->
</target>
- <property name="tempDir" value="${junit.output.dir}/temp" />
<target name="test" depends="compileTests">
<mkdir dir="${junit.output.dir}"/>
- <!-- <mkdir dir="@{tempDir}/@{pattern}"/>
- This is very loud and obnoxious. abuse touch instead for a "quiet" mkdir
- -->
- <touch file="${tempDir}/quiet.ant" verbose="false" mkdirs="true"/>
<junit printsummary="on"
haltonfailure="no"
@@ -141,11 +105,8 @@
errorProperty="tests.failed"
failureProperty="tests.failed"
dir="src/test/resources/"
- tempdir="${tempDir}"
+ tempdir="${junit.output.dir}"
>
- <sysproperty key="jetty.insecurerandom" value="1"/>
- <sysproperty key="tempDir" file="${tempDir}"/>
- <sysproperty key="testmethod" value="${testmethod}"/>
<formatter type="brief" usefile="false" if="junit.details"/>
<classpath refid="test.classpath"/>
<assertions>
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/carrot2-core-3.4.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/commons-lang-2.4.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/hppc-0.3.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-core-asl-1.5.2.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/jackson-mapper-asl-1.5.2.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/log4j-1.2.14.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-collections-0.3.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/mahout-math-0.3.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/lib/simple-xml-2.3.5.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngine.java Mon Aug 23 14:45:40 2010
@@ -58,7 +58,7 @@ public class CarrotClusteringEngine exte
/**
* Carrot2 controller that manages instances of clustering algorithms
*/
- private CachingController controller = new CachingController();
+ private Controller controller = ControllerFactory.createPooling();
private Class<? extends IClusteringAlgorithm> clusteringAlgorithmClass;
private String idFieldName;
@@ -91,6 +91,12 @@ public class CarrotClusteringEngine exte
// Initialize Carrot2 controller. Pass initialization attributes, if any.
HashMap<String, Object> initAttributes = new HashMap<String, Object>();
extractCarrotAttributes(initParams, initAttributes);
+
+ // Customize the language model factory. The implementation we provide here
+ // is included in the code base of Solr, so that it's possible to refactor
+ // the Lucene APIs the factory relies on if needed.
+ initAttributes.put("PreprocessingPipeline.languageModelFactory",
+ new LuceneLanguageModelFactory());
this.controller.init(initAttributes);
this.idFieldName = core.getSchema().getUniqueKeyField().getName();
Added: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java?rev=988137&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java (added)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java Mon Aug 23 14:45:40 2010
@@ -0,0 +1,353 @@
+package org.apache.solr.handler.clustering.carrot2;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.CharBuffer;
+import java.util.HashMap;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ar.ArabicNormalizer;
+import org.apache.lucene.analysis.ar.ArabicStemmer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.carrot2.core.LanguageCode;
+import org.carrot2.text.analysis.ExtendedWhitespaceTokenizer;
+import org.carrot2.text.analysis.ITokenizer;
+import org.carrot2.text.linguistic.BaseLanguageModelFactory;
+import org.carrot2.text.linguistic.IStemmer;
+import org.carrot2.text.linguistic.IdentityStemmer;
+import org.carrot2.text.util.MutableCharArray;
+import org.carrot2.util.ExceptionUtils;
+import org.carrot2.util.ReflectionUtils;
+import org.carrot2.util.attribute.Bindable;
+import org.slf4j.Logger;
+import org.tartarus.snowball.SnowballProgram;
+import org.tartarus.snowball.ext.DanishStemmer;
+import org.tartarus.snowball.ext.DutchStemmer;
+import org.tartarus.snowball.ext.EnglishStemmer;
+import org.tartarus.snowball.ext.FinnishStemmer;
+import org.tartarus.snowball.ext.FrenchStemmer;
+import org.tartarus.snowball.ext.GermanStemmer;
+import org.tartarus.snowball.ext.HungarianStemmer;
+import org.tartarus.snowball.ext.ItalianStemmer;
+import org.tartarus.snowball.ext.NorwegianStemmer;
+import org.tartarus.snowball.ext.PortugueseStemmer;
+import org.tartarus.snowball.ext.RomanianStemmer;
+import org.tartarus.snowball.ext.RussianStemmer;
+import org.tartarus.snowball.ext.SpanishStemmer;
+import org.tartarus.snowball.ext.SwedishStemmer;
+import org.tartarus.snowball.ext.TurkishStemmer;
+
+/**
+ * A Solr-specific language model factory for Carrot2. This factory is the only
+ * element in Carrot2 that depends on Lucene APIs, so should the APIs need to
+ * change, the changes can be made in this class.
+ */
+@Bindable(prefix = "DefaultLanguageModelFactory")
+public class LuceneLanguageModelFactory extends BaseLanguageModelFactory {
+ final static Logger logger = org.slf4j.LoggerFactory
+ .getLogger(LuceneLanguageModelFactory.class);
+
+ /**
+ * Provide an {@link IStemmer} implementation for a given language.
+ */
+ protected IStemmer createStemmer(LanguageCode language) {
+ switch (language) {
+ case ARABIC:
+ return ArabicStemmerFactory.createStemmer();
+
+ case CHINESE_SIMPLIFIED:
+ return IdentityStemmer.INSTANCE;
+
+ default:
+ /*
+ * For other languages, try to use snowball's stemming.
+ */
+ return SnowballStemmerFactory.createStemmer(language);
+ }
+ }
+
+ @Override
+ protected ITokenizer createTokenizer(LanguageCode language) {
+ switch (language) {
+ case CHINESE_SIMPLIFIED:
+ return ChineseTokenizerFactory.createTokenizer();
+
+ /*
+ * We use our own analyzer for Arabic. Lucene's version has special
+ * support for Nonspacing-Mark characters (see
+ * http://www.fileformat.info/info/unicode/category/Mn/index.htm), but we
+ * have them included as letters in the parser.
+ */
+ case ARABIC:
+ // Intentional fall-through.
+
+ default:
+ return new ExtendedWhitespaceTokenizer();
+ }
+ }
+
+  /**
+   * Builds {@link IStemmer} instances backed by stemmers from the
+   * <code>snowball</code> project.
+   */
+  private final static class SnowballStemmerFactory {
+    /**
+     * Fixed table from language codes to Snowball stemmer classes. The classes
+     * are referenced statically, not looked up dynamically, because we want to
+     * keep the possibility to obfuscate them.
+     */
+    private static HashMap<LanguageCode, Class<? extends SnowballProgram>> snowballStemmerClasses;
+    static {
+      final HashMap<LanguageCode, Class<? extends SnowballProgram>> classes =
+          new HashMap<LanguageCode, Class<? extends SnowballProgram>>();
+      classes.put(LanguageCode.DANISH, DanishStemmer.class);
+      classes.put(LanguageCode.DUTCH, DutchStemmer.class);
+      classes.put(LanguageCode.ENGLISH, EnglishStemmer.class);
+      classes.put(LanguageCode.FINNISH, FinnishStemmer.class);
+      classes.put(LanguageCode.FRENCH, FrenchStemmer.class);
+      classes.put(LanguageCode.GERMAN, GermanStemmer.class);
+      classes.put(LanguageCode.HUNGARIAN, HungarianStemmer.class);
+      classes.put(LanguageCode.ITALIAN, ItalianStemmer.class);
+      classes.put(LanguageCode.NORWEGIAN, NorwegianStemmer.class);
+      classes.put(LanguageCode.PORTUGUESE, PortugueseStemmer.class);
+      classes.put(LanguageCode.ROMANIAN, RomanianStemmer.class);
+      classes.put(LanguageCode.RUSSIAN, RussianStemmer.class);
+      classes.put(LanguageCode.SPANISH, SpanishStemmer.class);
+      classes.put(LanguageCode.SWEDISH, SwedishStemmer.class);
+      classes.put(LanguageCode.TURKISH, TurkishStemmer.class);
+      snowballStemmerClasses = classes;
+    }
+
+    /**
+     * Wraps a {@link SnowballProgram} so it can be used as an {@link IStemmer}.
+     */
+    private static class SnowballStemmerAdapter implements IStemmer {
+      private final SnowballProgram program;
+
+      public SnowballStemmerAdapter(SnowballProgram snowballStemmer) {
+        this.program = snowballStemmer;
+      }
+
+      /**
+       * Runs the wrapped program on <code>word</code> and returns its current
+       * value when <code>stem()</code> reports success, <code>null</code>
+       * otherwise.
+       */
+      public CharSequence stem(CharSequence word) {
+        program.setCurrent(word.toString());
+        return program.stem() ? program.getCurrent() : null;
+      }
+    }
+
+    /**
+     * Returns an {@link IStemmer} adapter around a new instance of the
+     * {@link SnowballProgram} registered for the given language code. When no
+     * class is registered, or the class cannot be instantiated, a warning is
+     * logged and the identity stemmer is returned instead.
+     */
+    public static IStemmer createStemmer(LanguageCode language) {
+      final Class<? extends SnowballProgram> programClass = snowballStemmerClasses
+          .get(language);
+
+      if (programClass != null) {
+        try {
+          return new SnowballStemmerAdapter(programClass.newInstance());
+        } catch (Exception e) {
+          logger.warn("Could not instantiate snowball stemmer"
+              + " for language: " + language.name()
+              + ". Quality of clustering may be degraded.", e);
+
+          return IdentityStemmer.INSTANCE;
+        }
+      }
+
+      logger.warn("No Snowball stemmer class for: " + language.name()
+          + ". Quality of clustering may be degraded.");
+      return IdentityStemmer.INSTANCE;
+    }
+  }
+
+  /**
+   * Factory of {@link IStemmer} implementations for the
+   * {@link LanguageCode#ARABIC} language. Requires Lucene's
+   * {@link ArabicStemmer} and {@link ArabicNormalizer} to be present in the
+   * classpath, otherwise an empty (identity) stemmer is returned.
+   */
+  private static class ArabicStemmerFactory {
+    static {
+      try {
+        ReflectionUtils.classForName(ArabicStemmer.class.getName(), false);
+        ReflectionUtils.classForName(ArabicNormalizer.class.getName(), false);
+      } catch (Throwable e) {
+        // Throwable rather than ClassNotFoundException: when the Lucene classes
+        // are missing, evaluating the class literals above fails with a
+        // NoClassDefFoundError. Letting that Error escape the initializer would
+        // make every call to createStemmer() fail instead of falling back to
+        // the identity stemmer. ChineseTokenizerFactory handles its probe the
+        // same way.
+        logger.warn(
+            "Could not instantiate Lucene stemmer for Arabic, clustering quality "
+                + "of Arabic content may be degraded. For best quality clusters, "
+                + "make sure Lucene's Arabic analyzer JAR is in the classpath",
+            e);
+      }
+    }
+
+    /**
+     * Adapter exposing Lucene's Arabic normalizer and stemmer as an
+     * {@link IStemmer}.
+     */
+    private static class LuceneStemmerAdapter implements IStemmer {
+      private final org.apache.lucene.analysis.ar.ArabicStemmer delegate;
+      private final org.apache.lucene.analysis.ar.ArabicNormalizer normalizer;
+
+      /** Scratch array reused between calls; grown on demand, never shrunk. */
+      private char[] buffer = new char[0];
+
+      private LuceneStemmerAdapter() throws Exception {
+        delegate = new org.apache.lucene.analysis.ar.ArabicStemmer();
+        normalizer = new org.apache.lucene.analysis.ar.ArabicNormalizer();
+      }
+
+      /**
+       * Normalizes and then stems <code>word</code>.
+       *
+       * @return the stemmed form, or <code>null</code> when normalization and
+       *         stemming left the word unchanged. The returned sequence wraps
+       *         this adapter's internal buffer without copying, so its content
+       *         is overwritten by the next call to this method.
+       */
+      public CharSequence stem(CharSequence word) {
+        final int length = word.length();
+        if (length > buffer.length) {
+          buffer = new char[length];
+        }
+
+        // The Lucene classes operate in place on a char[], so copy the word in.
+        for (int i = 0; i < length; i++) {
+          buffer[i] = word.charAt(i);
+        }
+
+        int newLen = normalizer.normalize(buffer, length);
+        newLen = delegate.stem(buffer, newLen);
+
+        if (newLen != length || !equals(buffer, newLen, word)) {
+          return CharBuffer.wrap(buffer, 0, newLen);
+        }
+
+        // Same-same.
+        return null;
+      }
+
+      /**
+       * Compares the first <code>len</code> characters of <code>buffer</code>
+       * with <code>word</code>; callers must have checked that the lengths
+       * match.
+       */
+      private boolean equals(char[] buffer, int len, CharSequence word) {
+        assert len == word.length();
+
+        for (int i = 0; i < len; i++) {
+          if (buffer[i] != word.charAt(i))
+            return false;
+        }
+
+        return true;
+      }
+    }
+
+    /**
+     * Returns a new Lucene-backed Arabic stemmer, or the identity stemmer when
+     * the adapter cannot be created for any reason.
+     */
+    public static IStemmer createStemmer() {
+      try {
+        return new LuceneStemmerAdapter();
+      } catch (Throwable e) {
+        // Not logged here: the static initializer above already warns when the
+        // Lucene classes cannot be loaded.
+        return IdentityStemmer.INSTANCE;
+      }
+    }
+  }
+
+ /**
+ * Creates tokenizers that adapt Lucene's Smart Chinese Tokenizer to Carrot2's
+ * {@link ITokenizer}. If Smart Chinese is not available in the classpath, the
+ * factory will fall back to the default white space tokenizer.
+ */
+ private static final class ChineseTokenizerFactory {
+ static {
+ try {
+ ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+ ReflectionUtils.classForName(
+ "org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+ } catch (Throwable e) {
+ logger
+ .warn("Could not instantiate Smart Chinese Analyzer, clustering quality "
+ + "of Chinese content may be degraded. For best quality clusters, "
+ + "make sure Lucene's Smart Chinese Analyzer JAR is in the classpath");
+ }
+ }
+
+ static ITokenizer createTokenizer() {
+ try {
+ return new ChineseTokenizer();
+ } catch (Throwable e) {
+ return new ExtendedWhitespaceTokenizer();
+ }
+ }
+
+ private final static class ChineseTokenizer implements ITokenizer {
+ private final static Pattern numeric = Pattern
+ .compile("[\\-+'$]?\\d+([:\\-/,.]?\\d+)*[%$]?");
+
+ private Tokenizer sentenceTokenizer;
+ private TokenStream wordTokenFilter;
+ private CharTermAttribute term = null;
+
+ private final MutableCharArray tempCharSequence;
+ private final Class<?> tokenFilterClass;
+
+ private ChineseTokenizer() throws Exception {
+ this.tempCharSequence = new MutableCharArray(new char[0]);
+
+ // As Smart Chinese is not available during compile time,
+ // we need to resort to reflection.
+ final Class<?> tokenizerClass = ReflectionUtils
+ .classForName("org.apache.lucene.analysis.cn.smart.SentenceTokenizer", false);
+ this.sentenceTokenizer = (Tokenizer) tokenizerClass.getConstructor(
+ Reader.class).newInstance((Reader) null);
+ this.tokenFilterClass = ReflectionUtils
+ .classForName("org.apache.lucene.analysis.cn.smart.WordTokenFilter", false);
+ }
+
+ public short nextToken() throws IOException {
+ final boolean hasNextToken = wordTokenFilter.incrementToken();
+ if (hasNextToken) {
+ short flags = 0;
+ final char[] image = term.buffer();
+ final int length = term.length();
+ tempCharSequence.reset(image, 0, length);
+ if (length == 1 && image[0] == ',') {
+ // ChineseTokenizer seems to convert all punctuation to ','
+ // characters
+ flags = ITokenizer.TT_PUNCTUATION;
+ } else if (numeric.matcher(tempCharSequence).matches()) {
+ flags = ITokenizer.TT_NUMERIC;
+ } else {
+ flags = ITokenizer.TT_TERM;
+ }
+ return flags;
+ }
+
+ return ITokenizer.TT_EOF;
+ }
+
+ public void setTermBuffer(MutableCharArray array) {
+ array.reset(term.buffer(), 0, term.length());
+ }
+
+ public void reset(Reader input) throws IOException {
+ try {
+ sentenceTokenizer.reset(input);
+ wordTokenFilter = (TokenStream) tokenFilterClass.getConstructor(
+ TokenStream.class).newInstance(sentenceTokenizer);
+ } catch (Exception e) {
+ throw ExceptionUtils.wrapAsRuntimeException(e);
+ }
+ }
+ }
+ }
+}
Propchange: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/main/java/org/apache/solr/handler/clustering/carrot2/LuceneLanguageModelFactory.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/clustering/src/test/java/org/apache/solr/handler/clustering/carrot2/CarrotClusteringEngineTest.java Mon Aug 23 14:45:40 2010
@@ -46,7 +46,10 @@ import static org.junit.Assert.*;
public class CarrotClusteringEngineTest extends AbstractClusteringTest {
@Test
public void testCarrotLingo() throws Exception {
- checkEngine(getClusteringEngine("default"), 10);
+ // Note: the expected number of clusters may change after upgrading Carrot2
+ // due to e.g. internal improvements or tuning of Carrot2 clustering.
+ final int expectedNumClusters = 10;
+ checkEngine(getClusteringEngine("default"), expectedNumClusters);
}
@Test
@@ -54,7 +57,11 @@ public class CarrotClusteringEngineTest
ModifiableSolrParams solrParams = new ModifiableSolrParams();
solrParams.add(CarrotParams.SNIPPET_FIELD_NAME, "snippet");
solrParams.add(CarrotParams.SUMMARY_FRAGSIZE, "200");//how do we validate this?
- checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, 15, new TermQuery(new Term("snippet", "mine")), solrParams);
+
+ // Note: the expected number of clusters may change after upgrading Carrot2
+ // due to e.g. internal improvements or tuning of Carrot2 clustering.
+ final int expectedNumClusters = 15;
+ checkEngine(getClusteringEngine("default"), numberOfDocs -2 /*two don't have mining in the snippet*/, expectedNumClusters, new TermQuery(new Term("snippet", "mine")), solrParams);
}
@Test
Modified: lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml?rev=988137&r1=988136&r2=988137&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml (original)
+++ lucene/dev/branches/branch_3x/solr/example/solr/conf/solrconfig.xml Mon Aug 23 14:45:40 2010
@@ -64,7 +64,6 @@
<!-- If a dir option (with or without a regex) is used and nothing is found
that matches, it will be ignored
-->
- <lib dir="../../contrib/clustering/lib/downloads/" />
<lib dir="../../contrib/clustering/lib/" />
<lib dir="/total/crap/dir/ignored" />
<!-- an exact path can be used to specify a specific file. This will cause
@@ -799,6 +798,12 @@
parameter name and attribute value as parameter value.
-->
<str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
+
+ <!--
+ The language to assume for the documents. For a list of allowed values, see:
+ http://download.carrot2.org/stable/manual/#section.attribute.lingo.MultilingualClustering.defaultLanguage
+ -->
+ <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
</lst>
<lst name="engine">
<str name="name">stc</str>
Added: lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar?rev=988137&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/dev/branches/branch_3x/solr/lib/guava-r05.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream